Changeset 8e4aa05

.gitignore

r342af53	r8e4aa05
79	79	# generated by npm
80	80	package-lock.json
	81
	82	# generated by benchmark
	83	benchmark/Cargo.toml

Jenkins/FullBuild

-              r342af53
+              r8e4aa05
                                         gcc_7_x86_old: { trigger_build( 'gcc-7',   'x86', false ) },
                                         gcc_6_x86_old: { trigger_build( 'gcc-6',   'x86', false ) },
+                                        gcc_9_x64_old: { trigger_build( 'gcc-9',   'x64', false ) },
+                                        gcc_8_x64_old: { trigger_build( 'gcc-8',   'x64', false ) },
+                                        gcc_7_x64_old: { trigger_build( 'gcc-7',   'x64', false ) },
+                                        gcc_6_x64_old: { trigger_build( 'gcc-6',   'x64', false ) },
+                                        gcc_5_x64_old: { trigger_build( 'gcc-5',   'x64', false ) },
+                                        gcc_9_x64_new: { trigger_build( 'gcc-9',   'x64', true  ) },
+                                        gcc_8_x64_new: { trigger_build( 'gcc-8',   'x64', true  ) },
+                                        gcc_7_x64_new: { trigger_build( 'gcc-7',   'x64', true  ) },
+                                        gcc_6_x64_new: { trigger_build( 'gcc-6',   'x64', true  ) },
+                                        gcc_5_x64_new: { trigger_build( 'gcc-5',   'x64', true  ) },
+                                        clang_x64_new: { trigger_build( 'clang',   'x64', true  ) },
                                         clang_x64_old: { trigger_build( 'clang',   'x64', false ) },
-                                        clang_x64_new: { trigger_build( 'clang',   'x64', true  ) },
+                                )
+                        }
 …
 def trigger_build(String cc, String arch, boolean new_ast) {
+        // Delay each build by a random amount to avoid hitting the SC server too hard
+        sleep(time: 5 * Math.random(), unit:"MINUTES")
+        // Run the build
+        // Don't propagate, it doesn't play nice with our email setup
         def result = build job: 'Cforall/master',               \
                 parameters: [                                           \

benchmark/Makefile.am

r342af53	r8e4aa05
502	502
503	503	compile-io$(EXEEXT):
504		$(CFACOMPILE) -DNO_COMPILED_PRAGMA -fsyntax-only -w $(testdir)/io1.cfa
	504	$(CFACOMPILE) -DNO_COMPILED_PRAGMA -fsyntax-only -w $(testdir)/io/io.cfa
505	505
506	506	compile-monitor$(EXEEXT):

benchmark/io/http/filecache.cfa

-              r342af53
+              r8e4aa05
 #include <string.h>
+#include <fstream.hfa>
 #include <stdlib.hfa>
 …
                 conflicts += put_file( raw[i], fd );
+        }
         printf("Filled cache from path \"%s\" with %zu files\n", path, fcount);
+        sout | "Filled cache from path \"" | path | "\" with" | fcount | "files";
         if( conflicts > 0 ) {
                 printf("Found %d conflicts (seed: %u)\n", conflicts, options.file_cache.hash_seed);
+                sout | "Found" | conflicts | "conflicts (seed: " | options.file_cache.hash_seed | ")";
                 #if defined(REJECT_CONFLICTS)
                         abort("Conflicts found in the cache");
 …
         if(options.file_cache.list) {
                 printf("Listing files and exiting\n");
+                sout | "Listing files and exiting";
                 for(i; fcount) {
                         int s; char u;
                         [s, u] = human_size(raw[i].size);
                         printf("%4d%c - %s\n", s, u, raw[i].file);
+                        sout | s | u | "-" | raw[i].file;
                         free(raw[i].file);
+                }
 …
 [int *, int] filefds(int extra) {
+        if(!options.file_cache.path) {
+                int * data = alloc(extra);
+                return [data, 0];
+        }
         if(!file_cache.entries) {
                 abort("File cache not filled!\n");

benchmark/io/http/main.cfa

-              r342af53
+              r8e4aa05
 #include <unistd.h>
 extern "C" {
+        #include <signal.h>
         #include <sys/socket.h>
         #include <netinet/in.h>
+}
+#include <fstream.hfa>
 #include <kernel.hfa>
+#include <iofwd.hfa>
 #include <stats.hfa>
 #include <time.hfa>
 …
 //=============================================================================================
+// Stats Printer
+//=============================================================================================
+thread StatsPrinter {}; // Background thread that periodically prints cluster statistics until it is destroyed
+void ?{}( StatsPrinter & this, cluster & cl ) { // cl: cluster handed to the underlying thread constructor
+        ((thread&)this){ "Stats Printer Thread", cl };
+}
+void ^?{}( StatsPrinter & mutex this ) {} // mutex destructor: main() accepts it through waitfor to learn it must stop
+void main(StatsPrinter & this) {
+        LOOP: for() {
+                waitfor( ^?{} : this) { // destruction was requested: leave the reporting loop
+                        break LOOP;
+                }
+                or else {} // no destructor call pending: continue without blocking
+                sleep(10`s); // reporting period; shutdown is only noticed after this sleep returns
+                print_stats_now( *active_cluster(), CFA_STATS_READY_Q | CFA_STATS_IO ); // ready-queue and I/O stats; active_cluster() is presumably the cluster this thread runs on
+        }
+}
+//=============================================================================================
 // Globals
 //=============================================================================================
+struct ServerProc {
+        processor self;
+struct ServerCluster {
+        cluster self;
+        processor    * procs;
+        // io_context   * ctxs;
+        StatsPrinter * prnt;
 };
+void ?{}( ServerProc & this ) {
+        /* paranoid */ assert( options.clopts.instance != 0p );
+        (this.self){ "Benchmark Processor", *options.clopts.instance };
+// Construct one server cluster: the cluster itself, options.clopts.nprocs processors
+// running on it, and (when options.stats is set) a StatsPrinter thread. The cluster is
+// then appended to options.clopts.instance and options.clopts.cltr_cnt is incremented.
+void ?{}( ServerCluster & this ) {
+        (this.self){ "Server Cluster", options.clopts.params };
+        this.procs = alloc(options.clopts.nprocs);
+        for(i; options.clopts.nprocs) {
+                (this.procs[i]){ "Benchmark Processor", this.self };
+                #if !defined(__CFA_NO_STATISTICS__)
+                        // Configure the processor that was just constructed. Using
+                        // `*this.procs` here would only ever touch procs[0].
+                        if( options.clopts.procstats ) {
+                                print_stats_at_exit( this.procs[i], this.self.print_stats );
+                        }
+                        if( options.clopts.viewhalts ) {
+                                print_halts( this.procs[i] );
+                        }
+                #endif
+        }
+        // The stats printer is optional; prnt stays 0p when statistics are disabled.
+        if(options.stats) {
+                this.prnt = alloc();
+                (*this.prnt){ this.self };
+        } else {
+                this.prnt = 0p;
+        }
+        // NOTE(review): the procstats/viewhalts lines inside the #if below dereference
+        // options.clopts.instance as a single cluster and look like they were interleaved
+        // from the removed ServerProc constructor by the changeset view - confirm against
+        // the real file before relying on them.
         #if !defined(__CFA_NO_STATISTICS__)
+                if( options.clopts.procstats ) {
+                        print_stats_at_exit( this.self, options.clopts.instance->print_stats );
+                }
+                if( options.clopts.viewhalts ) {
+                        print_halts( this.self );
+                }
+                print_stats_at_exit( this.self, CFA_STATS_READY_Q | CFA_STATS_IO );
         #endif
+        // Register this cluster so worker/date threads can be placed on it.
+        options.clopts.instance[options.clopts.cltr_cnt] = &this.self;
+        options.clopts.cltr_cnt++;
+}
+void ^?{}( ServerCluster & this ) { // Tear down a ServerCluster: printer thread, processors, processor array, then the cluster
+        delete(this.prnt); // prnt is 0p when options.stats was false at construction; presumably delete tolerates 0p - confirm
+        for(i; options.clopts.nprocs) {
+                ^(this.procs[i]){}; // explicit destructor call: procs came from alloc(), so nothing destroys them automatically
+        }
+        free(this.procs);
+        ^(this.self){}; // the cluster is destroyed last, after the processors that were constructed on it
+}
 …
 //=============================================================================================
 int main( int argc, char * argv[] ) {
+        __sighandler_t s = 1p;
+        signal(SIGPIPE, s);
         //===================
         // Parse args
         const char * path = parse_options(argc, argv);
+        parse_options(argc, argv);
         //===================
         // Open Files
+        printf("Filling cache from %s\n", path);
+        fill_cache( path );
+        if( options.file_cache.path ) {
+                sout | "Filling cache from" | options.file_cache.path;
+                fill_cache( options.file_cache.path );
+        }
         //===================
         // Open Socket
         printf("%ld : Listening on port %d\n", getpid(), options.socket.port);
+        sout | getpid() | ": Listening on port" | options.socket.port;
         int server_fd = socket(AF_INET, SOCK_STREAM, 0);
         if(server_fd < 0) {
 …
                         if(errno == EADDRINUSE) {
                                 if(waited == 0) {
                                         printf("Waiting for port\n");
+                                        sout | "Waiting for port";
                                 } else {
                                         printf("\r%d", waited);
                                         fflush(stdout);
+                                        sout | "\r" | waited | nonl;
+                                        flush( sout );
+                                }
                                 waited ++;
 …
         // Run Server Cluster
+        {
-                cluster cl = { "Server Cluster", options.clopts.params };
-                #if !defined(__CFA_NO_STATISTICS__)
-                        print_stats_at_exit( cl, CFA_STATS_READY_Q | CFA_STATS_IO );
-                #endif
-                options.clopts.instance = &cl;
                 int pipe_cnt = options.clopts.nworkers * 2;
                 int pipe_off;
 …
+                }
                 if(options.file_cache.fixed_fds) {
                         register_fixed_files(cl, fds, pipe_off);
+                }
+                // if(options.file_cache.path && options.file_cache.fixed_fds) {
+                //      register_fixed_files(cl, fds, pipe_off);
+                // }
+                {
                         ServerProc procs[options.clopts.nprocs];
+                        ServerCluster cl[options.clopts.nclusters];
                         init_protocol();
 …
                                         unpark( workers[i] );
+                                }
+                                printf("%d workers started on %d processors\n", options.clopts.nworkers, options.clopts.nprocs);
+                                sout | options.clopts.nworkers | "workers started on" | options.clopts.nprocs | "processors /" | options.clopts.nclusters | "clusters";
+                                for(i; options.clopts.nclusters) {
+                                        sout | options.clopts.thrd_cnt[i] | nonl;
+                                }
+                                sout | nl;
+                                {
                                         char buffer[128];
+                                        while(!feof(stdin)) {
+                                                fgets(buffer, 128, stdin);
+                                        for() {
+                                                int ret = cfa_read(0, buffer, 128, 0);
+                                                if(ret == 0) break;
+                                                if(ret < 0) abort( "main read error: (%d) %s\n", (int)errno, strerror(errno) );
+                                                sout | "User wrote '" | "" | nonl;
+                                                write(sout, buffer, ret - 1);
+                                                sout | "'";
+                                        }
+                                        printf("Shutting Down\n");
+                                }
+                                        sout | "Shutdown received";
+                                }
+                                sout | "Notifying connections..." | nonl; flush( sout );
                                 for(i; options.clopts.nworkers) {
-                                        printf("Cancelling %p\n", (void*)workers[i].cancel.target);
                                         workers[i].done = true;
                                         cancel(workers[i].cancel);
+                                }
                                 printf("Shutting down socket\n");
+                                }
+                                sout | "done";
+                                sout | "Shutting down socket..." | nonl; flush( sout );
                                 int ret = shutdown( server_fd, SHUT_RD );
+                                if( ret < 0 ) { abort( "shutdown error: (%d) %s\n", (int)errno, strerror(errno) ); }
+                                if( ret < 0 ) {
+                                        abort( "shutdown error: (%d) %s\n", (int)errno, strerror(errno) );
+                                }
+                                sout | "done";
                                 //===================
                                 // Close Socket
                                 printf("Closing Socket\n");
+                                sout | "Closing Socket..." | nonl; flush( sout );
                                 ret = close( server_fd );
                                 if(ret < 0) {
                                         abort( "close socket error: (%d) %s\n", (int)errno, strerror(errno) );
+                                }
+                        }
+                        printf("Workers Closed\n");
+                                sout | "done";
+                                sout | "Stopping connection threads..." | nonl; flush( sout );
+                        }
+                        sout | "done";
+                        sout | "Stopping protocol threads..." | nonl; flush( sout );
                         deinit_protocol();
+                }
+                        sout | "done";
+                        sout | "Stopping processors/clusters..." | nonl; flush( sout );
+                }
+                sout | "done";
+                sout | "Closing splice fds..." | nonl; flush( sout );
                 for(i; pipe_cnt) {
                         ret = close( fds[pipe_off + i] );
 …
+                }
                 free(fds);
+        }
+                sout | "done";
+                sout | "Stopping processors..." | nonl; flush( sout );
+        }
+        sout | "done";
         //===================
         // Close Files
+        printf("Closing Files\n");
+        close_cache();
+}
+        if( options.file_cache.path ) {
+                sout | "Closing open files..." | nonl; flush( sout );
+                close_cache();
+                sout | "done";
+        }
+}

benchmark/io/http/options.cfa

-              r342af53
+              r8e4aa05
+}
+#include <bitmanip.hfa>
+#include <fstream.hfa>
 #include <kernel.hfa>
 #include <parseargs.hfa>
+#include <stdlib.hfa>
+#include <stdlib.h>
 #include <string.h>
 Options options @= {
         false, // log
+        false, // stats
         { // file_cache
+,     // path
 ,     // open_flags;
 u,   // hash_seed;
 …
         { // cluster
+,     // nclusters;
 ,     // nprocs;
 ,     // nworkers;
 ,     // flags;
+                {},     // params;
                 false, // procstats
                 false, // viewhalts
 …
 };
 const char * parse_options( int argc, char * argv[] ) {
         bool subthrd = false;
         bool eagrsub = false;
         bool fixedfd = false;
         bool sqkpoll = false;
         bool iokpoll = false;
+        unsigned sublen = 16;
+void parse_options( int argc, char * argv[] ) {
+        // bool fixedfd = false;
+        // bool sqkpoll = false;
+        // bool iokpoll = false;
+        unsigned nentries = 16;
+        bool isolate = false;
         static cfa_option opt[] = {
                 {'p', "port",           "Port the server will listen on", options.socket.port},
                 {'c', "cpus",           "Number of processors to use", options.clopts.nprocs},
                 {'L', "log",            "Enable logs", options.log, parse_settrue},
                 {'t', "threads",        "Number of worker threads to use", options.clopts.nworkers},
                 {'b', "accept-backlog", "Maximum number of pending accepts", options.socket.backlog},
                 {'r', "request_len",    "Maximum number of bytes in the http request, requests with more data will be answered with Http Code 414", options.socket.buflen},
                 {'S', "seed",           "seed to use for hashing", options.file_cache.hash_seed },
                 {'C', "cache-size",     "Size of the cache to use, if set to small, will uses closes power of 2", options.file_cache.size },
                 {'l', "list-files",     "List the files in the specified path and exit", options.file_cache.list, parse_settrue },
                 {'s', "submitthread",   "If set, cluster uses polling thread to submit I/O", subthrd, parse_settrue },
                 {'e', "eagersubmit",    "If set, cluster submits I/O eagerly but still aggregates submits", eagrsub, parse_settrue},
                 {'f', "fixed-fds",      "If set, files are open eagerly and pre-registered with the cluster", fixedfd, parse_settrue},
                 {'k', "kpollsubmit",    "If set, cluster uses IORING_SETUP_SQPOLL, implies -f", sqkpoll, parse_settrue },
                 {'i', "kpollcomplete",  "If set, cluster uses IORING_SETUP_IOPOLL", iokpoll, parse_settrue },
                 {'L', "submitlength",   "Max number of submitions that can be submitted together", sublen },
+                { 'p', "port",           "Port the server will listen on", options.socket.port},
+                { 'c', "cpus",           "Number of processors to use", options.clopts.nprocs},
+                { 't', "threads",        "Number of worker threads to use", options.clopts.nworkers},
+                {'\0', "isolate",        "Create one cluster per processor", isolate, parse_settrue},
+                {'\0', "log",            "Enable logs", options.log, parse_settrue},
+                {'\0', "stats",          "Enable statistics", options.stats, parse_settrue},
+                {'\0', "accept-backlog", "Maximum number of pending accepts", options.socket.backlog},
+                {'\0', "request_len",    "Maximum number of bytes in the http request, requests with more data will be answered with Http Code 414", options.socket.buflen},
+                {'\0', "seed",           "seed to use for hashing", options.file_cache.hash_seed },
+                {'\0', "cache-size",     "Size of the cache to use, if set to small, will uses closes power of 2", options.file_cache.size },
+                {'\0', "list-files",     "List the files in the specified path and exit", options.file_cache.list, parse_settrue },
+                // { 'f', "fixed-fds",      "If set, files are open eagerly and pre-registered with the cluster", fixedfd, parse_settrue},
+                // { 'k', "kpollsubmit",    "If set, cluster uses IORING_SETUP_SQPOLL, implies -f", sqkpoll, parse_settrue },
+                // { 'i', "kpollcomplete",  "If set, cluster uses IORING_SETUP_IOPOLL", iokpoll, parse_settrue },
+                {'e', "numentries",     "Number of I/O entries", nentries },
         };
 …
         parse_args( argc, argv, opt, opt_cnt, "[OPTIONS]... [PATH]\ncforall http server", left );
+        options.clopts.params.poller_submits = subthrd;
+        options.clopts.params.eager_submits  = eagrsub;
+        if( fixedfd ) {
+                options.file_cache.fixed_fds = true;
+        if( !is_pow2(nentries) ) {
+                unsigned v = nentries;
+                v--;
+                v |= v >> 1;
+                v |= v >> 2;
+                v |= v >> 4;
+                v |= v >> 8;
+                v |= v >> 16;
+                v++;
+                serr | "Warning: num_entries not a power of 2" | '(' | nentries | ')' | "raising to " | v;
+                nentries = v;
+        }
+        if(isolate) {
+                options.clopts.nclusters = options.clopts.nprocs;
+                options.clopts.nprocs = 1;
+        }
+        options.clopts.params.num_entries = nentries;
+        options.clopts.instance = alloc(options.clopts.nclusters);
+        options.clopts.thrd_cnt = alloc(options.clopts.nclusters);
+        options.clopts.cltr_cnt = 0;
+        for(i; options.clopts.nclusters) {
+                options.clopts.thrd_cnt[i] = 0;
+        }
-        if( sqkpoll ) {
-                options.clopts.params.poll_submit = true;
-                options.file_cache.fixed_fds = true;
+        }
+        if( iokpoll ) {
+                options.clopts.params.poll_complete = true;
+                options.file_cache.open_flags |= O_DIRECT;
+        }
+        // if( fixedfd ) {
+        //      options.file_cache.fixed_fds = true;
+        // }
+        options.clopts.params.num_ready = sublen;
+        // if( sqkpoll ) {
+        //      options.file_cache.fixed_fds = true;
+        // }
+        if( left[0] == 0p ) { return "."; }
+        // if( iokpoll ) {
+        //      options.file_cache.open_flags |= O_DIRECT;
+        // }
+        if( left[0] == 0p ) { return; }
         const char * path = left[0];
 …
         if( left[0] != 0p ) {
+                abort("Too many trailing arguments!\n");
+                serr | "Too many trailing arguments!" | '\'' | path | '\'';
+                while(left[0] != 0p) {
+                        serr | " - " | left[0];
+                        left++;
+                }
+                exit(EXIT_FAILURE);
+        }
         return path;
+        options.file_cache.path = path;
+}

benchmark/io/http/options.hfa

-              r342af53
+              r8e4aa05
 struct Options {
         bool log;
+        bool stats;
         struct {
+                const char * path;
                 int open_flags;
                 uint32_t hash_seed;
 …
         struct {
+                int nclusters;
                 int nprocs;
                 int nworkers;
 …
                 bool procstats;
                 bool viewhalts;
+                cluster * instance;
+                cluster ** instance;
+                size_t   * thrd_cnt;
+                size_t     cltr_cnt;
         } clopts;
 };
 …
 extern Options options;
 const char * parse_options( int argc, char * argv[] );
+void parse_options( int argc, char * argv[] );

benchmark/io/http/protocol.cfa

-              r342af53
+              r8e4aa05
         #include <fcntl.h>
+}
+#include <fstream.hfa>
 #include <iofwd.hfa>
 …
 extern "C" {
       int snprintf ( char * s, size_t n, const char * format, ... );
         #include <linux/io_uring.h>
+        // #include <linux/io_uring.h>
+}
 #include <string.h>
 …
 #include "options.hfa"
+const char * volatile date = 0p;
+const char * http_msgs[] = {
+        "HTTP/1.1 200 OK\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: %zu \n\n",
         "HTTP/1.1 400 Bad Request\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: 0 \n\n",
         "HTTP/1.1 404 Not Found\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: 0 \n\n",
+        "HTTP/1.1 413 Payload Too Large\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: 0 \n\n",
+        "HTTP/1.1 414 URI Too Long\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: 0 \n\n",
 };
+#define PLAINTEXT_1WRITE
+#define PLAINTEXT_NOCOPY
+struct https_msg_str {
+        char msg[512];
+        size_t len;
+};
+const https_msg_str * volatile http_msgs[KNOWN_CODES] = { 0 };
 _Static_assert( KNOWN_CODES == (sizeof(http_msgs ) / sizeof(http_msgs [0])));
+const int http_codes[] = {
+const int http_codes[KNOWN_CODES] = {
+,
 ,
 ,
 ,
+,
+,
 ,
 ,
 …
         while(len > 0) {
                 // Call write
+                int ret = cfa_write(fd, it, len, 0, -1`s, 0p, 0p);
+                // int ret = write(fd, it, len);
+                if( ret < 0 ) { if( errno != EAGAIN && errno != EWOULDBLOCK) abort( "'answer error' error: (%d) %s\n", (int)errno, strerror(errno) ); }
+                int ret = cfa_send(fd, it, len, 0, CFA_IO_LAZY);
+                if( ret < 0 ) {
+                        if( errno == ECONNRESET || errno == EPIPE ) return -ECONNRESET;
+                        if( errno == EAGAIN || errno == EWOULDBLOCK) return -EAGAIN;
+                        abort( "'answer error' error: (%d) %s\n", (int)errno, strerror(errno) );
+                }
                 // update it/len
 …
         /* paranoid */ assert( code < KNOWN_CODES && code != OK200 );
         int idx = (int)code;
         return answer( fd, http_msgs[idx], strlen( http_msgs[idx] ) );
+        return answer( fd, http_msgs[idx]->msg, http_msgs[idx]->len );
+}
 int answer_header( int fd, size_t size ) {
+        const char * fmt = http_msgs[OK200];
+        int len = 200;
+        char buffer[len];
+        len = snprintf(buffer, len, fmt, date, size);
+        char buffer[512];
+        char * it = buffer;
+        memcpy(it, http_msgs[OK200]->msg, http_msgs[OK200]->len);
+        it += http_msgs[OK200]->len;
+        int len = http_msgs[OK200]->len;
+        len += snprintf(it, 512 - len, "%d \n\n", size);
         return answer( fd, buffer, len );
+}
+int answer_plain( int fd, char buffer[], size_t size ) {
+        int ret = answer_header(fd, size);
+#if defined(PLAINTEXT_NOCOPY)
+int answer_plaintext( int fd ) {
+        return answer(fd, http_msgs[OK200_PlainText]->msg, http_msgs[OK200_PlainText]->len + 1); // +1 cause snprintf doesn't count nullterminator
+}
+#elif defined(PLAINTEXT_1WRITE)
+// PLAINTEXT_1WRITE variant: build the "200 OK" header and the fixed plaintext body in
+// one buffer and send them with a single answer() call. Returns answer()'s result.
+int answer_plaintext( int fd ) {
+        char text[] = "Hello, World!\n";
+        char buffer[512 + sizeof(text)];
+        char * it = buffer;
+        // Copy the pre-formatted OK200 header prefix, then append the content length.
+        memcpy(it, http_msgs[OK200]->msg, http_msgs[OK200]->len);
+        it += http_msgs[OK200]->len;
+        int len = http_msgs[OK200]->len;
+        // sizeof yields a size_t, so it must be formatted with %zu (not %d).
+        int r = snprintf(it, 512 - len, "%zu \n\n", sizeof(text));
+        it += r;
+        len += r;
+        // sizeof(text) counts the terminating NUL, which is copied and sent as well.
+        memcpy(it, text, sizeof(text));
+        return answer(fd, buffer, len + sizeof(text));
+}
+#else
+int answer_plaintext( int fd ) {
+        char text[] = "Hello, World!\n";
+        int ret = answer_header(fd, sizeof(text));
         if( ret < 0 ) return ret;
+        return answer(fd, buffer, size);
+}
+        return answer(fd, text, sizeof(text));
+}
+#endif
 int answer_empty( int fd ) {
 …
 [HttpCode code, bool closed, * const char file, size_t len] http_read(int fd, []char buffer, size_t len, io_cancellation * cancel) {
+[HttpCode code, bool closed, * const char file, size_t len] http_read(int fd, []char buffer, size_t len) {
         char * it = buffer;
         size_t count = len - 1;
 …
         READ:
         for() {
                 int ret = cfa_read(fd, (void*)it, count, 0, -1`s, cancel, 0p);
+                int ret = cfa_recv(fd, (void*)it, count, 0, CFA_IO_LAZY);
                 // int ret = read(fd, (void*)it, count);
                 if(ret == 0 ) return [OK200, true, 0, 0];
                 if(ret < 0 ) {
                         if( errno == EAGAIN || errno == EWOULDBLOCK) continue READ;
+                        // if( errno == EINVAL ) return [E400, true, 0, 0];
+                        if( errno == ECONNRESET ) return [E408, true, 0, 0];
+                        if( errno == EPIPE ) return [E408, true, 0, 0];
                         abort( "read error: (%d) %s\n", (int)errno, strerror(errno) );
+                }
 …
+        }
+        if( options.log ) printf("%.*s\n", rlen, buffer);
+        if( options.log ) {
+                write(sout, buffer, rlen);
+                sout | nl;
+        }
         it = buffer;
 …
+}
 void sendfile( int pipe[2], int fd, int ans_fd, size_t count ) {
+int sendfile( int pipe[2], int fd, int ans_fd, size_t count ) {
         unsigned sflags = SPLICE_F_MOVE; // | SPLICE_F_MORE;
         off_t offset = 0;
         ssize_t ret;
         SPLICE1: while(count > 0) {
+                ret = cfa_splice(ans_fd, &offset, pipe[1], 0p, count, sflags, 0, -1`s, 0p, 0p);
+                // ret = splice(ans_fd, &offset, pipe[1], 0p, count, sflags);
+                ret = cfa_splice(ans_fd, &offset, pipe[1], 0p, count, sflags, CFA_IO_LAZY);
                 if( ret < 0 ) {
                         if( errno != EAGAIN && errno != EWOULDBLOCK) continue SPLICE1;
+                        if( errno == ECONNRESET ) return -ECONNRESET;
+                        if( errno == EPIPE ) return -EPIPE;
                         abort( "splice [0] error: (%d) %s\n", (int)errno, strerror(errno) );
+                }
 …
                 size_t in_pipe = ret;
                 SPLICE2: while(in_pipe > 0) {
+                        ret = cfa_splice(pipe[0], 0p, fd, 0p, in_pipe, sflags, 0, -1`s, 0p, 0p);
+                        // ret = splice(pipe[0], 0p, fd, 0p, in_pipe, sflags);
+                        ret = cfa_splice(pipe[0], 0p, fd, 0p, in_pipe, sflags, CFA_IO_LAZY);
                         if( ret < 0 ) {
                                 if( errno != EAGAIN && errno != EWOULDBLOCK) continue SPLICE2;
+                                if( errno == ECONNRESET ) return -ECONNRESET;
+                                if( errno == EPIPE ) return -EPIPE;
                                 abort( "splice [1] error: (%d) %s\n", (int)errno, strerror(errno) );
+                        }
 …
+        }
+        return count;
+}
 …
 #include <thread.hfa>
+const char * original_http_msgs[] = {
+        "HTTP/1.1 200 OK\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: ",
+        "HTTP/1.1 200 OK\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: 15\n\nHello, World!\n",
+        "HTTP/1.1 400 Bad Request\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: 0 \n\n",
+        "HTTP/1.1 404 Not Found\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: 0 \n\n",
+        "HTTP/1.1 405 Method Not Allowed\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: 0 \n\n",
+        "HTTP/1.1 408 Request Timeout\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: 0 \n\n",
+        "HTTP/1.1 413 Payload Too Large\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: 0 \n\n",
+        "HTTP/1.1 414 URI Too Long\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: 0 \n\n",
+};
 struct date_buffer {
         char buff[100];
+        https_msg_str strs[KNOWN_CODES];
 };
 …
 void ?{}( DateFormater & this ) {
         ((thread&)this){ "Server Date Thread", *options.clopts.instance };
+        ((thread&)this){ "Server Date Thread", *options.clopts.instance[0] };
         this.idx = 0;
         memset( this.buffers[0].buff, 0, sizeof(this.buffers[0]) );
         memset( this.buffers[1].buff, 0, sizeof(this.buffers[1]) );
+        memset( &this.buffers[0], 0, sizeof(this.buffers[0]) );
+        memset( &this.buffers[1], 0, sizeof(this.buffers[1]) );
+}
 …
                 or else {}
+                char buff[100];
                 Time now = getTimeNsec();
+                strftime( this.buffers[this.idx].buff, 100, "%a, %d %b %Y %H:%M:%S %Z", now );
+                char * next = this.buffers[this.idx].buff;
+                __atomic_exchange_n((char * volatile *)&date, next, __ATOMIC_SEQ_CST);
+                strftime( buff, 100, "%a, %d %b %Y %H:%M:%S %Z", now );
+                sout | "Updated date to '" | buff | "'";
+                for(i; KNOWN_CODES) {
+                        size_t len = snprintf( this.buffers[this.idx].strs[i].msg, 512, original_http_msgs[i], buff );
+                        this.buffers[this.idx].strs[i].len = len;
+                }
+                for(i; KNOWN_CODES) {
+                        https_msg_str * next = &this.buffers[this.idx].strs[i];
+                        __atomic_exchange_n((https_msg_str * volatile *)&http_msgs[i], next, __ATOMIC_SEQ_CST);
+                }
                 this.idx = (this.idx + 1) % 2;
+                sout | "Date thread sleeping";
                 sleep(1`s);

benchmark/io/http/protocol.hfa

-              r342af53
+              r8e4aa05
 #pragma once
-struct io_cancellation;
 enum HttpCode {
         OK200 = 0,
+        OK200_PlainText,
         E400,
         E404,
+        E405,
+        E408,
         E413,
         E414,
 …
 int answer_error( int fd, HttpCode code );
 int answer_header( int fd, size_t size );
 int answer_plain( int fd, char buffer [], size_t size );
+int answer_plaintext( int fd );
 int answer_empty( int fd );
 [HttpCode code, bool closed, * const char file, size_t len] http_read(int fd, []char buffer, size_t len, io_cancellation *);
+[HttpCode code, bool closed, * const char file, size_t len] http_read(int fd, []char buffer, size_t len);
 void sendfile( int pipe[2], int fd, int ans_fd, size_t count );
+int sendfile( int pipe[2], int fd, int ans_fd, size_t count );

benchmark/io/http/worker.cfa

-              r342af53
+              r8e4aa05
 #include <unistd.h>
+#include <fstream.hfa>
 #include <iofwd.hfa>
 …
 //=============================================================================================
 void ?{}( Worker & this ) {
+        ((thread&)this){ "Server Worker Thread", *options.clopts.instance };
+        size_t cli = rand() % options.clopts.cltr_cnt;
+        ((thread&)this){ "Server Worker Thread", *options.clopts.instance[cli] };
+        options.clopts.thrd_cnt[cli]++;
         this.pipe[0] = -1;
         this.pipe[1] = -1;
 …
         CONNECTION:
         for() {
+                if( options.log ) printf("=== Accepting connection ===\n");
+                int fd = cfa_accept4( this.[sockfd, addr, addrlen, flags], 0, -1`s, &this.cancel, 0p );
+                // int fd = accept4( this.[sockfd, addr, addrlen, flags] );
+                if( options.log ) sout | "=== Accepting connection ===";
+                int fd = cfa_accept4( this.[sockfd, addr, addrlen, flags], CFA_IO_LAZY );
                 if(fd < 0) {
                         if( errno == ECONNABORTED ) break;
                         if( errno == EINVAL && this.done ) break;
+                        if( this.done && (errno == EINVAL || errno == EBADF) ) break;
                         abort( "accept error: (%d) %s\n", (int)errno, strerror(errno) );
+                }
+                if(this.done) break;
                 if( options.log ) printf("=== New connection %d, waiting for requests ===\n", fd);
+                if( options.log ) sout | "=== New connection" | fd | "" | ", waiting for requests ===";
                 REQUEST:
                 for() {
 …
                         size_t len = options.socket.buflen;
                         char buffer[len];
                         if( options.log ) printf("=== Reading request ===\n");
                         [code, closed, file, name_size] = http_read(fd, buffer, len, &this.cancel);
+                        if( options.log ) sout | "=== Reading request ===";
+                        [code, closed, file, name_size] = http_read(fd, buffer, len);
                         // if we are done, break out of the loop
+                        if( closed ) {
+                                if( options.log ) printf("=== Connection closed ===\n");
+                                close(fd);
+                                continue CONNECTION;
+                        }
+                        if( closed ) break REQUEST;
                         // If this wasn't a valid request, return 400
                         if( code != OK200 ) {
                                 printf("=== Invalid Request : %d ===\n", code_val(code));
+                                sout | "=== Invalid Request :" | code_val(code) | "===";
                                 answer_error(fd, code);
                                 continue REQUEST;
 …
                         if(0 == strncmp(file, "plaintext", min(name_size, sizeof("plaintext") ))) {
                                 if( options.log ) printf("=== Request for /plaintext ===\n");
+                                if( options.log ) sout | "=== Request for /plaintext ===";
+                                char text[] = "Hello, World!\n";
+                                int ret = answer_plaintext(fd);
+                                if( ret == -ECONNRESET ) break REQUEST;
+                                // Send the header
+                                answer_plain(fd, text, sizeof(text));
+                                if( options.log ) printf("=== Answer sent ===\n");
+                                if( options.log ) sout | "=== Answer sent ===";
                                 continue REQUEST;
+                        }
                         if(0 == strncmp(file, "ping", min(name_size, sizeof("ping") ))) {
                                 if( options.log ) printf("=== Request for /ping ===\n");
+                                if( options.log ) sout | "=== Request for /ping ===";
                                 // Send the header
+                                answer_empty(fd);
+                                int ret = answer_empty(fd);
+                                if( ret == -ECONNRESET ) break REQUEST;
                                 if( options.log ) printf("=== Answer sent ===\n");
+                                if( options.log ) sout | "=== Answer sent ===";
                                 continue REQUEST;
+                        }
+                        if( options.log ) printf("=== Request for file %.*s ===\n", (int)name_size, file);
+                        if( options.log ) {
+                                sout | "=== Request for file " | nonl;
+                                write(sout, file, name_size);
+                                sout | " ===";
+                        }
+                        if( !options.file_cache.path ) {
+                                if( options.log ) {
+                                        sout | "=== File Not Found (" | nonl;
+                                        write(sout, file, name_size);
+                                        sout | ") ===";
+                                }
+                                answer_error(fd, E405);
+                                continue REQUEST;
+                        }
                         // Get the fd from the file cache
 …
                         // If we can't find the file, return 404
                         if( ans_fd < 0 ) {
+                                printf("=== File Not Found ===\n");
+                                if( options.log ) {
+                                        sout | "=== File Not Found (" | nonl;
+                                        write(sout, file, name_size);
+                                        sout | ") ===";
+                                }
                                 answer_error(fd, E404);
                                 continue REQUEST;
 …
                         // Send the header
+                        answer_header(fd, count);
+                        int ret = answer_header(fd, count);
+                        if( ret == -ECONNRESET ) break REQUEST;
                         // Send the desired file
+                        sendfile( this.pipe, fd, ans_fd, count);
+                        ret = sendfile( this.pipe, fd, ans_fd, count);
+                        if( ret == -ECONNRESET ) break REQUEST;
                         if( options.log ) printf("=== Answer sent ===\n");
+                        if( options.log ) sout | "=== Answer sent ===";
+                }
+                if( options.log ) sout | "=== Connection closed ===";
+                close(fd);
+                continue CONNECTION;
+        }
+}

benchmark/io/http/worker.hfa

r342af53	r8e4aa05
17	17	socklen_t * addrlen;
18	18	int flags;
19		~~io_cancellation cancel;~~
20	19	volatile bool done;
21	20	};

doc/LaTeXmacros/common.tex

-              r342af53
+              r8e4aa05
 %% Created On       : Sat Apr  9 10:06:17 2016
 %% Last Modified By : Peter A. Buhr
 %% Last Modified On : Mon Oct  5 09:34:46 2020
 %% Update Count     : 464
+%% Last Modified On : Sun Feb 14 15:52:46 2021
+%% Update Count     : 524
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 …
 \setlist[enumerate]{listparindent=\parindent}% global
 \setlist[enumerate,2]{leftmargin=\parindent,labelsep=*,align=parleft,label=\alph*.}% local
 \setlist[description]{itemsep=0pt,listparindent=\parindent,leftmargin=\parindent,labelsep=1.5ex}
+\setlist[description]{topsep=0.5ex,itemsep=0pt,listparindent=\parindent,leftmargin=\parindent,labelsep=1.5ex}
 % Names used in the document.
 \usepackage{xspace}
+\newcommand{\CFAIcon}{\textsf{C}\raisebox{\depth}{\rotatebox{180}{\textsf{A}}}\xspace} % Cforall symbolic name
+\newcommand{\CFA}{\protect\CFAIcon}             % safe for section/caption
+\newcommand{\CFL}{\textrm{Cforall}\xspace} % Cforall symbolic name
+\newcommand{\Celeven}{\textrm{C11}\xspace} % C11 symbolic name
+\newcommand{\CC}{\textrm{C}\kern-.1em\hbox{+\kern-.25em+}\xspace} % C++ symbolic name
+\newcommand{\CCeleven}{\textrm{C}\kern-.1em\hbox{+\kern-.25em+}11\xspace} % C++11 symbolic name
+\newcommand{\CCfourteen}{\textrm{C}\kern-.1em\hbox{+\kern-.25em+}14\xspace} % C++14 symbolic name
+\newcommand{\CCseventeen}{\textrm{C}\kern-.1em\hbox{+\kern-.25em+}17\xspace} % C++17 symbolic name
+\newcommand{\CCtwenty}{\textrm{C}\kern-.1em\hbox{+\kern-.25em+}20\xspace} % C++20 symbolic name
+\newcommand{\CFAIcon}{\textsf{C}\raisebox{\depth}{\rotatebox{180}{\textsf{A}}}} % Cforall icon
+\newcommand{\CFA}{\protect\CFAIcon\xspace}                      % CFA symbolic name
+\newcommand{\CFL}{\textrm{Cforall}\xspace}                      % Cforall non-icon name
+\newcommand{\Celeven}{\textrm{C11}\xspace}                      % C11 symbolic name
+\newcommand{\CCIcon}{\textrm{C}\kern-.1em\hbox{+\kern-.25em+}} % C++ icon
+\newcommand{\CC}{\protect\CCIcon\xspace}                        % C++ symbolic name
+% numbers disallowed in latex variables names => use number names
+\newcommand{\CCeleven}{\protect\CCIcon{11}\xspace}      % C++11 symbolic name
+\newcommand{\CCfourteen}{\protect\CCIcon{14}\xspace} % C++14 symbolic name
+\newcommand{\CCseventeen}{\protect\CCIcon{17}\xspace} % C++17 symbolic name
+\newcommand{\CCtwenty}{\protect\CCIcon{20}\xspace}      % C++20 symbolic name
 \newcommand{\Csharp}{C\raisebox{-0.7ex}{\Large$^\sharp$}\xspace} % C# symbolic name
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% remove special-character warning in PDF side-bar names
 \makeatletter
+\@ifpackageloaded{hyperref}{
+  \pdfstringdefDisableCommands{
+  \def\CFA{\CFL}
+  \def\Celeven{C11\xspace}
+  \def\CC{C++\xspace}
+  \def\CCeleven{C++11\xspace}
+  \def\CCfourteen{C++14\xspace}
+  \def\CCseventeen{C++17\xspace}
+  \def\CCtwenty{C++20\xspace}
+  \def\Csharp{C\#\xspace}
+  \def\lstinline{\xspace}% must use {} as delimiters, e.g., \lstinline{...}
+  }{}
+}
 % parindent is relative, i.e., toggled on/off in environments like itemize, so store the value for
 % use rather than use \parident directly.
 …
     \vskip 50\p@
   }}
 \renewcommand\section{\@startsection{section}{1}{\z@}{-3.5ex \@plus -1ex \@minus -.2ex}{1.75ex \@plus .2ex}{\normalfont\large\bfseries}}
 \renewcommand\subsection{\@startsection{subsection}{2}{\z@}{-3.25ex \@plus -1ex \@minus -.2ex}{1.5ex \@plus .2ex}{\normalfont\normalsize\bfseries}}
+\renewcommand\section{\@startsection{section}{1}{\z@}{-3.0ex \@plus -1ex \@minus -.2ex}{1.5ex \@plus .2ex}{\normalfont\large\bfseries}}
+\renewcommand\subsection{\@startsection{subsection}{2}{\z@}{-2.75ex \@plus -1ex \@minus -.2ex}{1.25ex \@plus .2ex}{\normalfont\normalsize\bfseries}}
 \renewcommand\subsubsection{\@startsection{subsubsection}{3}{\z@}{-2.5ex \@plus -1ex \@minus -.2ex}{1.0ex \@plus .2ex}{\normalfont\normalsize\bfseries}}
 \renewcommand\paragraph{\@startsection{paragraph}{4}{\z@}{-2.0ex \@plus -1ex \@minus -.2ex}{-1em}{\normalfont\normalsize\bfseries}}
 …
 \newcommand{\italic}[1]{\emph{\hyperpage{#1}}}
 \newcommand{\Definition}[1]{\textbf{\hyperpage{#1}}}
 \newcommand{\see}[1]{\emph{see}~#1}
+\newcommand{\see}[1]{(see #1)}
 % Define some commands that produce formatted index entries suitable for cross-references.
 …
 % The star version does not lowercase the index information, e.g., \newterm*{IBM}.
 \newcommand{\newtermFontInline}{\emph}
 \newcommand{\newterm}{\@ifstar\@snewterm\@newterm}
+\newcommand{\newterm}{\protect\@ifstar\@snewterm\@newterm}
 \newcommand{\@newterm}[2][\@empty]{\lowercase{\def\temp{#2}}{\newtermFontInline{#2}}\ifx#1\@empty\index{\temp}\else\index{#1@{\protect#2}}\fi}
 \newcommand{\@snewterm}[2][\@empty]{{\newtermFontInline{#2}}\ifx#1\@empty\index{#2}\else\index{#1@{\protect#2}}\fi}
 …
 \usepackage{listings}                                                                   % format program code
 \usepackage{lstlang}
+\usepackage{calc}                                                                               % latex arithmetic
 \makeatletter
 \newcommand{\LstBasicStyle}[1]{{\lst@basicstyle{#1}}}
 \newcommand{\LstKeywordStyle}[1]{{\lst@basicstyle{\lst@keywordstyle{#1}}}}
 \newcommand{\LstCommentStyle}[1]{{\lst@basicstyle{\lst@commentstyle{#1}}}}
+\newcommand{\LstStringStyle}[1]{{\lst@basicstyle{\lst@stringstyle{#1}}}}
 \newlength{\gcolumnposn}                                % temporary hack because lstlisting does not handle tabs correctly
 …
 xleftmargin=\parindentlnth,                             % indent code to paragraph indentation
 extendedchars=true,                                             % allow ASCII characters in the range 128-255
 escapechar=§,                                                   % LaTeX escape in CFA code §...§ (section symbol), emacs: C-q M-'
 mathescape=true,                                                % LaTeX math escape in CFA code $...$
+escapechar=\$,                                                  % LaTeX escape in CFA code §...§ (section symbol), emacs: C-q M-'
+mathescape=false,                                               % LaTeX math escape in CFA code $...$
 keepspaces=true,                                                %
 showstringspaces=false,                                 % do not show spaces with cup
 showlines=true,                                                 % show blank lines at end of code
 aboveskip=4pt,                                                  % spacing above/below code block
+belowskip=3pt,
+belowskip=0pt,
+numberstyle=\footnotesize\sf,                   % numbering style
 % replace/adjust listing characters that look bad in sanserif
 literate={-}{\makebox[1ex][c]{\raisebox{0.4ex}{\rule{0.75ex}{0.1ex}}}}1 {^}{\raisebox{0.6ex}{$\scriptscriptstyle\land\,$}}1
 …
 \ifdefined\CFALatin% extra Latin-1 escape characters
 \lstnewenvironment{cfa}[1][]{
+\lstnewenvironment{cfa}[1][]{% necessary
 \lstset{
 language=CFA,
+moredelim=**[is][\color{red}]{®}{®},    % red highlighting ®...® (registered trademark symbol) emacs: C-q M-.
+moredelim=**[is][\color{blue}]{ß}{ß},   % blue highlighting ß...ß (sharp s symbol) emacs: C-q M-_
+moredelim=**[is][\color{OliveGreen}]{¢}{¢}, % green highlighting ¢...¢ (cent symbol) emacs: C-q M-"
+moredelim=[is][\lstset{keywords={}}]{¶}{¶}, % keyword escape ¶...¶ (pilcrow symbol) emacs: C-q M-^
+% replace/adjust listing characters that look bad in sanserif
+add to literate={`}{\ttfamily\upshape\hspace*{-0.1ex}`}1
+moredelim=**[is][\color{red}]{@}{@},    % red highlighting @...@
+%moredelim=**[is][\color{red}]{®}{®},   % red highlighting ®...® (registered trademark symbol) emacs: C-q M-.
+%moredelim=**[is][\color{blue}]{ß}{ß},  % blue highlighting ß...ß (sharp s symbol) emacs: C-q M-_
+%moredelim=**[is][\color{OliveGreen}]{¢}{¢}, % green highlighting ¢...¢ (cent symbol) emacs: C-q M-"
+%moredelim=[is][\lstset{keywords={}}]{¶}{¶}, % keyword escape ¶...¶ (pilcrow symbol) emacs: C-q M-^
 }% lstset
 \lstset{#1}
+\lstset{#1}% necessary
 }{}
 % inline code ©...© (copyright symbol) emacs: C-q M-)
 \lstMakeShortInline©                                    % single-character for \lstinline
 \else% regular ASCI characters
 \lstnewenvironment{cfa}[1][]{
+\lstnewenvironment{cfa}[1][]{% necessary
 \lstset{
 language=CFA,
 escapechar=\$,                                                  % LaTeX escape in CFA code
+mathescape=false,                                               % LaTeX math escape in CFA code $...$
 moredelim=**[is][\color{red}]{@}{@},    % red highlighting @...@
 }% lstset
 \lstset{#1}
+\lstset{#1}% necessary
 }{}
 % inline code @...@ (at symbol)

doc/bibliography/pl.bib

-              r342af53
+              r8e4aa05
     title       = {Asynchronous Exception Propagation in Blocked Tasks},
     booktitle   = {4th International Workshop on Exception Handling (WEH.08)},
     organization= {16th International Symposium on the Foundations of Software Engineering (FSE 16)},
+    optorganization= {16th International Symposium on the Foundations of Software Engineering (FSE 16)},
     address     = {Atlanta, U.S.A},
     month       = nov,
 …
+}
 @article{Delisle19,
+@article{Delisle20,
     keywords    = {concurrency, Cforall},
     contributer = {pabuhr@plg},
     author      = {Thierry Delisle and Peter A. Buhr},
     title       = {Advanced Control-flow and Concurrency in \textsf{C}$\mathbf{\forall}$},
     year        = 2019,
+    year        = 2020,
     journal     = spe,
+    pages       = {1-33},
+    note        = {submitted},
+    pages       = {1-38},
+    note        = {\href{https://doi-org.proxy.lib.uwaterloo.ca/10.1002/spe.2925}{https://\-doi-org.proxy.lib.uwaterloo.ca/\-10.1002/\-spe.2925}},
+    note        = {},
+}
 …
 @inproceedings{Edelson92,
     keywords    = {persistence, pointers},
+    keywords    = {persistence, smart pointers},
     contributer = {pabuhr@plg},
     author      = {Daniel R. Edelson},
 …
     year        = 1992,
     pages       = {1-19},
+}
+@incollection{smartpointers,
+    keywords    = {smart pointers},
+    contributer = {pabuhr@plg},
+    author      = {Andrei Alexandrescu},
+    title       = {Smart Pointers},
+    booktitle   = {Modern C++ Design: Generic Programming and Design Patterns Applied},
+    publisher   = {Addison-Wesley},
+    year        = 2001,
+    chapter     = 7,
+    optpages    = {?-?},
+}
 …
+}
+@misc{vistorpattern,
+    keywords    = {visitor pattern},
+    contributer = {pabuhr@plg},
+    key         = {visitor pattern},
+    title       = {Visitor Pattern},
+    year        = 2020,
+    note        = {WikipediA},
+    howpublished= {\href{https://en.wikipedia.org/wiki/Visitor\_pattern}
+                  {https://\-en.wikipedia.org/\-wiki/\-Visitor\_pattern}},
+}
 % W

doc/papers/concurrency/mail2

-              r342af53
+              r8e4aa05
+From: "Wiley Online Proofing" <onlineproofing@eproofing.in>
+To: pabuhr@uwaterloo.ca
+Reply-To: eproofing@wiley.com
+Date: 3 Nov 2020 08:25:06 +0000
+Subject: Action: Proof of SPE_EV_SPE2925 for Software: Practice And Experience ready for review
+Dear Dr. Peter Buhr,
+The proof of your Software: Practice And Experience article Advanced control-flow in Cforall is now available for review:
+Edit Article https://wiley.eproofing.in/Proof.aspx?token=ab7739d5678447fbbe5036f3bcba2445081500061
+To review your article, please complete the following steps, ideally within 48 hours*, so we can publish your article as quickly as possible.
+. Open your proof in the online proofing system using the button above.
+. Check the article for correctness and respond to all queries.For instructions on using the system, please see the "Help" menu in the upper right corner.
+. Submit your changes by clicking the "Submit" button in the proofing system.
+Helpful Tips
+*  Your manuscript has been formatted following the style requirements for the journal. Any requested changes that go against journal style will not be made.
+*  Your proof will include queries. These must be replied to using the system before the proof can be submitted.
+*  The only acceptable changes at this stage are corrections to grammatical errors or data accuracy, or to provide higher resolution figure files (if requested by the typesetter).
+*  Any changes to scientific content or authorship will require editorial review and approval.
+*  Once your changes are complete, submit the article after which no additional corrections can be requested.
+*  Most authors complete their corrections within 48 hours. Returning any corrections promptly will accelerate publication of your article.
+If you encounter any problems or have questions, please contact the production office at (SPEproofs@wiley.com). For the quickest response, include the journal name and your article ID (found in the subject line) in all correspondence.
+Best regards,
+Software: Practice And Experience Production Office
+* We appreciate that the COVID-19 pandemic may create conditions for you that make it difficult for you to review your proof within standard timeframes. If you have any problems keeping to this schedule, please reach out to me at (SPEproofs@wiley.com) to discuss alternatives.
 From: "Pacaanas, Joel -" <jpacaanas@wiley.com>
 To: "Peter A. Buhr" <pabuhr@uwaterloo.ca>
 …
 Since the proof was reset, your added corrections before has also been removed. Please add them back.
 Please return your corrections at your earliest convenience.
 …
 Best regards,
 Joel Pacaanas
+Date: Wed, 2 Dec 2020 08:49:52 +0000
+From: <cs-author@wiley.com>
+To: <pabuhr@uwaterloo.ca>
+Subject: Published: Your article is now published in Early View!
+Dear Peter Buhr,
+Your article Advanced Control-flow and Concurrency in C A in Software: Practice and Experience has the following publication status: Published as Early View
+To access your article, please click the following link to register or log in:
+  https://authorservices.wiley.com/index.html#register
+You can also access your published article via this link: http://dx.doi.org/10.1002/spe.2925
+If you need any assistance, please click here https://hub.wiley.com/community/support/authorservices to view our Help section.
+Sincerely,
+Wiley Author Services
+Date: Wed, 2 Dec 2020 02:16:23 -0500
+From: <no-reply@copyright.com>
+To: <pabuhr@uwaterloo.ca>
+CC: <SPEproofs@wiley.com>
+Subject: Please submit your publication fee(s) SPE2925
+John Wiley and Sons
+Please submit your selection and payment for publication fee(s).
+Dear Peter A. Buhr,
+Congratulations, your article in Software: Practice and Experience has published online:
+Manuscript DOI: 10.1002/spe.2925
+Manuscript ID: SPE2925
+Manuscript Title: Advanced control-flow in Cforall
+Published by: John Wiley and Sons
+Please carefully review your publication options. If you wish your colour
+figures to be printed in colour, you must select and pay for that option now
+using the RightsLink e-commerce solution from CCC.
+  Review my options & pay charges
+  https://oa.copyright.com/apc-payment-ui/overview?id=f46ba36a-2565-4c8d-8865-693bb94d87e5&chargeset=CHARGES
+To review and pay your charge(s), please click here
+<https://oa.copyright.com/apc-payment-ui/overview?id=f46ba36a-2565-4c8d-8865-693bb94d87e5&chargeset=CHARGES>. You
+can also forward this link to another party for processing.
+To complete a secure transaction, you will need a RightsLink account
+<https://oa.copyright.com/apc-payment-ui/registration?id=f46ba36a-2565-4c8d-8865-693bb94d87e5&chargeset=CHARGES>. If
+you do not have one already, you will be prompted to register as you are
+checking out your author charges. This is a very quick process; the majority of
+your registration form will be pre-populated automatically with information we
+have already supplied to RightsLink.
+If you have any questions about these charges, please contact CCC Customer
+Service <wileysupport@copyright.com> using the information below. Please do not
+reply directly to this email as this is an automated email notification sent
+from an unmonitored account.
+Sincerely,
+John Wiley and Sons
+Tel.: +1-877-622-5543 / +1-978-646-2777
+wileysupport@copyright.com
+www.copyright.com
+Copyright Clearance Center
+RightsLink
+This message (including attachments) is confidential, unless marked
+otherwise. It is intended for the addressee(s) only. If you are not an intended
+recipient, please delete it without further distribution and reply to the
+sender that you have received the message in error.
+From: "Pacaanas, Joel -" <jpacaanas@wiley.com>
+To: "Peter A. Buhr" <pabuhr@uwaterloo.ca>
+Subject: RE: Please submit your publication fee(s) SPE2925
+Date: Thu, 3 Dec 2020 08:45:10 +0000
+Dear Dr Buhr,
+Thank you for your email and concern with regard to the RightsLink account. As
+you have mentioned that all figures will be printed as black and white, then I
+have selected it manually from the system to proceed further.
+Best regards,
+Joel
+Joel Q. Pacaanas
+Production Editor
+On behalf of Wiley
+Manila
+We partner with global experts to further innovative research.
+E-mail: jpacaanas@wiley.com
+Tel: +632 88558618
+Fax: +632 5325 0768
+-----Original Message-----
+From: Peter A. Buhr [mailto:pabuhr@uwaterloo.ca]
+Sent: Thursday, December 3, 2020 12:28 AM
+To: SPE Proofs <speproofs@wiley.com>
+Subject: Re: Please submit your publication fee(s) SPE2925
+I am trying to complete the forms to submit my publication fee.
+I clicked all the boxs to print in Black and White, so there is no fee.
+I then am asked to create RightsLink account, which I did.
+However, it requires that I click a box agreeing to:
+   I consent to have my contact information shared with my publisher and/or
+   funding organization, as needed, to facilitate APC payment(s), reporting and
+   customer care.
+I do not agree to this sharing and will not click this button.
+How would you like to proceed?
+From: "Pacaanas, Joel -" <jpacaanas@wiley.com>
+To: "Peter A. Buhr" <pabuhr@uwaterloo.ca>
+Subject: RE: Please submit your publication fee(s) SPE2925
+Date: Fri, 4 Dec 2020 07:55:59 +0000
+Dear Peter,
+Yes, you are now done with this selection.
+Thank you.
+Best regards,
+Joel
+Joel Q. Pacaanas
+Production Editor
+On behalf of Wiley
+Manila
+We partner with global experts to further innovative research.
+E-mail: jpacaanas@wiley.com
+Tel: +632 88558618
+Fax: +632 5325 0768
+-----Original Message-----
+From: Peter A. Buhr [mailto:pabuhr@uwaterloo.ca]
+Sent: Thursday, December 3, 2020 10:29 PM
+To: Pacaanas, Joel - <jpacaanas@wiley.com>
+Subject: Re: Please submit your publication fee(s) SPE2925
+    Thank you for your email and concern with regard to the RightsLink
+    account. As you have mentioned that all figures will be printed as black and
+    white, then I have selected it manually from the system to proceed further.
+Just be clear, am I done? Meaning I do not have to go back to that web-page again.

doc/theses/andrew_beach_MMath/.gitignore

r342af53	r8e4aa05
3	3
4	4	# Final Files:
5		~~thesis~~.pdf
	5	*.pdf
6	6
7	7	# The Makefile here is not generated.

doc/theses/andrew_beach_MMath/Makefile

-              r342af53
+              r8e4aa05
 ### Makefile for Andrew Beach's Masters Thesis
 DOC=thesis.pdf
+DOC=uw-ethesis.pdf
 BUILD=out
 TEXSRC=$(wildcard *.tex)
 …
 STYSRC=$(wildcard *.sty)
 CLSSRC=$(wildcard *.cls)
 TEXLIB= .:${BUILD}:
+TEXLIB= .:../../LaTeXmacros:${BUILD}:
 BIBLIB= .:../../bibliography
 …
         ${LATEX} ${BASE}
         ${BIBTEX} ${BUILD}/${BASE}
+        ${LATEX} ${BASE}
         ${GLOSSARY} ${BUILD}/${BASE}
         ${LATEX} ${BASE}

doc/theses/andrew_beach_MMath/existing.tex

-              r342af53
+              r8e4aa05
+\chapter{\CFA{} Existing Features}
+\section{Overloading and extern}
+Cforall has overloading, allowing multiple definitions of the same name to
+be defined.
+This also adds name mangling so that the assembly symbols are unique for
+different overloads. For compatibility with names in C there is also a
+syntax to disable the name mangling. These unmangled names cannot be overloaded
+but act as the interface between C and \CFA code.
+The syntax for disabling mangling is:
+\begin{lstlisting}
+extern "C" {
+    ...
+}
+\end{lstlisting}
+To re-enable mangling once it is disabled the syntax is:
+\begin{lstlisting}
+extern "Cforall" {
+    ...
+}
+\end{lstlisting}
+Both should occur at the declaration level and affect all the declarations
+in \texttt{...}. Neither cares about the state of mangling when they begin
+and will return to that state after the group is finished. So re-enabling
+is only used to nest areas of mangled and unmangled declarations.
+\section{References}
+\CFA adds references to C. These are auto-dereferencing pointers and use the
+same syntax as pointers except they use ampersand (\codeCFA{\&}) instead of
+the asterisk (\codeCFA{*}). They can also be constant or mutable, if they
+are mutable they may be assigned to by using the address-of operator
+(\codeCFA\&) which converts them into a pointer.
+\chapter{\CFA Existing Features}
+\CFA (C-for-all)~\cite{Cforall} is an open-source project extending ISO C with
+modern safety and productivity features, while still ensuring backwards
+compatibility with C and its programmers.  \CFA is designed to have an
+orthogonal feature-set based closely on the C programming paradigm
+(non-object-oriented) and these features can be added incrementally to an
+existing C code-base allowing programmers to learn \CFA on an as-needed basis.
+Only those \CFA features pertinent to this thesis are discussed.  Many of the
+\CFA syntactic and semantic features used in the thesis should be fairly
+obvious to the reader.
+\section{Overloading and \lstinline{extern}}
+\CFA has extensive overloading, allowing multiple definitions of the same
+name~\cite{Moss18}.
+\begin{cfa}
+char i; int i; double i;                        $\C[3.75in]{// variable overload}$
+int f(); double f();                            $\C{// return overload}$
+void g( int ); void g( double );        $\C{// parameter overload}\CRT$
+\end{cfa}
+This feature requires name mangling so the assembly symbols are unique for
+different overloads. For compatibility with names in C, there is also a syntax
+to disable name mangling. These unmangled names cannot be overloaded but act as
+the interface between C and \CFA code.  The syntax for disabling/enabling
+mangling is:
+\begin{cfa}
+// name mangling
+int i; // _X1ii_1
+@extern "C"@ {  // no name mangling
+        int j; // j
+        @extern "Cforall"@ {  // name mangling
+                int k; // _X1ki_1
+        }
+        // no name mangling
+}
+// name mangling
+\end{cfa}
+Both forms of @extern@ affect all the declarations within their nested lexical
+scope and transition back to the previous mangling state when the lexical scope
+ends.
+\section{Reference Type}
+\CFA adds a rebindable reference type to C that is more expressive than the \Cpp
+reference.  Multi-level references are allowed and act like auto-dereferenced
+pointers using the ampersand (@&@) instead of the pointer asterisk (@*@). \CFA
+references may also be mutable or non-mutable. If mutable, a reference variable
+may be assigned to using the address-of operator (@&@), which converts the
+reference to a pointer.
+\begin{cfa}
+int i, j;
+int @&@ ri = i, @&&@ rri = ri;
+rri = 3;  // auto-dereference assign to i
+@&@ri = @&@j; // rebindable
+ri = 5;   // assign to j
+\end{cfa}
 \section{Constructors and Destructors}
 Both constructors and destructors are operators, which means they are just
+functions with special names. The special names are used to define them and
+may be used to call the functions explicitly. The \CFA special names are
+constructed by taking the tokens in the operators and putting \texttt{?} where
+the arguments would go. So multiplication is \texttt{?*?} while dereference
+is \texttt{*?}. This also makes it easy to tell the difference between
+pre-fix operations (such as \texttt{++?}) and post-fix operations
+(\texttt{?++}).
+The special name for constructors is \texttt{?\{\}}, which comes from the
+initialization syntax in C. The special name for destructors is
+\texttt{\^{}?\{\}}. % I don't like the \^{} symbol but $^\wedge$ isn't better.
+Any time a type T goes out of scope the destructor matching
+\codeCFA{void ^?\{\}(T \&);} is called. In theory this is also true of
+primitive types such as \codeCFA{int}, but in practice those are no-ops and
+are usually omitted for optimization.
+functions with special operator names rather than type names in \Cpp. The
+special operator names may be used to call the functions explicitly (not
+allowed in \Cpp for constructors).
+In general, operator names in \CFA are constructed by bracketing an operator
+token with @?@, which indicates where the arguments go. For example, infix
+multiplication is @?*?@ while prefix dereference is @*?@. This syntax makes it
+easy to tell the difference between prefix operations (such as @++?@) and
+post-fix operations (@?++@).
+The special name for a constructor is @?{}@, which comes from the
+initialization syntax in C. The special name for a destructor is @^?{}@, where
+the @^@ has no special meaning.
+% I don't like the \^{} symbol but $^\wedge$ isn't better.
+\begin{cfa}
+struct T { ... };
+void ?@{}@(@T &@ this, ...) { ... }  // constructor
+void @^?{}@(@T &@ this, ...) { ... } // destructor
+{
+        T s = @{@ ... @}@;  // same constructor/initialization braces
+} // destructor call automatically generated
+\end{cfa}
+The first parameter is a reference parameter to the type for the
+constructor/destructor. Destructors may have multiple parameters.  The compiler
+implicitly matches an overloaded constructor @void ?{}(T &, ...);@ to an
+object declaration with associated initialization, and generates a construction
+call after the object is allocated. When an object goes out of scope, the
+matching overloaded destructor @void ^?{}(T &);@ is called.  Without explicit
+definition, \CFA creates a default and copy constructor, destructor and
+assignment (like \Cpp). It is possible to define constructors/destructors for
+basic and existing types.
 \section{Polymorphism}
+\CFA uses polymorphism to create functions and types that are defined over
+different types. \CFA polymorphic declarations serve the same role as \CPP
+templates or Java generics.
+Polymorphic declarations start with a forall clause that goes before the
+standard (monomorphic) declaration. These declarations have the same syntax
+except that you may use the names introduced by the forall clause in them.
+Forall clauses are written \codeCFA{forall( ... )} where \codeCFA{...} becomes
+the list of polymorphic variables (local type names) and assertions, which
+represent required operations on those types.
+\begin{lstlisting}
+forall(dtype T | { void do_once(T &); })
+void do_twice(T & value) {
+    do_once(value);
+    do_once(value);
+}
+\end{lstlisting}
+A polymorphic function can be used in the same way normal functions are.
+The polymorphic variables are filled in with concrete types and the
+assertions are checked. An assertion is checked by seeing if that name of that
+type (with all the variables replaced with the concrete types) is defined at
+the call site.
+As an example, even if no function named \codeCFA{do_once} is defined
+near the definition of \codeCFA{do_twice} the following code will work.
+\begin{lstlisting}
+\CFA uses parametric polymorphism to create functions and types that are
+defined over multiple types. \CFA polymorphic declarations serve the same role
+as \Cpp templates or Java generics. The ``parametric'' means the polymorphism is
+accomplished by passing argument operations to associate \emph{parameters} at
+the call site, and these parameters are used in the function to differentiate
+among the types the function operates on.
+Polymorphic declarations start with a universal @forall@ clause that goes
+before the standard (monomorphic) declaration. These declarations have the same
+syntax except they may use the universal type names introduced by the @forall@
+clause.  For example, the following is a polymorphic identity function that
+works on any type @T@:
+\begin{cfa}
+@forall( T )@ @T@ identity( @T@ val ) { return val; }
+int forty_two = identity( 42 ); // T bound to int, forty_two == 42
+\end{cfa}
+To allow a polymorphic function to be separately compiled, the type @T@ must be
+constrained by the operations used on @T@ in the function body. The @forall@
+clause is augmented with a list of polymorphic variables (local type names)
+and assertions (constraints), which represent the required operations on those
+types used in a function, \eg:
+\begin{cfa}
+forall( T @| { void do_once(T); }@) // assertion
+void do_twice(T value) {
+        do_once(value);
+        do_once(value);
+}
+void do_once(int i) { ... }  // provide assertion
+int i;
+do_twice(i); // implicitly pass assertion do_once to do_twice
+\end{cfa}
+Any object with a type fulfilling the assertion may be passed as an argument to
+a @do_twice@ call.
+A polymorphic function can be used in the same way as a normal function.  The
+polymorphic variables are filled in with concrete types and the assertions are
+checked. An assertion is checked by verifying each assertion operation (with
+all the variables replaced with the concrete types from the arguments) is
+defined at a call site.
+Note, a function named @do_once@ is not required in the scope of @do_twice@ to
+compile it, unlike \Cpp template expansion. Furthermore, call-site inferencing
+allows local replacement of the most specific parametric functions needed for a
+call.
+\begin{cfa}
+void do_once(double y) { ... } // global
 int quadruple(int x) {
+    void do_once(int & y) {
+        y = y * 2;
+    }
+    do_twice(x);
+    return x;
+}
+\end{lstlisting}
+This is not the recommended way to implement a quadruple function but it
+does work. The compiler will deduce that \codeCFA{do_twice}'s T is an
+integer from the argument. It will then look for a definition matching the
+assertion which is the \codeCFA{do_once} defined within the function. That
+function will be passed in as a function pointer to \codeCFA{do_twice} and
+called within it.
+To avoid typing out long lists of assertions again and again there are also
+traits which collect assertions into convenient packages that can then be used
+in assertion lists instead of all of their components.
+\begin{lstlisting}
+trait done_once(dtype T) {
+    void do_once(T &);
+}
+\end{lstlisting}
+After this the forall list in the previous example could instead be written
+with the trait instead of the assertion itself.
+\begin{lstlisting}
+forall(dtype T | done_once(T))
+\end{lstlisting}
+Traits can have arbitrary number of assertions in them and are usually used to
+create short hands for, and give descriptive names to, common groupings of
+assertions.
+Polymorphic structures and unions may also be defined by putting a forall
+clause before the declaration. The type variables work the same way except
+are now used in field declarations instead of parameters and local variables.
+\begin{lstlisting}
+        void do_once(int y) { y = y * 2; } // local
+        do_twice(x); // using local "do_once"
+        return x;
+}
+\end{cfa}
+Specifically, the compiler deduces that @do_twice@'s T is an integer from the
+argument @x@. It then looks for the most specific definition matching the
+assertion, which is the nested integral @do_once@ defined within the
+function. The matched assertion function is then passed as a function pointer
+to @do_twice@ and called within it.
+To avoid typing long lists of assertions, constraints can be collected into
+convenient packages called a @trait@, which can then be used in an assertion
+instead of the individual constraints.
+\begin{cfa}
+trait done_once(T) {
+        void do_once(T);
+}
+\end{cfa}
+and the @forall@ list in the previous example is replaced with the trait.
+\begin{cfa}
+forall(dtype T | @done_once(T)@)
+\end{cfa}
+In general, a trait can contain an arbitrary number of assertions, both
+functions and variables, and are usually used to create a shorthand for, and
+give descriptive names to, common groupings of assertions describing a certain
+functionality, like @sumable@, @listable@, \etc.
+Polymorphic structures and unions are defined by qualifying the aggregate type
+with @forall@. The type variables work the same except they are used in field
+declarations instead of parameters, returns, and local variable declarations.
+\begin{cfa}
 forall(dtype T)
 struct node {
+    node(T) * next;
+    T * data;
+}
+\end{lstlisting}
+The \codeCFA{node(T)} is a use of a polymorphic structure. Polymorphic types
+must be provided their polymorphic parameters.
+There are many other features of polymorphism that have not been given here but
+these are the ones used by the exception system.
+        node(T) * next;  // generic linked node
+        T * data;
+}
+\end{cfa}
+The generic type @node(T)@ is an example of a polymorphic-type usage.  Like \Cpp
+templates usage, a polymorphic-type usage must specify a type parameter.
+There are many other polymorphism features in \CFA but these are the ones used
+by the exception system.
 \section{Concurrency}
+\CFA has a number of concurrency features, \codeCFA{thread}s,
+\codeCFA{monitor}s and \codeCFA{mutex} parameters, \codeCFA{coroutine}s and
+\codeCFA{generator}s. The two features that interact with the exception system
+are \codeCFA{thread}s and \codeCFA{coroutine}s; they and their supporting
+constructs will be described here.
+\subsection{Coroutines}
+Coroutines are routines that do not have to finish execution to hand control
+back to their caller, instead they may suspend their execution at any time
+and resume it later.
+Coroutines are not true concurrency but share some similarities and many of
+the same underpinnings and so are included as part of the \CFA threading
+library.
+In \CFA coroutines are created using the \codeCFA{coroutine} keyword which
+works just like \codeCFA{struct} except that the created structure will be
+modified by the compiler to satisfy the \codeCFA{is_coroutine} trait.
+These structures act as the interface between callers and the coroutine,
+the fields are used to pass information in and out. Here is a simple example
+where the single field is used to pass the next number in a sequence out.
+\begin{lstlisting}
+\CFA has a number of concurrency features: @thread@, @monitor@, @mutex@
+parameters, @coroutine@ and @generator@. The two features that interact with
+the exception system are @thread@ and @coroutine@; they and their supporting
+constructs are described here.
+\subsection{Coroutine}
+A coroutine is a type with associated functions, where the functions are not
+required to finish execution when control is handed back to the caller. Instead
+they may suspend execution at any time and be resumed later at the point of
+last suspension. (Generators are stackless and coroutines are stackful.) These
+types are not concurrent but share some similarities along with common
+underpinnings, so they are combined with the \CFA threading library. Further
+discussion in this section only refers to the coroutine because generators are
+similar.
+In \CFA, a coroutine is created using the @coroutine@ keyword, which is an
+aggregate type like @struct@, except the structure is implicitly modified by
+the compiler to satisfy the @is_coroutine@ trait; hence, a coroutine is
+restricted by the type system to types that provide this special trait.  The
+coroutine structure acts as the interface between callers and the coroutine,
+and its fields are used to pass information in and out of coroutine interface
+functions.
+Here is a simple example where a single field is used to pass (communicate) the
+next number in a sequence.
+\begin{cfa}
 coroutine CountUp {
+    unsigned int next;
+}
+\end{lstlisting}
+The routine part of the coroutine is a main function for the coroutine. It
+takes a reference to a coroutine object and returns nothing. In this function,
+and any functions called by this function, the suspend statement may be used
+to return execution to the coroutine's caller. When control returns to the
+function it continues from that same suspend statement instead of at the top
+of the function.
+\begin{lstlisting}
+void main(CountUp & this) {
+    unsigned int next = 0;
+    while (true) {
+        this.next = next;
+        suspend;
+        next = next + 1;
+    }
+}
+\end{lstlisting}
+Control is passed to the coroutine with the resume function. This includes the
+first time when the coroutine is starting up. The resume function takes a
+reference to the coroutine structure and returns the same reference. The
+return value is for easy access to communication variables. For example the
+next value from a count-up can be generated and collected in a single
+expression: \codeCFA{resume(count).next}.
+        unsigned int next; // communication variable
+}
+CountUp countup;
+\end{cfa}
+Each coroutine has a @main@ function, which takes a reference to a coroutine
+object and returns @void@.
+\begin{cfa}[numbers=left]
+void main(@CountUp & this@) { // argument matches trait is_coroutine
+        unsigned int up = 0;  // retained between calls
+        while (true) {
+                next = up; // make "up" available outside function
+                @suspend;@$\label{suspend}$
+                up += 1;
+        }
+}
+\end{cfa}
+In this function, or functions called by this function (helper functions), the
+@suspend@ statement is used to return execution to the coroutine's caller
+without terminating the coroutine.
+A coroutine is resumed by calling the @resume@ function, \eg @resume(countup)@.
+The first resume calls the @main@ function at the top. Thereafter, resume calls
+continue a coroutine in the last suspended function after the @suspend@
+statement, in this case @main@ line~\ref{suspend}.  The @resume@ function takes
+a reference to the coroutine structure and returns the same reference. The
+return value allows easy access to communication variables defined in the
+coroutine object. For example, the @next@ value for coroutine object @countup@
+is both generated and collected in the single expression:
+@resume(countup).next@.
 \subsection{Monitors and Mutex}
+True concurrency does not guarantee ordering. To get some of that ordering back
+\CFA uses monitors and mutex (mutual exclusion) parameters. A monitor is
+another special declaration that contains a lock and is compatible with mutex
+parameters.
+Function parameters can have the \codeCFA{mutex} qualifiers on reference
+arguments, for example \codeCFA{void example(a_monitor & mutex arg);}. When the
+function is called it will acquire the lock on all of the mutex parameters.
+This means that all functions that mutex on a type are part of a critical
+section and only one will ever run at a time.
+Concurrency does not guarantee ordering; without ordering results are
+non-deterministic. To claw back ordering, \CFA uses monitors and @mutex@
+(mutual exclusion) parameters. A monitor is another kind of aggregate, where
+the compiler implicitly inserts a lock and instances are compatible with
+@mutex@ parameters.
+A function that requires deterministic (ordered) execution, acquires mutual
+exclusion on a monitor object by qualifying an object reference parameter with
+@mutex@.
+\begin{cfa}
+void example(MonitorA & @mutex@ argA, MonitorB & @mutex@ argB);
+\end{cfa}
+When the function is called, it implicitly acquires the monitor lock for all of
+the mutex parameters without deadlock.  This semantics means all functions with
+the same mutex type(s) are part of a critical section for objects of that type
+and only one runs at a time.
 \subsection{Threads}
+While coroutines allow new things to be done with a single execution path
+threads actually introduce new paths of execution that continue independently.
+Now for threads to work together there must be some communication between them
+and that means the timing of certain operations does have to be known. There
+are various means of synchronization and mutual exclusion provided by \CFA but
+for exceptions only the basic two -- fork and join -- are needed.
+Threads are created like coroutines except the keyword is changed:
+\begin{lstlisting}
+Functions, generators, and coroutines are sequential so there is only a single
+(but potentially sophisticated) execution path in a program. Threads introduce
+multiple execution paths that continue independently.
+For threads to work safely with objects requires mutual exclusion using
+monitors and mutex parameters. For threads to work safely with other threads,
+also requires mutual exclusion in the form of a communication rendezvous, which
+also supports internal synchronization as for mutex objects. For exceptions
+only the two basic operations are important: thread fork and join.
+Threads are created like coroutines with an associated @main@ function:
+\begin{cfa}
 thread StringWorker {
     const char * input;
     int result;
+        const char * input;
+        int result;
 };
 void main(StringWorker & this) {
+    const char * localCopy = this.input;
+    // ... do some work, perhaps hashing the string ...
+    this.result = result;
+}
+\end{lstlisting}
+The main function will start executing after the fork operation and continue
+executing until it is finished. If another thread joins with this one it will
+wait until main has completed execution. In other words everything the thread
+does is between fork and join.
+From the outside this is the creation and destruction of the thread object.
+Fork happens after the constructor is run and join happens before the
+destructor runs. Join also happens during the \codeCFA{join} function which
+can be used to join a thread earlier. If it is used the destructor does not
+join as that has already been completed.
+        const char * localCopy = this.input;
+        // ... do some work, perhaps hashing the string ...
+        this.result = result;
+}
+{
+        StringWorker stringworker; // fork thread running in "main"
+} // implicitly join with thread $\(\Rightarrow\)$ wait for completion
+\end{cfa}
+The thread main is where a new thread starts execution after a fork operation
+and then the thread continues executing until it is finished. If another thread
+joins with an executing thread, it waits until the executing main completes
+execution. In other words, everything a thread does is between a fork and join.
+From the outside, this behaviour is accomplished through creation and
+destruction of a thread object.  Implicitly, fork happens after a thread
+object's constructor is run and join happens before the destructor runs. Join
+can also be specified explicitly using the @join@ function to wait for a
+thread's completion independently from its deallocation (\ie destructor
+call). If @join@ is called explicitly, the destructor does not implicitly join.

doc/theses/andrew_beach_MMath/features.tex

-              r342af53
+              r8e4aa05
+\chapter{Features}
+This chapter covers the design and user interface of the \CFA exception
+handling mechanism.
+\section{Virtual Casts}
+Virtual casts and virtual types are not truly part of the exception system but
+they did not exist in \CFA and are useful in exceptions. So a minimal version
+of the virtual system was designed and implemented.
+Virtual types are organized in simple hierarchies. Each virtual type may have
+a parent and can have any number of children. A type's descendants are its
+children and its children's descendants. A type may not be its own descendant.
+Each virtual type has an associated virtual table type. A virtual table is a
+structure that has fields for all the virtual members of a type. A virtual
+type has all the virtual members of its parent and can add more. It may also
+update the values of the virtual members.
+Except for virtual casts, this is only used internally in the exception
+system. There is no general purpose interface for the other features. A
+virtual cast has the following syntax:
+\begin{lstlisting}
+\chapter{Exception Features}
+This chapter covers the design and user interface of the \CFA
+exception-handling mechanism.
+\section{Virtuals}
+Virtual types and casts are not part of the exception system nor are they
+required for an exception system. But an object-oriented style hierarchy is a
+great way of organizing exceptions so a minimal virtual system has been added
+to \CFA.
+The pattern of a simple hierarchy, borrowed from object-oriented
+programming, was chosen for several reasons.
+The first is that it allows new exceptions to be added in user code
+and in libraries independently of each other. Another is it allows for
+different levels of exception grouping (all exceptions, all IO exceptions or
+a particular IO exception). It also provides a simple way of passing
+data back and forth across the throw.
+Virtual types and casts are not required for a basic exception-system but are
+useful for advanced exception features. However, \CFA is not object-oriented so
+there is no obvious concept of virtuals. Hence, to create advanced exception
+features for this work, I needed to design and implement a virtual-like
+system for \CFA.
+% NOTE: Maybe we should put less of the rationale here.
+Object-oriented languages often organize exceptions into a simple hierarchy,
+\eg Java.
+\begin{center}
+\setlength{\unitlength}{4000sp}%
+\begin{picture}(1605,612)(2011,-1951)
+\put(2100,-1411){\vector(1, 0){225}}
+\put(3450,-1411){\vector(1, 0){225}}
+\put(3550,-1411){\line(0,-1){225}}
+\put(3550,-1636){\vector(1, 0){150}}
+\put(3550,-1636){\line(0,-1){225}}
+\put(3550,-1861){\vector(1, 0){150}}
+\put(2025,-1490){\makebox(0,0)[rb]{\LstBasicStyle{exception}}}
+\put(2400,-1460){\makebox(0,0)[lb]{\LstBasicStyle{arithmetic}}}
+\put(3750,-1460){\makebox(0,0)[lb]{\LstBasicStyle{underflow}}}
+\put(3750,-1690){\makebox(0,0)[lb]{\LstBasicStyle{overflow}}}
+\put(3750,-1920){\makebox(0,0)[lb]{\LstBasicStyle{zerodivide}}}
+\end{picture}%
+\end{center}
+The hierarchy provides the ability to handle an exception at different degrees
+of specificity (left to right). Hence, it is possible to catch a more general
+exception-type in higher-level code where the implementation details are
+unknown, which reduces tight coupling to the lower-level implementation.
+Otherwise, low-level code changes require higher-level code changes, \eg,
+changing from raising @underflow@ to @overflow@ at the low level means changing
+the matching catch at the high level versus catching the general @arithmetic@
+exception. In detail, each virtual type may have a parent and can have any
+number of children. A type's descendants are its children and its children's
+descendants. A type may not be its own descendant.
+The exception hierarchy allows a handler (@catch@ clause) to match multiple
+exceptions, \eg a base-type handler catches both base and derived
+exception-types.
+\begin{cfa}
+try {
+        ...
+} catch(arithmetic &) {
+        ... // handle arithmetic, underflow, overflow, zerodivide
+}
+\end{cfa}
+Most exception mechanisms perform a linear search of the handlers and select
+the first matching handler, so the order of handlers is now important because
+matching is many to one.
+Each virtual type needs an associated virtual table. A virtual table is a
+structure with fields for all the virtual members of a type. A virtual type has
+all the virtual members of its parent and can add more. It may also update the
+values of the virtual members and often does.
+While much of the virtual infrastructure is created, it is currently only used
+internally for exception handling. The only user-level feature is the virtual
+cast, which is the same as the \Cpp \lstinline[language=C++]|dynamic_cast|.
+\label{p:VirtualCast}
+\begin{cfa}
 (virtual TYPE)EXPRESSION
+\end{lstlisting}
+This has the same precedence as a traditional C-cast and can be used in the
+same places. This will convert the result of EXPRESSION to the type TYPE. Both
+the type of EXPRESSION and TYPE must be pointers to virtual types.
+The cast is checked and will either return the original value or null, based
+on the result of the check. The check is whether the object pointed at has a
+type that is a descendant of the target type. If it does, the result is the
+pointer; otherwise the result is null.
+\section{Exceptions}
+\end{cfa}
+Note, the syntax and semantics matches a C-cast, rather than the function-like
+\Cpp syntax for special casts. Both the type of @EXPRESSION@ and @TYPE@ must be
+a pointer to a virtual type.
+The cast dynamically checks if the @EXPRESSION@ type is the same or a subtype
+of @TYPE@, and if true, returns a pointer to the
+@EXPRESSION@ object, otherwise it returns @0p@ (null pointer).
+\section{Exception}
 % Leaving until later, hopefully it can talk about actual syntax instead
 % of my many strange macros. Syntax aside I will also have to talk about the
 % features all exceptions support.
+\section{Termination}
+Termination exception throws are likely the most familiar kind, as they are
+used in several popular programming languages. A termination will throw an
+exception, search the stack for a handler, unwind the stack to where the
+handler is defined, execute the handler and then continue execution after
+the handler. They are used when execution cannot continue here.
+Termination has two pieces of syntax it uses. The first is the throw:
+\begin{lstlisting}
+Exceptions are defined by the trait system; there are a series of traits, and
+if a type satisfies them, then it can be used as an exception. The following
+is the base trait all exceptions need to match.
+\begin{cfa}
+trait is_exception(exceptT &, virtualT &) {
+        virtualT const & get_exception_vtable(exceptT *);
+};
+\end{cfa}
+The trait is defined over two types, the exception type and the virtual table
+type. This should be one-to-one, each exception type has only one virtual
+table type and vice versa. The only assertion in the trait is
+@get_exception_vtable@, which takes a pointer of the exception type and
+returns a reference to the virtual table type instance.
+The function @get_exception_vtable@ is actually a constant function.
+Regardless of the value passed in (including the null pointer) it should
+return a reference to the virtual table instance for that type.
+The reason it is a function instead of a constant is that it makes type
+annotations easier to write as you can use the exception type instead of the
+virtual table type; which usually has a mangled name.
+% Also \CFA's trait system handles functions better than constants and doing
+% it this way reduce the amount of boiler plate we need.
+% I did have a note about how it is the programmer's responsibility to make
+% sure the function is implemented correctly. But this is true of every
+% similar system I know of (except Agda's I guess) so I took it out.
+There are two more traits for exceptions @is_termination_exception@ and
+@is_resumption_exception@. They are defined as follows:
+\begin{cfa}
+trait is_termination_exception(
+                exceptT &, virtualT & | is_exception(exceptT, virtualT)) {
+        void defaultTerminationHandler(exceptT &);
+};
+trait is_resumption_exception(
+                exceptT &, virtualT & | is_exception(exceptT, virtualT)) {
+        void defaultResumptionHandler(exceptT &);
+};
+\end{cfa}
+In other words they make sure that a given type and virtual type is an
+exception and defines one of the two default handlers. These default handlers
+are used in the main exception handling operations \see{Exception Handling}
+and their use will be detailed there.
+However all three of these traits can be tricky to use directly.
+There is a bit of repetition required but
+the largest issue is that the virtual table type is mangled and not in a user
+facing way. So there are three macros that can be used to wrap these traits
+when you need to refer to the names:
+@IS_EXCEPTION@, @IS_TERMINATION_EXCEPTION@ and @IS_RESUMPTION_EXCEPTION@.
+All take one or two arguments. The first argument is the name of the
+exception type. Its unmangled and mangled form are passed to the trait.
+The second (optional) argument is a parenthesized list of polymorphic
+arguments. This argument should only be used with polymorphic exceptions and the
+list will be passed to both types.
+In the current set-up the base name and the polymorphic arguments have to
+match so these macros can be used without losing flexibility.
+For example consider a function that is polymorphic over types that have a
+defined arithmetic exception:
+\begin{cfa}
+forall(Num | IS_EXCEPTION(Arithmetic, (Num)))
+void some_math_function(Num & left, Num & right);
+\end{cfa}
+\section{Exception Handling}
+\CFA provides two kinds of exception handling, termination and resumption.
+These twin operations are the core of the exception handling mechanism and
+are the reason for the features of exceptions.
+This section will cover the general patterns shared by the two operations and
+then go on to cover the details of each individual operation.
+Both operations follow the same set of steps to do their operation. They both
+start with the user performing a throw on an exception.
+Then there is the search for a handler, if one is found then the exception
+is caught and the handler is run. After that control returns to normal
+execution.
+If the search fails a default handler is run and then control
+returns to normal execution immediately. That is where the default handlers
+@defaultTerminationHandler@ and @defaultResumptionHandler@ are used.
+\subsection{Termination}
+\label{s:Termination}
+Termination handling is the more familiar kind and used in most programming
+languages with exception handling.
+It is a dynamic, non-local goto. If a throw is successful then the stack will
+be unwound and control will (usually) continue in a different function on
+the call stack. They are commonly used when an error has occurred and recovery
+is impossible in the current function.
+% (usually) Control can continue in the current function but then a different
+% control flow construct should be used.
+A termination throw is started with the @throw@ statement:
+\begin{cfa}
 throw EXPRESSION;
+\end{lstlisting}
+The expression must evaluate to a reference to a termination exception. A
+termination exception is any exception with a
+\codeCFA{void defaultTerminationHandler(T &);} (the default handler) defined
+on it. The handler is taken from the call site with \CFA's trait system and
+passed into the exception system along with the exception itself.
+The exception passed into the system is then copied into managed memory.
+This is to ensure it remains in scope during unwinding. It is the user's
+responsibility to make sure the original exception is freed when it goes out
+of scope. Being allocated on the stack is sufficient for this.
+Then the exception system will search the stack starting from the throw and
+proceeding towards the base of the stack, from callee to caller. As it goes
+it will check any termination handlers it finds:
+\begin{lstlisting}
+try {
+    TRY_BLOCK
+} catch (EXCEPTION_TYPE * NAME) {
+    HANDLER
+}
+\end{lstlisting}
+This shows a try statement with a single termination handler. The statements
+in TRY\_BLOCK will be executed when control reaches this statement. While
+those statements are being executed if a termination exception is thrown and
+it is not handled by a try statement further up the stack the EHM will check
+all of the termination handlers attached to the try block, top to bottom.
+At each handler the EHM will check to see if the thrown exception is a
+descendant of EXCEPTION\_TYPE. If it is the pointer to the exception is
+bound to NAME and the statements in HANDLER are executed. If control reaches
+the end of the handler then it exits the block, the exception is freed and
+control continues after the try statement.
+The default handler is only used if no handler for the exception is found
+after the entire stack is searched. When that happens the default handler
+is called with a reference to the exception as its only argument. If the
+handler returns control continues from after the throw statement.
+\paragraph{Conditional Catches}
+Catch clauses may also be written as:
+\begin{lstlisting}
+\end{cfa}
+The expression must return a reference to a termination exception, where the
+termination exception is any type that satisfies @is_termination_exception@
+at the call site.
+Through \CFA's trait system the functions in the traits are passed into the
+throw code. A new @defaultTerminationHandler@ can be defined in any scope to
+change the throw's behavior (see below).
+The throw will copy the provided exception into managed memory. It is the
+user's responsibility to ensure the original exception is cleaned up if the
+stack is unwound (allocating it on the stack should be sufficient).
+Then the exception system searches the stack using the copied exception.
+It starts from the throw and proceeds to the base of the stack,
+from callee to caller.
+At each stack frame, a check is made for termination handlers defined by the
+@catch@ clauses of a @try@ statement.
+\begin{cfa}
+try {
+        GUARDED_BLOCK
+} catch (EXCEPTION_TYPE$\(_1\)$ * NAME$\(_1\)$) {
+        HANDLER_BLOCK$\(_1\)$
+} catch (EXCEPTION_TYPE$\(_2\)$ * NAME$\(_2\)$) {
+        HANDLER_BLOCK$\(_2\)$
+}
+\end{cfa}
+When viewed on its own a try statement will simply execute the statements in
+@GUARDED_BLOCK@ and when those are finished the try statement finishes.
+However, while the guarded statements are being executed, including any
+functions they invoke, all the handlers following the try block are now
+or any functions invoked from those
+statements, throws an exception, and the exception
+is not handled by a try statement further up the stack, the termination
+handlers are searched for a matching exception type from top to bottom.
+Exception matching checks the representation of the thrown exception-type is
+the same or a descendant type of the exception types in the handler clauses. If
+it is the same as or a descendant of @EXCEPTION_TYPE@$_i$ then @NAME@$_i$ is
+bound to a pointer to the exception and the statements in @HANDLER_BLOCK@$_i$
+are executed. If control reaches the end of the handler, the exception is
+freed and control continues after the try statement.
+If no handler is found during the search then the default handler is run.
+Through \CFA's trait system the best match at the throw site will be used.
+This function is run and is passed the copied exception. After the default
+handler is run control continues after the throw statement.
+There is a global @defaultTerminationHandler@ that cancels the current stack
+with the copied exception. However it is generic over all exception types so
+new default handlers can be defined for different exception types and so
+different exception types can have different default handlers.
+\subsection{Resumption}
+\label{s:Resumption}
+Resumption exception handling is a less common form than termination but is
+just as old~\cite{Goodenough75} and is in some sense simpler.
+It is a dynamic, non-local function call. If the throw is successful a
+closure will be taken from up the stack and executed, after which the throwing
+function will continue executing.
+These are most often used when an error occurred and if the error is repaired
+then the function can continue.
+A resumption raise is started with the @throwResume@ statement:
+\begin{cfa}
+throwResume EXPRESSION;
+\end{cfa}
+The semantics of the @throwResume@ statement are like the @throw@, but the
+expression has to return a reference to a type that satisfies the trait
+@is_resumption_exception@. The assertions from this trait are available to
+the exception system while handling the exception.
+At runtime, no copies are made. As the stack is not unwound the exception and
+any values on the stack will remain in scope while the resumption is handled.
+Then the exception system searches the stack using the provided exception.
+It starts from the throw and proceeds to the base of the stack,
+from callee to caller.
+At each stack frame, a check is made for resumption handlers defined by the
+@catchResume@ clauses of a @try@ statement.
+\begin{cfa}
+try {
+        GUARDED_BLOCK
+} catchResume (EXCEPTION_TYPE$\(_1\)$ * NAME$\(_1\)$) {
+        HANDLER_BLOCK$\(_1\)$
+} catchResume (EXCEPTION_TYPE$\(_2\)$ * NAME$\(_2\)$) {
+        HANDLER_BLOCK$\(_2\)$
+}
+\end{cfa}
+If the handlers are not involved in a search this will simply execute the
+@GUARDED_BLOCK@ and then continue to the next statement.
+Its purpose is to add handlers onto the stack.
+(Note, termination and resumption handlers may be intermixed in a @try@
+statement but the kind of throw must be the same as the handler for it to be
+considered as a possible match.)
+If a search for a resumption handler reaches a try block it will check each
+@catchResume@ clause, top-to-bottom.
+At each handler if the thrown exception is or is a child type of
+@EXCEPTION_TYPE@$_i$ then a pointer to the exception is bound to
+@NAME@$_i$ and then @HANDLER_BLOCK@$_i$ is executed. After the block is
+finished control will return to the @throwResume@ statement.
+Like termination, if no resumption handler is found, the default handler
+visible at the throw statement is called. It will use the best match at the
+call site according to \CFA's overloading rules. The default handler is
+passed the exception given to the throw. When the default handler finishes
+execution continues after the throw statement.
+There is a global @defaultResumptionHandler@ that is polymorphic over all
+termination exceptions and performs a termination throw on the exception.
+The @defaultTerminationHandler@ for that throw is matched at the original
+throw statement (the resumption @throwResume@) and it can be customized by
+introducing a new or better match as well.
+% \subsubsection?
+A key difference between resumption and termination is that resumption does
+not unwind the stack. A side effect is that when a handler is matched
+and run its try block (the guarded statements) and every try statement
+searched before it are still on the stack. This can lead to the recursive
+resumption problem.
+The recursive resumption problem is any situation where a resumption handler
+ends up being called while it is running.
+Consider a trivial case:
+\begin{cfa}
+try {
+        throwResume (E &){};
+} catchResume(E *) {
+        throwResume (E &){};
+}
+\end{cfa}
+When this code is executed the guarded @throwResume@ will throw, start a
+search and match the handler in the @catchResume@ clause. This will be
+called and placed on the stack on top of the try-block. The second throw then
+throws and will search the same try block and call another instance of the
+same handler leading to an infinite loop.
+This situation is trivial and easy to avoid, but much more complex cycles
+can form with multiple handlers and different exception types.
+To prevent all of these cases we mask sections of the stack, or equivalently
+the try statements on the stack, so that the resumption search skips over
+them and continues with the next unmasked section of the stack.
+A section of the stack is marked when it is searched to see if it contains
+a handler for an exception and unmarked when that exception has been handled
+or the search was completed without finding a handler.
+% This might need a diagram. But it is an important part of the justification
+% of the design of the traversal order.
+\begin{verbatim}
+       throwResume2 ----------.
+            |                 |
+ generated from handler       |
+            |                 |
+         handler              |
+            |                 |
+        throwResume1 -----.   :
+            |             |   :
+           try            |   : search skip
+            |             |   :
+        catchResume  <----'   :
+            |                 |
+\end{verbatim}
+The rules can be remembered as thinking about what would be searched in
+termination. So when a throw happens in a handler; a termination handler
+skips everything from the original throw to the original catch because that
+part of the stack has been unwound, a resumption handler skips the same
+section of stack because it has been masked.
+A throw in a default handler will perform the same search as the original
+throw because; for termination nothing has been unwound, for resumption
+the mask will be the same.
+The symmetry with termination is why this pattern was picked. Other patterns,
+such as marking just the handlers that caught, also work but lack the
+symmetry which means there is more to remember.
+\section{Conditional Catch}
+Both termination and resumption handler clauses can be given an additional
+condition to further control which exceptions they handle:
+\begin{cfa}
 catch (EXCEPTION_TYPE * NAME ; CONDITION)
+\end{lstlisting}
+This has the same behaviour as a regular catch clause except that if the
+exception matches the given type the condition is also run. If the result is
+true only then is this considered a matching handler. If the result is false
+then the handler does not match and the search continues with the next clause
+in the try block.
+The condition considers all names in scope at the beginning of the try block
+to be in scope along with the name introduced in the catch clause itself.
+\paragraph{Re-Throwing}
+You can also rethrow the most recent termination exception with
+\codeCFA{throw;}. % This is terrible and you should never do it.
+This can be done in a handler or any function that could be called from a
+handler.
+This will start another termination throw reusing the exception, meaning it
+does not copy the exception or allocate any more memory for it. However the
+default handler is still at the original throw and could refer to data that
+was on the unwound section of the stack. So instead a new default handler that
+does a program level abort is used.
+\section{Resumption}
+Resumption exceptions are less popular than termination but in many
+regards are simpler and easier to understand. A resumption throws an exception,
+searches for a handler on the stack, executes that handler on top of the stack
+and then continues execution from the throw. These are used when a problem
+needs to be fixed before execution continues.
+A resumption is thrown with a throw resume statement:
+\begin{lstlisting}
+throwResume EXPRESSION;
+\end{lstlisting}
+The result of EXPRESSION must be a resumption exception type. A resumption
+exception type is any type that satisfies the assertion
+\codeCFA{void defaultResumptionHandler(T &);} (the default handler). When the
+statement is executed the expression is evaluated and the result is thrown.
+Handlers are declared using clauses in try statements:
+\begin{lstlisting}
+try {
+    TRY_BLOCK
+} catchResume (EXCEPTION_TYPE * NAME) {
+    HANDLER
+}
+\end{lstlisting}
+This is a simple example with the try block and a single resumption handler.
+Multiple resumption handlers can be put in a try statement and they can be
+mixed with termination handlers.
+When a resumption begins it will start searching the stack starting from
+the throw statement and working its way to the callers. In each try statement
+handlers will be tried top to bottom. Each handler is checked by seeing if
+the thrown exception is a descendant of EXCEPTION\_TYPE. If not the search
+continues. Otherwise NAME is bound to a pointer to the exception and the
+HANDLER statements are executed. After they are finished executing control
+continues from the throw statement.
+If no appropriate handler is found then the default handler is called. The
+throw statement acts as a regular function call passing the exception to
+the default handler and after the handler finishes executing control continues
+from the throw statement.
+The exception system also tracks the position of a search on the stack. If
+another resumption exception is thrown while a resumption handler is running
+it will first check handlers pushed to the stack by the handler and any
+functions it called, then it will continue from the try statement that the
+handler is a part of; except for the default handler where it continues from
+the throw the default handler was passed to.
+This makes the search pattern for resumption reflect the one for termination,
+which is what most users expect.
+% This might need a diagram. But it is an important part of the justification
+% of the design of the traversal order.
+It also avoids the recursive resumption problem. If the entire stack is
+searched loops of resumption can form. Consider a handler that handles an
+exception of type A by resuming an exception of type B and on the same stack,
+later in the search path, is a second handler that handles B by resuming A.
+Assuming no other handlers on the stack handle A or B then in either traversal
+system an A resumed from the top of the stack will be handled by the first
+handler. A B resumed from the top or from the first handler will be handled
+by the second handler. The only difference is when A is thrown from the second
+handler. The entire stack search will call the first handler again, creating a
+loop. Starting from the position in the stack though will break this loop.
+\paragraph{Conditional Catches}
+Resumption supports conditional catch clauses like termination does. They
+use the same syntax except the keyword is changed:
+\begin{lstlisting}
+catchResume (EXCEPTION_TYPE * NAME ; CONDITION)
+\end{lstlisting}
+It also has the same behaviour, after the exception type has been matched
+with the EXCEPTION\_TYPE the CONDITION is evaluated with NAME in scope. If
+the result is true then the handler is run, otherwise the search continues
+just as if there had been a type mismatch.
+\paragraph{Re-Throwing}
+You may also re-throw resumptions with a \codeCFA{throwResume;} statement.
+This can only be done from inside of a \codeCFA{catchResume} block.
+Outside of any side effects of any code already run in the handler this will
+have the same effect as if the exception had not been caught in the first
+place.
+\end{cfa}
+First, the same semantics is used to match the exception type. Second, if the
+exception matches, @CONDITION@ is executed. The condition expression may
+reference all names in scope at the beginning of the try block and @NAME@
+introduced in the handler clause. If the condition is true, then the handler
+matches. Otherwise, the exception search continues as if the exception type
+did not match.
+\begin{cfa}
+try {
+        f1 = open( ... );
+        f2 = open( ... );
+        ...
+} catch( IOFailure * f ; fd( f ) == f1 ) {
+        // only handle IO failure for f1
+}
+\end{cfa}
+Note, catching @IOFailure@, checking for @f1@ in the handler, and reraising the
+exception if not @f1@ is different because the reraise does not examine any of
+the remaining handlers in the current try statement.
+\section{Rethrowing}
+\colour{red}{From Andrew: I recommend we talk about why the language doesn't
+have rethrows/reraises instead.}
+\label{s:Rethrowing}
+Within the handler block or functions called from the handler block, it is
+possible to reraise the most recently caught exception with @throw@ or
+@throwResume@, respectively.
+\begin{cfa}
+try {
+        ...
+} catch( ... ) {
+        ... throw;
+} catchResume( ... ) {
+        ... throwResume;
+}
+\end{cfa}
+The only difference between a raise and a reraise is that reraise does not
+create a new exception; instead it continues using the current exception, \ie
+no allocation and copy. However the default handler is still set to the one
+visible at the raise point, and hence, for termination could refer to data that
+is part of an unwound stack frame. To prevent this problem, a new default
+handler is generated that does a program-level abort.
 \section{Finally Clauses}
+A \codeCFA{finally} clause may be placed at the end of a try statement after
+all the handler clauses. In the simple case, with no handlers, it looks like
+this:
+\begin{lstlisting}
+try {
+    TRY_BLOCK
+} finally {
+    FINAL_STATEMENTS
+}
+\end{lstlisting}
+Any number of termination handlers and resumption handlers may precede the
+finally clause.
+The FINAL\_STATEMENTS, the finally block, are executed whenever the try
+statement is removed from the stack. This includes: the TRY\_BLOCK finishes
+executing, a termination exception finishes executing and the stack unwinds.
+Execution of the finally block should finish by letting control run off
+the end of the block. This is because after the finally block is complete
+control will continue to where ever it would if the finally clause was not
+present.
+Because of this local control flow out of the finally block is forbidden.
+The compiler rejects uses of \codeCFA{break}, \codeCFA{continue},
+\codeCFA{fallthru} and \codeCFA{return} that would cause control to leave
+the finally block. Other ways to leave the finally block - such as a long
+jump or termination - are much harder to check, at best requiring additional
+runtime overhead, and so are merely discouraged.
+Finally clauses are used to perform unconditional clean-up when leaving a
+scope. They are placed at the end of a try statement:
+\begin{cfa}
+try {
+        GUARDED_BLOCK
+} ... // any number or kind of handler clauses
+... finally {
+        FINALLY_BLOCK
+}
+\end{cfa}
+The @FINALLY_BLOCK@ is executed when the try statement is removed from the
+stack, including when the @GUARDED_BLOCK@ finishes, any termination handler
+finishes or during an unwind.
+The only time the block is not executed is if the program is exited before
+the stack is unwound.
+Execution of the finally block should always finish, meaning control runs off
+the end of the block. This requirement ensures control always continues as if the
+finally clause is not present, \ie finally is for cleanup not changing control
+flow. Because of this requirement, local control flow out of the finally block
+is forbidden. The compiler precludes any @break@, @continue@, @fallthru@ or
+@return@ that causes control to leave the finally block. Other ways to leave
+the finally block, such as a long jump or termination are much harder to check,
+and at best require additional run-time overhead, and so are merely
+discouraged.
+Not all languages with exceptions have finally clauses. Notably \Cpp does
+without it as destructors serve a similar role. Although destructors and
+finally clauses can be used in many of the same areas they have their own
+use cases like top-level functions and lambda functions with closures.
+Destructors take a bit more work to set up but are much easier to reuse while
+finally clauses are good for once offs and can include local information.
 \section{Cancellation}
+Cancellation can be thought of as a stack-level abort or as an uncatchable
+termination. It unwinds the entirety of the current stack and if possible
+passes an exception to a different stack as a message.
+There is no special statement for starting a cancellation, instead you call
+the standard library function \codeCFA{cancel\_stack} which takes an exception.
+Unlike in a throw this exception is not used in control flow but is just there
+to pass information about why the cancellation happened.
+The handler is decided entirely by which stack is being cancelled. There are
+three handlers that apply to three different groups of stacks:
+\begin{itemize}
+\item Main Stack:
+The main stack is the one on which the program main is called at the beginning
+of your program. It is also the only stack you have without the libcfathreads.
+Because of this there is no other stack ``above'' (or possibly at all) for main
+to notify when a cancellation occurs. So after the stack is unwound we do a
+program level abort.
+\item Thread Stack:
+Thread stacks are those created with \codeCFA{thread} or otherwise satisfy the
+\codeCFA{is\_thread} trait.
+Threads only have two structural points of communication that must happen,
+start and join. As the thread must be running to perform a cancellation it
+will be after start and before join, so join is the one cancellation uses.
+After the stack is unwound the thread will halt as if it had completed normally
+and wait for another thread to join with it. The other thread, when it joins,
+checks for a cancellation. If so it will throw the resumption exception
+\codeCFA{ThreadCancelled}.
+There is a difference here in how explicit joins (with the \codeCFA{join}
+function) and implicit joins (from a destructor call) behave. Explicit joins will
+take the default handler (\codeCFA{defaultResumptionHandler}) from the context
+and use it like a regular throw does if the exception is not caught. The
+implicit join does a program abort instead.
+This is for safety. One of the big problems in exceptions is you cannot handle
+two terminations or cancellations on the same stack as either can destroy the
+context required for the other. This can happen with join but as the
+destructors will always be run when the stack is being unwound and one
+termination/cancellation is already active. Also since they are implicit they
+are easier to forget about.
+\item Coroutine Stack:
+Coroutine stacks are those created with \codeCFA{coroutine} or otherwise
+satisfy the \codeCFA{is\_coroutine} trait.
+A coroutine knows of two other coroutines, its starter and its last resumer.
+The last resumer is ``closer'' so that is the one notified.
+After the stack is unwound control goes to the last resumer.
+Resume will resume throw a \codeCFA{CoroutineCancelled} exception, which is
+polymorphic over the coroutine type and has a pointer to the coroutine being
+cancelled and the cancelling exception. The resume function also has an
+assertion for the \codeCFA{defaultResumptionHandler} for the exception. So it
+will use the default handler like a regular throw.
+\end{itemize}
+Cancellation is a stack-level abort, which can be thought of as an
+uncatchable termination. It unwinds the entirety of the current stack, and if
+possible forwards the cancellation exception to a different stack.
+Cancellation is not an exception operation like termination or resumption.
+There is no special statement for starting a cancellation; instead the standard
+library function @cancel_stack@ is called passing an exception. Unlike a
+throw, this exception is not used in matching, only to pass information about
+the cause of the cancellation.
+(This also means matching cannot fail so there is no default handler either.)
+After @cancel_stack@ is called the exception is copied into the exception
+handling mechanism's memory. Then the entirety of the current stack is
+unwound. After that it depends on which stack is being cancelled.
+\begin{description}
+\item[Main Stack:]
+The main stack is the one used by the program main at the start of execution,
+and is the only stack in a sequential program. Even in a concurrent program
+the main stack is only dependent on the environment that started the program.
+Hence, when the main stack is cancelled there is nowhere else in the program
+to notify. After the stack is unwound, there is a program-level abort.
+\item[Thread Stack:]
+A thread stack is created for a @thread@ object or object that satisfies the
+@is_thread@ trait. A thread only has two points of communication that must
+happen: start and join. As the thread must be running to perform a
+cancellation, it must occur after start and before join, so join is used
+for communication here.
+After the stack is unwound, the thread halts and waits for
+another thread to join with it. The joining thread checks for a cancellation,
+and if present, resumes exception @ThreadCancelled@.
+There is a subtle difference between the explicit join (@join@ function) and
+implicit join (from a destructor call). The explicit join takes the default
+handler (@defaultResumptionHandler@) from its calling context, which is used if
+the exception is not caught. The implicit join does a program abort instead.
+This semantics is for safety. If an unwind is triggered while another unwind
+is underway only one of them can proceed as they both want to ``consume'' the
+stack. Letting both try to proceed leads to very undefined behaviour.
+Both termination and cancellation involve unwinding and, since the default
+@defaultResumptionHandler@ performs a termination that could more easily
+happen in an implicit join inside a destructor. So there is an error message
+and an abort instead.
+\todo{Perhaps have a more general discussion of unwind collisions before
+this point.}
+The recommended way to avoid the abort is to handle the initial resumption
+from the implicit join. If required you may put an explicit join inside a
+finally clause to disable the check and use the local
+@defaultResumptionHandler@ instead.
+\item[Coroutine Stack:] A coroutine stack is created for a @coroutine@ object
+or object that satisfies the @is_coroutine@ trait. A coroutine only knows of
+two other coroutines, its starter and its last resumer. Of the two the last
+resumer has the tightest coupling to the coroutine it activated and the most
+up-to-date information.
+Hence, cancellation of the active coroutine is forwarded to the last resumer
+after the stack is unwound. When the resumer restarts, it resumes exception
+@CoroutineCancelled@, which is polymorphic over the coroutine type and has a
+pointer to the cancelled coroutine.
+The resume function also has an assertion for the @defaultResumptionHandler@
+for the exception. So it will use the default handler like a regular throw.
+\end{description}

doc/theses/andrew_beach_MMath/future.tex

-              r342af53
+              r8e4aa05
 \chapter{Future Work}
+\section{Language Improvements}
+\CFA is a developing programming language. As such, there are partially or
+unimplemented features of the language (including several broken components)
+that I had to work around while building an exception handling system largely in
+the \CFA language (some C components).  The following are a few of these
+issues, and once implemented/fixed, how this would affect the exception system.
+\begin{itemize}
+\item
+The implementation of termination is not portable because it includes
+hand-crafted assembly statements. These sections must be ported by hand to
+support more hardware architectures, such as the ARM processor.
+\item
+Due to a type-system problem, the catch clause cannot bind the exception to a
+reference instead of a pointer. Since \CFA has a very general reference
+capability, programmers will want to use it. Once fixed, this capability should
+result in little or no change in the exception system.
+\item
+Termination handlers cannot use local control-flow transfers, \eg by @break@,
+@return@, \etc. The reason is that current code generation hoists a handler
+into a nested function for convenience (versus assembly-code generation at the
+@try@ statement). Hence, when the handler runs, its code is not in the lexical
+scope of the @try@ statement, where the local control-flow transfers are
+meaningful.
+\item
+There is no detection of colliding unwinds. It is possible for clean-up code
+run during an unwind to trigger another unwind that escapes the clean-up code
+itself; such as a termination exception caught further down the stack or a
+cancellation. There do exist ways to handle this but currently they are not
+even detected and the first unwind will simply be forgotten, often leaving
+it in a bad state.
+\item
+Also the exception system did not have a lot of time to be tried and tested.
+So just letting people use the exception system more will reveal new
+quality of life upgrades that can be made with time.
+\end{itemize}
 \section{Complete Virtual System}
 The virtual system should be completed. It was never supposed to be part of
+this project and so minimal work was done on it. A draft of what the complete
+system might look like was created but it was never finalized or implemented.
+A future project in \CFA would be to complete that work and to update the
 parts of the exception system that use the current version.
+The virtual system should be completed. It was not supposed to be part of this
+project, but was thrust upon it to do exception inheritance; hence, only
+minimal work was done. A draft for a complete virtual system is available but
+it is not finalized.  A future \CFA project is to complete that work and then
+update the exception system that uses the current version.
+For instance a full virtual system would probably allow for several
+improvements to the exception traits. Although they do currently work they
+could be made easier to use by making the virtual table type implicit in the
+trait (which would remove the need for those wrapper macros) or allowing
+for assertions that give the layout of a virtual table for safety.
+There are several improvements to the virtual system that would improve the
+exception traits. The most important one is an assertion to check one virtual
+type is a child of another. This check precisely captures many of the
+correctness requirements.
+\section{Additional Throws}
+Several other kinds of throws, beyond the termination throw (\codeCFA{throw}),
+the resumption throw (\codeCFA{throwResume}) and the re-throws, were considered.
+None were as useful as the core throws but they would likely be worth
+revising.
+The full virtual system might also include other improvements like associated
+types to allow traits to refer to types not listed in their header. This
+feature allows exception traits to not refer to the virtual-table type
+explicitly, removing the need for the current interface macros.
+The first ones are throws for asynchronous exceptions, throwing exceptions
+from one stack to another. These act like signals allowing for communication
+between the stacks. This is usually used with resumption as it allows the
+target stack to continue execution normally after the exception has been
+handled.
+\section{Additional Raises}
+Several other kinds of exception raises were considered beyond termination
+(@throw@), resumption (@throwResume@), and reraise.
+This would require much more coordination between the concurrency system and the
+exception system to handle. Most of the interesting design decisions around
+applying asynchronous exceptions appear to be around masking (controlling
+which exceptions may be thrown at a stack). It would likely require more of
+the virtual system and would also affect how default handlers are set.
+The first is a non-local/concurrent raise providing asynchronous exceptions,
+\ie raising an exception on another stack. This semantics acts like signals
+allowing for out-of-band communication among coroutines and threads. This kind
+of raise is often restricted to resumption to allow the target stack to
+continue execution normally after the exception has been handled. That is,
+allowing one coroutine/thread to unwind the stack of another via termination is
+bad software engineering.
+The other throws were designed to mimic bidirectional algebraic effects.
+Algebraic effects are used in some functional languages and allow a function
+Non-local/concurrent raises require more coordination between the concurrency system
+and the exception system. Many of the interesting design decisions centre
+around masking (controlling which exceptions may be thrown at a stack). It
+would likely require more of the virtual system and would also affect how
+default handlers are set.
+Other raises were considered to mimic bidirectional algebraic effects.
+Algebraic effects are used in some functional languages allowing one function
 to have another function on the stack resolve an effect (which is defined with
+a function-like interface).
+These can be mimicked with resumptions and the new throws were designed
+to try and mimic bidirectional algebraic effects, where control can go back
+and forth between the function effect caller and handler while the effect
+is underway.
+a functional-like interface).  This semantics can be mimicked with resumptions
+and new raises were discussed to mimic bidirectional algebraic-effects, where
+control can go back and forth between the function-effect caller and handler
+while the effect is underway.
 % resume-top & resume-reply
+These raises would be like the resumption raise except using different search
+patterns to find the handler.
+These throws would likely be just like the resumption throw except they would
+use different search patterns to find the handler to reply to.
+\section{Zero-Cost Try}
+\CFA does not have zero-cost try-statements because the compiler generates C
+code rather than assembler code \see{\VPageref{p:zero-cost}}. When the compiler
+does create its own assembly (or LLVM byte-code), then zero-cost try-statements
+are possible. The downside of zero-cost try-statements is the LSDA complexity,
+its size (program bloat), and the high cost of raising an exception.
+\section{Zero-Cost Exceptions}
+\CFA does not have zero-cost exceptions because it does not generate assembly
+but instead generates C code. See the implementation section. When the
+compiler does start to create its own assembly (or LLVM byte code) then
+zero-cost exceptions could be implemented.
+Alternatively, some research could be done into the simpler alternative method
+with a non-zero-cost try-statement but much lower cost exception raise. For
+example, programs are starting to use exceptions in the normal control path, so
+more exceptions are thrown. In these cases, the cost balance switches towards
+low-cost raise. Unfortunately, while exceptions remain exceptional, the
+libunwind model will probably remain the most effective option.
+Now in zero-cost exceptions the only part that is zero-cost are the try
+blocks. Some research could be done into the alternative methods for systems
+that expect a lot more exceptions to be thrown, allowing some overhead in
+entering and leaving try blocks to make throws faster. But while exceptions
+remain exceptional the libunwind model will probably remain the most effective
+option.
+Zero-cost resumptions is still an open problem. First, because libunwind does
+not support a successful-exiting stack-search without doing an unwind.
+Workarounds are possible but awkward. Ideally an extension to libunwind could
+be made, but that would either require separate maintenance or gain enough
+support to have it folded into the standard.
+Zero-cost resumptions have more problems to solve. First because libunwind
+does not support a successful exiting stack search without doing an unwind.
+There are several ways to hack that functionality in. Ideally an extension to
+libunwind could be made, but that would either require separate maintenance
+or gain enough support to have it folded into the standard.
+Also new techniques to skip previously searched parts of the stack need to be
+developed to handle the recursive resume problem and support advanced algebraic
+effects.
+Also new techniques to skip previously searched parts of the stack will have
+to be developed.
+\section{Signal Exceptions}
+Goodenough~\cite{Goodenough75} suggests three types of exceptions: escape,
+notify and signal.  Escape are termination exceptions, notify are resumption
+exceptions, leaving signal unimplemented.
+\section{Support for More Platforms}
+Termination is not portable because it is implemented with inline assembly.
+Those sections will have to be rewritten to support different architectures
+A signal exception allows either behaviour, \ie after an exception is handled,
+the handler has the option of returning to the raise or after the @try@
+statement. Currently, \CFA fixes the semantics of the handler return
+syntactically by the @catch@ or @catchResume@ clause.
+\section{Quality-of-Life Improvements}
+Finally come various improvements to the usability of \CFA. Most of these
+would just require time. Time that would not lead to interesting research so
+it has been left aside for now. A few examples are included here but there
+are more:
+Signal exception should be reexamined and possibly be supported in \CFA. A very
+direct translation is to have a new raise and catch pair, and a new statement
+(or statements) would indicate if the handler returns to the raise or continues
+where it is; but there may be other options.
+\begin{itemize}
+\item Allowing an exception handler to bind the exception to a reference instead
+of a pointer. This should actually result in no change in behaviour so there
+is no reason not to allow it. It is however a small improvement; giving a bit
+of flexibility to the user in what style they want to use.
+\item Enabling local control flow (by \codeCFA{break}, \codeCFA{return} and
+similar statements) out of a termination handler. The current set-up makes
+this very difficult but the catch function that runs the handler after it has
+been matched could be inlined into the function's body, which would make this
+much easier. (To do the same for try blocks would probably wait for zero-cost
+exceptions, which would allow the try block to be inlined as well.)
+\item Enabling local control flow out of a resumption handler. This would be
+a weighty operation, causing a stack unwind like a termination, so there might
+be a different statement or a statement modifier to make sure the user does
+this purposefully.
+For instance, resumption could be extended to cover this use by allowing local
+control flow out of it. This approach would require an unwind as part of the
+transition as there are stack frames that have to be removed.  This approach
+means there is no notify raise, but because \CFA does not have exception
+signatures, a termination can be thrown from within any resumption handler so
+there is already a way to mimic this in existing \CFA.
+However this would require the more complex system as they cannot be inlined
+into the original function as they can be run at a different place on the
+stack. So instead the unwinding will have to carry with it information on
+which one of these points to continue at and possibly also the return value
+for the function if a \codeCFA{return} statement was used.
+\end{itemize}
+% Maybe talk about the escape; and escape CONTROL_STMT; statements or how
+% if we could choose if _Unwind_Resume proceeded to the clean-up stage this
+% would be much easier to implement.

doc/theses/andrew_beach_MMath/implement.tex

-              r342af53
+              r8e4aa05
 % Goes over how all the features are implemented.
+The implementation work for this thesis covers two components: the virtual
+system and exceptions. Each component is discussed in detail.
 \section{Virtual System}
+\label{s:VirtualSystem}
 % Virtual table rules. Virtual tables, the pointer to them and the cast.
+The \CFA virtual system only has one public facing feature: virtual casts.
+However there is a lot of structure to support that and provide some other
+features for the standard library.
+All of this is accessed through a field inserted at the beginning of every
+virtual type. Currently it is called \codeC{virtual_table} but it is not
+meant to be accessed by the user. This field is a pointer to the type's
+virtual table instance. It is assigned once during the object's construction
+and left alone after that.
+\subsection{Virtual Table Construction}
+For each virtual type a virtual table is constructed. This is both a new type
+and an instance of that type. Other instances of the type could be created
+but the system doesn't use them. So this section will go over the creation of
+the type and the instance.
+Creating the single instance is actually very important. The address of the
+table acts as the unique identifier for the virtual type. Similarly the first
+field in every virtual table is the parent's id; a pointer to the parent
+virtual table instance.
+The remaining fields contain the type's virtual members. First come the ones
+present on the parent type, in the same order as they were in the parent, and
+then any that this type introduces. The types of the ones inherited from the
+parent may have a slightly modified type, in that references to the
+dispatched type are replaced with the current virtual type. These are always
+taken by pointer or reference.
+The structure itself is created where the virtual type is created. The name
+of the type is created by mangling the name of the base type. The name of the
+instance is also generated by name mangling.
+The fields are initialized automatically.
+While the \CFA virtual system currently has only one public feature, virtual
+cast \see{\VPageref{p:VirtualCast}}, substantial structure is required to
+support it, and provide features for exception handling and the standard
+library.
+\subsection{Virtual Type}
+Virtual types only have one change to their structure, the addition of a
+pointer to the virtual table. This is always the first field so that
+if it is cast to a supertype the field's location is still known.
+This field is set as part of all new generated constructors.
+\todo{They only come as part exceptions and don't work.}
+After the object is created the field is constant.
+However it can be read from, internally it is just a regular field called
+@virtual_table@. Dereferencing it gives the virtual table and access to the
+type's virtual members.
+\subsection{Virtual Table}
+Every time a virtual type is defined the new virtual table type must also be
+defined.
+The unique instance is important because the address of the virtual table
+instance is used as the identifier for the virtual type. So a pointer to the
+virtual table and the ID for the virtual type are interchangeable.
+\todo{Unique instances might be going so we will have to talk about the new
+system instead.}
+The first step in putting it all together is to create the virtual table type.
+The virtual table type is just a structure and can be described in terms of
+its fields. The first field is always the parent type ID (or a pointer to
+the parent virtual table) or 0 (the null pointer).
+Next, the other fields on the parent virtual table are repeated.
+Finally are the fields used to store any new virtual members of the new
+The virtual type
+The virtual system is accessed through a private constant field inserted at the
+beginning of every virtual type, called the virtual-table pointer. This field
+points at a type's virtual table and is assigned during the object's
+construction. The address of a virtual table acts as the unique identifier for
+the virtual type, and the first field of a virtual table is a pointer to the
+parent virtual-table or @0p@. The remaining fields are duplicated from the
+parent tables in this type's inheritance chain, followed by any fields this type
+introduces. Parent fields are duplicated so they can be changed (all virtual
+members are overridable), so that references to the dispatched type
+are replaced with the current virtual type.
+% These are always taken by pointer or reference.
+% Simple ascii diagram:
+\begin{verbatim}
+parent_pointer  \
+parent_field0   |
+...             | Same layout as parent.
+parent_fieldN   /
+child_field0
+...
+child_fieldN
+\end{verbatim}
+\todo{Refine the diagram}
+% For each virtual type, a virtual table is constructed. This is both a new type
+% and an instance of that type. Other instances of the type could be created
+% but the system doesn't use them. So this section will go over the creation of
+% the type and the instance.
+A virtual table is created when the virtual type is created. The name of the
+type is created by mangling the name of the base type. The name of the instance
+is also generated by name mangling. The fields are initialized automatically.
 The parent field is initialized by getting the type of the parent field and
 using that to calculate the mangled name of the parent's virtual table type.
 There are two special fields that are included like normal fields but have
+special initialization rules: the \codeC{size} field is the type's size and is
+initialized with a sizeof expression, the \codeC{align} field is the type's
+alignment and uses an alignof expression. The remaining fields are resolved
+to a name matching the field's name and type using the normal visibility
+and overload resolution rules of the type system.
+These operations are split up into several groups depending on where they
+take place which can vary for monomorphic and polymorphic types. The first
+division is between the declarations and the definitions. Declarations, such
+as a function signature or a structure's name, must always be visible but may
+be repeated so they go in headers. Definitions, such as function bodies and a
+structure's layout, don't have to be visible on use but must occur exactly
+once and go into source files.
+special initialization rules: the @size@ field is the type's size and is
+initialized with a @sizeof@ expression, the @align@ field is the type's
+alignment and uses an @alignof@ expression. The remaining fields are resolved
+to a name matching the field's name and type using the normal visibility and
+overload resolution rules of the type system.
+These operations are split up into several groups depending on where they take
+place which varies for monomorphic and polymorphic types. The first division is
+between the declarations and the definitions. Declarations, such as a function
+signature or an aggregate's name, must always be visible but may be repeated in
+the form of forward declarations in headers. Definitions, such as function
+bodies and an aggregate's layout, can be separately compiled but must occur
+exactly once in a source file.
+\begin{sloppypar}
 The declarations include the virtual type definition and forward declarations
 of the virtual table instance, constructor, message function and
+\codeCFA{get_exception_vtable}. The definition includes the storage and
+initialization of the virtual table instance and the bodies of the three
+functions.
+@get_exception_vtable@. The definition includes the storage and initialization
+of the virtual table instance and the bodies of the three functions.
+\end{sloppypar}
 Monomorphic instances put all of these two groups in one place each.
+Polymorphic instances also split out the core declarations and definitions
+from the per-instance information. The virtual table type and most of the
+functions are polymorphic so they are all part of the core. The virtual table
+instance and the \codeCFA{get_exception_vtable} function.
+Coroutines and threads need instances of \codeCFA{CoroutineCancelled} and
+\codeCFA{ThreadCancelled} respectively to use all of their functionality.
+When a new data type is declared with \codeCFA{coroutine} or \codeCFA{thread}
+the forward declaration for the instance is created as well. The definition
+of the virtual table is created at the definition of the main function.
+Polymorphic instances also split out the core declarations and definitions from
+the per-instance information. The virtual table type and most of the functions
+are polymorphic so they are all part of the core. The virtual table instance
+and the @get_exception_vtable@ function.
+\begin{sloppypar}
+Coroutines and threads need instances of @CoroutineCancelled@ and
+@ThreadCancelled@ respectively to use all of their functionality. When a new
+data type is declared with @coroutine@ or @thread@ the forward declaration for
+the instance is created as well. The definition of the virtual table is created
+at the definition of the main function.
+\end{sloppypar}
 \subsection{Virtual Cast}
+Virtual casts are implemented as a function call that does the check and an
+old C-style cast to do the type conversion. The C-cast is just to make sure
+the generated code is correct so the rest of the section is about that
+function.
+The function is \codeC{__cfa__virtual_cast} and it is implemented in the
+standard library. It takes a pointer to the target type's virtual table and
+the object pointer being cast. The function is very simple, getting the
+object's virtual table pointer and then checking to see if it or any of
+its ancestors, by using the parent pointers, are the same as the target type
+virtual table pointer. It does this in a simple loop.
+For the generated code a forward declaration of the virtual works as follows.
+There is a forward declaration of \codeC{__cfa__virtual_cast} in every cfa
+file so it can just be used. The object argument is the expression being cast
+so that is just placed in the argument list.
+To build the target type parameter the compiler will create a mapping from
+concrete type-name -- so for polymorphic types the parameters are filled in
+-- to virtual table address. Every virtual table declaration is added to
+this table; repeats are ignored unless they have conflicting definitions.
+This does mean the declarations have to be in scope, but they should usually
+be introduced as part of the type definition.
+Virtual casts are implemented as a function call that does the subtype check
+and a C coercion-cast to do the type conversion.
+% The C-cast is just to make sure the generated code is correct so the rest of
+% the section is about that function.
+The function is
+\begin{cfa}
+void * __cfa__virtual_cast(
+        struct __cfa__parent_vtable const * parent,
+        struct __cfa__parent_vtable const * const * child );
+\end{cfa}
+and it is implemented in the standard library. The structure represents the
+head of a vtable which is the pointer to the parent virtual table. The
+@parent@ points directly at the parent type virtual table while the @child@
+points at the object of the (possible) child type.
+In terms of the virtual cast expression, @parent@ comes from looking up the
+type being cast to and @child@ is the result of the expression being cast.
+Because the compiler outputs C code, some C type casts are also used.
+The last bit of glue is a map that saves every virtual type the compiler
+sees. This is used to check the type used in a virtual cast is a virtual
+type and to get its virtual table.
+(It also checks for conflicting definitions.)
+Inside the function it is a simple conditional. If the type represented by
+@parent@ is or is an ancestor of the type represented by @*child@ (it
+requires one more level of dereference to pass through the object) then @child@
+is returned, otherwise the null pointer is returned.
+The check itself is performed as a simple linear search. If the child
+virtual table or any of its ancestors (which are retrieved through the first
+field of every virtual table) are the same as the parent virtual table then
+the cast succeeds.
 \section{Exceptions}
 …
 % resumption doesn't as well.
+Many modern languages work with an internal stack that functions push and pop
+their local data to. Stack unwinding removes large sections of the stack,
+often across functions.
+At a very basic level this can be done with \codeC{setjmp} \& \codeC{longjmp}
+which simply move the top of the stack, discarding everything on the stack
+above a certain point. However this ignores all the clean-up code that should
+be run when certain sections of the stack are removed (for \CFA these are from
+destructors and finally clauses) and also requires that the point to which the
+stack is being unwound is known ahead of time. libunwind is used to address
+both of these problems.
+Libunwind, provided in \texttt{unwind.h} on most platforms, is a C library
+that provides \CPP style stack unwinding. Its operation is divided into two
+phases. The search phase -- phase 1 -- is used to scan the stack and decide
+where the unwinding will stop, this allows for a dynamic target. The clean-up
+phase -- phase 2 -- does the actual unwinding and also runs any clean-up code
+as it goes.
+To use the libunwind each function must have a personality function and an
+LSDA (Language Specific Data Area). Libunwind actually does very little, it
+simply moves down the stack from function to function. Most of the actions are
+implemented by the personality function which libunwind calls on every
+function. Since this is shared across many functions or even every function in
+a language it will need a bit more information. This is provided by the LSDA
+which has the unique information for each function.
+Theoretically the LSDA can contain anything but conventionally it is a table
+with entries representing areas of the function and what has to be done there
+during unwinding. These areas are described in terms of where the instruction
+pointer is. If the current value of the instruction pointer is between two
+values representing the beginning and end of a region then that region is
+currently being executed. These are used to mark out try blocks and the
+scopes of objects with destructors to run.
+GCC will generate an LSDA and attach its personality function with the
+\texttt{-fexceptions} flag. However this only handles the cleanup attribute.
+This attribute is used on a variable and specifies a function that should be
+run when the variable goes out of scope. The function is passed a pointer to
+the object as well so it can be used to mimic destructors. It however cannot
+be used to mimic try statements.
+\subsection{Implementing Personality Functions}
+Personality functions have a complex interface specified by libunwind.
+This section will cover some of the important parts of that interface.
+\begin{lstlisting}
+typedef _Unwind_Reason_Code (*_Unwind_Personality_Fn)(
+    int version,
+    _Unwind_Action action,
+    _Unwind_Exception_Class exception_class,
+    _Unwind_Exception * exception,
+    struct _Unwind_Context * context);
+% Many modern languages work with an internal stack that functions push and pop
+% their local data to. Stack unwinding removes large sections of the stack,
+% often across functions.
+Stack unwinding is the process of removing stack frames (activations) from the
+stack. On function entry and return, unwinding is handled directly by the code
+embedded in the function. Usually, the stack-frame size is known statically
+based on parameter and local variable declarations. For dynamically-sized
+local variables, a runtime computation is necessary to know the frame
+size. Finally, a function's frame-size may change during execution as local
+variables (static or dynamic sized) go in and out of scope.
+Allocating/deallocating stack space is usually an $O(1)$ operation achieved by
+bumping the hardware stack-pointer up or down as needed.
+Unwinding across multiple stack frames is more complex because individual stack
+management code associated with each frame is bypassed. That is, the location
+of a function's frame-management code is largely unknown and dispersed
+throughout the function, hence the current frame size managed by that code is
+also unknown. Hence, code unwinding across frames does not have direct
+knowledge about what is on the stack, and hence, how much of the stack needs to
+be removed.
+% At a very basic level this can be done with @setjmp@ \& @longjmp@ which simply
+% move the top of the stack, discarding everything on the stack above a certain
+% point. However this ignores all the cleanup code that should be run when
+% certain sections of the stack are removed (for \CFA these are from destructors
+% and finally clauses) and also requires that the point to which the stack is
+% being unwound is known ahead of time. libunwind is used to address both of
+% these problems.
+The traditional unwinding mechanism for C is implemented by saving a snap-shot
+of a function's state with @setjmp@ and restoring that snap-shot with
+@longjmp@. This approach bypasses the need to know stack details by simply
+resetting to a snap-shot of an arbitrary but existing function frame on the
+stack. It is up to the programmer to ensure the snap-shot is valid when it is
+reset, making this unwinding approach fragile with potential errors that are
+difficult to debug because the stack becomes corrupted.
+However, many languages define cleanup actions that must be taken when objects
+are deallocated from the stack or blocks end, such as running a variable's
+destructor or a @try@ statement's @finally@ clause. Handling these mechanisms
+requires walking the stack and checking each stack frame for these potential
+actions.
+For exceptions, it must be possible to walk the stack frames in search of @try@
+statements to match and execute a handler. For termination exceptions, it must
+also be possible to unwind all stack frames from the throw to the matching
+catch, and each of these frames must be checked for cleanup actions. Stack
+walking is where most of the complexity and expense of exception handling
+appears.
+One of the most popular tools for stack management is libunwind, a low-level
+library that provides tools for stack walking, handler execution, and
+unwinding. What follows is an overview of all the relevant features of
+libunwind needed for this work, and how \CFA uses them to implement exception
+handling.
+\subsection{libunwind Usage}
+Libunwind, accessed through @unwind.h@ on most platforms, is a C library that
+provides \CC-style stack-unwinding. Its operation is divided into two phases:
+search and cleanup. The dynamic target search -- phase 1 -- is used to scan the
+stack and decide where unwinding should stop (but no unwinding occurs). The
+cleanup -- phase 2 -- does the unwinding and also runs any cleanup code.
+To use libunwind, each function must have a personality function and a Language
+Specific Data Area (LSDA). The LSDA has the unique information for each
+function to tell the personality function where a function is executing, its
+current stack frame, and what handlers should be checked. Theoretically, the
+LSDA can contain any information but conventionally it is a table with entries
+representing regions of the function and what has to be done there during
+unwinding. These regions are bracketed by the instruction pointer. If the
+instruction pointer is within a region's start/end, then execution is currently
+executing in that region. Regions are used to mark out the scopes of objects
+with destructors and try blocks.
+% Libunwind actually does very little, it simply moves down the stack from
+% function to function. Most of the actions are implemented by the personality
+% function which libunwind calls on every function. Since this is shared across
+% many functions or even every function in a language it will need a bit more
+% information.
+The GCC compilation flag @-fexceptions@ causes the generation of an LSDA and
+attaches its personality function. However, this
+flag only handles the cleanup attribute:
+\todo{Peter: What is attached? Andrew: It uses the .cfi\_personality directive
+and that's all I know.}
+\begin{cfa}
+void clean_up( int * var ) { ... }
+int avar __attribute__(( cleanup(clean_up) ));
+\end{cfa}
+which is used on a variable and specifies a function, in this case @clean_up@,
+run when the variable goes out of scope.
+The function is passed a pointer to the object being removed from the stack
+so it can be used to mimic destructors.
+However, this feature cannot be used to mimic @try@ statements as it cannot
+control the unwinding.
+\subsection{Personality Functions}
+Personality functions have a complex interface specified by libunwind. This
+section covers some of the important parts of the interface.
+A personality function can perform different actions depending on how it is
+called.
+\begin{lstlisting}[language=C,{moredelim=**[is][\color{red}]{@}{@}}]
+typedef _Unwind_Reason_Code (*@_Unwind_Personality_Fn@) (
+        _Unwind_Action @action@,
+        _Unwind_Exception_Class @exception_class@,
+        _Unwind_Exception * @exception@,
+        struct _Unwind_Context * @context@
+);
 \end{lstlisting}
+The return value, the reason code, is an enumeration of possible messages
+The @action@ argument is a bitmask of possible actions:
+\begin{enumerate}
+\item
+@_UA_SEARCH_PHASE@ specifies a search phase and tells the personality function
+to check for handlers. If there is a handler in a stack frame, as defined by
+the language, the personality function returns @_URC_HANDLER_FOUND@; otherwise
+it returns @_URC_CONTINUE_UNWIND@.
+\item
+@_UA_CLEANUP_PHASE@ specifies a cleanup phase, where the entire frame is
+unwound and all cleanup code is run. The personality function does whatever
+cleanup the language defines (such as running destructors/finalizers) and then
+generally returns @_URC_CONTINUE_UNWIND@.
+\item
+\begin{sloppypar}
+@_UA_HANDLER_FRAME@ specifies a cleanup phase on a function frame that found a
+handler. The personality function must prepare to return to normal code
+execution and return @_URC_INSTALL_CONTEXT@.
+\end{sloppypar}
+\item
+@_UA_FORCE_UNWIND@ specifies a forced unwind call. Forced unwind only performs
+the cleanup phase and uses a different means to decide when to stop
+\see{\VRef{s:ForcedUnwind}}.
+\end{enumerate}
+The @exception_class@ argument is a copy of the
+\lstinline[language=C]|exception|'s @exception_class@ field.
+The \lstinline[language=C]|exception| argument is a pointer to the user
+provided storage object. It has two public fields, the exception class, which
+is actually just a number, identifying the exception handling mechanism that
+created it, and the cleanup function. The cleanup function is called if
+required by the exception.
+The @context@ argument is a pointer to an opaque type passed to helper
+functions called inside the personality function.
+The return value, @_Unwind_Reason_Code@, is an enumeration of possible messages
 that can be passed several places in libunwind. It includes a number of
 messages for special cases (some of which should never be used by the
 personality function) and error codes but unless otherwise noted the
+personality function should always return \codeC{_URC_CONTINUE_UNWIND}.
+The \codeC{version} argument is the version of the implementation that is
+calling the personality function. At this point it appears to always be 1 and
+it will likely stay that way until a new version of the API is updated.
+The \codeC{action} argument is a set of flags that tell the personality
+function when it is being called and what it must do on this invocation.
+The flags are as follows:
+\begin{itemize}
+\item\codeC{_UA_SEARCH_PHASE}: This flag is set whenever the personality
+function is called during the search phase. The personality function should
+decide if unwinding will stop in this function or not. If it does then the
+personality function should return \codeC{_URC_HANDLER_FOUND}.
+\item\codeC{_UA_CLEANUP_PHASE}: This flag is set whenever the personality
+function is called during the cleanup phase. If no other flags are set this
+means the entire frame will be unwound and all cleanup code should be run.
+\item\codeC{_UA_HANDLER_FRAME}: This flag is set during the cleanup phase
+on the function frame that found the handler. The personality function must
+prepare to return to normal code execution and return
+\codeC{_URC_INSTALL_CONTEXT}.
+\item\codeC{_UA_FORCE_UNWIND}: This flag is set if the personality function
+is called through a forced unwind call. Forced unwind only performs the
+cleanup phase and uses a different means to decide when to stop. See its
+section below.
+\end{itemize}
+The \codeC{exception_class} argument is a copy of the \codeC{exception}'s
+\codeC{exception_class} field.
+The \codeC{exception} argument is a pointer to the user provided storage
+object. It has two public fields, the exception class which is actually just
+a number that identifies the exception handling mechanism that created it and
+the other is the clean-up function. The clean-up function is called if the
+exception needs to
+The \codeC{context} argument is a pointer to an opaque type. This is passed
+to the many helper functions that can be called inside the personality
+function.
+personality function should always return @_URC_CONTINUE_UNWIND@.
 \subsection{Raise Exception}
+This could be considered the central function of libunwind. It performs the
+two staged unwinding the library is built around and most of the rest of the
+interface of libunwind is here to support it. Its signature is as follows:
+\begin{lstlisting}
+Raising an exception is the central function of libunwind and it performs a
+two-staged unwinding.
+\begin{cfa}
 _Unwind_Reason_Code _Unwind_RaiseException(_Unwind_Exception *);
+\end{cfa}
+First, the function begins the search phase, calling the personality function
+of the most recent stack frame. It continues to call personality functions
+traversing the stack from newest to oldest until a function finds a handler or
+the end of the stack is reached. In the latter case, raise exception returns
+@_URC_END_OF_STACK@.
+Second, when a handler is matched, raise exception continues onto the cleanup
+phase.
+Once again, it calls the personality functions of each stack frame from newest
+to oldest. This pass stops at the stack frame containing the matching handler.
+If that personality function has not installed a handler, it is an error.
+If an error is encountered, raise exception returns either
+@_URC_FATAL_PHASE1_ERROR@ or @_URC_FATAL_PHASE2_ERROR@ depending on when the
+error occurred.
+\subsection{Forced Unwind}
+\label{s:ForcedUnwind}
+Forced Unwind is the other central function in libunwind.
+\begin{cfa}
+_Unwind_Reason_Code _Unwind_ForcedUnwind( _Unwind_Exception *,
+        _Unwind_Stop_Fn, void *);
+\end{cfa}
+It also unwinds the stack but it does not use the search phase. Instead another
+function, the stop function, is used to stop searching. The exception is the
+same as the one passed to raise exception. The extra arguments are the stop
+function and the stop parameter. The stop function has a similar interface as a
+personality function, except it is also passed the stop parameter.
+\begin{lstlisting}[language=C,{moredelim=**[is][\color{red}]{@}{@}}]
+typedef _Unwind_Reason_Code (*@_Unwind_Stop_Fn@)(
+        _Unwind_Action @action@,
+        _Unwind_Exception_Class @exception_class@,
+        _Unwind_Exception * @exception@,
+        struct _Unwind_Context * @context@,
+        void * @stop_parameter@);
 \end{lstlisting}
-When called the function begins the search phase, calling the personality
-function of the most recent stack frame. It will continue to call personality
-functions traversing the stack new-to-old until a function finds a handler or
-the end of the stack is reached. In the latter case raise exception will
-return with \codeC{_URC_END_OF_STACK}.
-Once a handler has been found raise exception continues onto the the cleanup
-phase. Once again it will call the personality functins of each stack frame
-from newest to oldest. This pass will stop at the stack frame that found the
-handler last time, if that personality function does not install the handler
-it is an error.
-If an error is encountered raise exception will return either
-\codeC{_URC_FATAL_PHASE1_ERROR} or \codeC{_URC_FATAL_PHASE2_ERROR} depending
-on when the error occured.
-\subsection{Forced Unwind}
-This is the second big function in libunwind. It also unwinds a stack but it
-does not use the search phase. Instead another function, the stop function,
-is used to decide when to stop.
-\begin{lstlisting}
-_Unwind_Reason_Code _Unwind_ForcedUnwind(
-    _Unwind_Exception *, _Unwind_Stop_Fn, void *);
-\end{lstlisting}
-The exception is the same as the one passed to raise exception. The extra
-arguments are the stop function and the stop parameter. The stop function has
-a similar interface as a personality function, except it is also passed the
-stop parameter.
-\begin{lstlisting}
-typedef _Unwind_Reason_Code (*_Unwind_Stop_Fn)(
-    int version,
-    _Unwind_Action action,
-    _Unwind_Exception_Class exception_class,
-    _Unwind_Exception * exception,
-    struct _Unwind_Context * context,
-    void * stop_parameter);
-\end{lstlisting}
 The stop function is called at every stack frame before the personality
+function is called and then once more after all frames of the stack have
+been unwound.
+Each time it is called the stop function should return \codeC{_URC_NO_REASON}
+or transfer control directly to other code outside of libunwind. The
+framework does not provide any assistance here.
+Its arguments are the same as the paired personality function.
+The actions \codeC{_UA_CLEANUP_PHASE} and \codeC{_UA_FORCE_UNWIND} are always
+set when it is called. By the official standard that is all but both GCC and
+Clang add an extra action on the last call at the end of the stack:
+\codeC{_UA_END_OF_STACK}.
+function is called and then once more after all frames of the stack are
+unwound.
+Each time it is called, the stop function should return @_URC_NO_REASON@ or
+transfer control directly to other code outside of libunwind. The framework
+does not provide any assistance here.
+\begin{sloppypar}
+Its arguments are the same as the paired personality function. The actions
+@_UA_CLEANUP_PHASE@ and @_UA_FORCE_UNWIND@ are always set when it is
+called. Beyond the libunwind standard, both GCC and Clang add an extra action
+on the last call at the end of the stack: @_UA_END_OF_STACK@.
+\end{sloppypar}
 \section{Exception Context}
 % Should I have another independent section?
 % There are only two things in it, top_resume and current_exception. How it is
+% stored changes depending on whether or not the thread-library is linked.
+The exception context is a piece of global storage used to maintain data
+across different exception operations and to communicate between different
+components.
+Each stack has its own exception context. In a purely sequential program, using
+only core Cforall, there is only one stack and the context is global. However
+if the library \texttt{libcfathread} is linked then there can be multiple
+stacks so they will each need their own.
+To handle this code always gets the exception context from the function
+\codeC{this_exception_context}. The main exception handling code is in
+\texttt{libcfa} and that library also defines the function as a weak symbol
+so it acts as a default. Meanwhile in \texttt{libcfathread} the function is
+defined as a strong symbol that replaces it when the libraries are linked
+together.
+The version of the function defined in \texttt{libcfa} is very simple. It
+returns a pointer to a global static variable. With only one stack this
+global instance is associated with the only stack.
+The version of the function defined in \texttt{libcfathread} has to handle
+more as there are multiple stacks. The exception context is included as
+part of the per-stack data stored as part of coroutines. In the cold data
+section, stored at the base of each stack, is the exception context for that
+stack. The \codeC{this_exception_context} uses the concurrency library to get
+the current coroutine and through it the cold data section and the exception
+context.
+% stored changes depending on whether or not the thread-library is linked.
+The exception context is global storage used to maintain data across different
+exception operations and to communicate among different components.
+Each stack must have its own exception context. In a sequential \CFA program,
+there is only one stack with a single global exception-context. However, when
+the library @libcfathread@ is linked, there are multiple stacks where each
+needs its own exception context.
+General access to the exception context is provided by function
+@this_exception_context@. For sequential execution, this function is defined as
+a weak symbol in the \CFA system-library, @libcfa@. When a \CFA program is
+concurrent, it links with @libcfathread@, where this function is defined with a
+strong symbol replacing the sequential version.
+The sequential @this_exception_context@ returns a hard-coded pointer to the
+global exception context.
+The concurrent version adds the exception context to the data stored at the
+base of each stack. When @this_exception_context@ is called it retrieves the
+active stack and returns the address of the context saved there.
 \section{Termination}
 …
 % catches. Talk about GCC nested functions.
 Termination exceptions use libunwind quite heavily because it matches the
+intended use from \CPP exceptions very closely. The main complication is that
+since the \CFA compiler works by translating to C code it cannot generate the
 assembly to form the LSDA for try blocks or destructors.
+Termination exceptions use libunwind heavily because it matches the intended
+use from \CC exceptions closely. The main complication for \CFA is that the
+compiler generates C code, making it very difficult to generate the assembly to
+form the LSDA for try blocks or destructors.
 \subsection{Memory Management}
+The first step of termination is to copy the exception into memory managed by
+the exception system. Currently the system just uses malloc, without reserved
+memory or any ``small allocation'' optimizations. The exception handling
+mechanism manages memory for the exception as well as memory for libunwind
+and the system's own per-exception storage.
+Exceptions are stored in variable-sized blocks. The first component is a fixed
+sized data structure that contains the information for libunwind and the
+exception system. The second component is a blob of memory that is big enough
+to store the exception. Macros with pointer arithmetic and type casts are
+used to move between the components or go from the embedded
+\codeC{_Unwind_Exception} to the entire node.
+All of these nodes are strung together in a linked list. One linked list per
+stack, with the head stored in the exception context. Within each linked list
+the most recently thrown exception is at the head and the older exceptions
+are further down the list. This list format allows exceptions to be thrown
+while a different exception is being handled. Only the exception at the head
+of the list is currently being handled, the others will wait for the
+exceptions before them to be removed.
+Memory management uses the virtual members in the exception's virtual table.
+The size of the
+exception, the copy function and the free function are all in the virtual
+table so they are decided per-exception type. The size and copy function are
+used right away when the exception is copied in to managed memory. After the
+exception is handled the free function is used to clean up the exception and
+then the entire node is passed to free.
+\subsection{Try Statements \& Catch Clauses}
+The try statements with termination handlers have a pretty complex conversion
+to compensate for the lack of assembly generation. Libunwind requires an LSDA
+(Language Specific Data Area) and personality function for a function to
+unwind across it. The LSDA in particular is hard to generate at the level of
+C which is what the \CFA compiler outputs so a work-around is used.
+This work around is a function called \codeC{__cfaehm_try_terminate} in the
+standard library. The contents of a try block and the termination handlers
+are converted into functions. These are then passed to the try terminate
+function and it calls them. This puts the try statements in their own
+functions so that no function has to deal with both termination handlers and
+destructors.
+This function has some custom embedded assembly that defines its personality
+function and LSDA. This is hand coded in C which is why there is only one
+version of it, the compiler has no capability to generate it. The personality
+function is structured so that it may be expanded, but really it only handles
+this one function. Notably it does not handle any destructors so the function
+is constructed so that it does not need to run any.
+The first step of a termination raise is to copy the exception into memory
+managed by the exception system. Currently, the system uses @malloc@, rather
+than reserved memory or the stack top. The exception handling mechanism manages
+memory for the exception as well as memory for libunwind and the system's own
+per-exception storage.
+[Quick ASCII diagram to get started.]
+\begin{verbatim}
+Fixed Header  | _Unwind_Exception   <- pointer target
+              |
+              | Cforall storage
+              |
+Variable Body | the exception       <- fixed offset
+              V ...
+\end{verbatim}
+Exceptions are stored in variable-sized blocks.
+The first component is a fixed sized data structure that contains the
+information for libunwind and the exception system. The second component is an
+area of memory big enough to store the exception. Macros with pointer arithmetic
+and type casts are used to move between the components or go from the embedded
+@_Unwind_Exception@ to the entire node.
+All of these nodes are linked together in a list, one list per stack, with the
+list head stored in the exception context. Within each linked list, the most
+recently thrown exception is at the head followed by older thrown
+exceptions. This format allows exceptions to be thrown, while a different
+exception is being handled. The exception at the head of the list is currently
+being handled, while other exceptions wait for the exceptions before them to be
+removed.
+The virtual members in the exception's virtual table provide the size of the
+exception, the copy function, and the free function, so they are specific to an
+exception type. The size and copy function are used immediately to copy an
+exception into managed memory. After the exception is handled the free function
+is used to clean up the exception and then the entire node is passed to free
+so the memory can be given back to the heap.
+\subsection{Try Statements and Catch Clauses}
+The try statement with termination handlers is complex because it must
+compensate for the lack of assembly-code generated from \CFA. Libunwind
+requires an LSDA and personality function for control to unwind across a
+function. The LSDA in particular is hard to mimic in generated C code.
+The workaround is a function called @__cfaehm_try_terminate@ in the standard
+library. The contents of a try block and the termination handlers are converted
+into functions. These are then passed to the try terminate function and it
+calls them.
+Because this function is known and fixed (and not an arbitrary function that
+happens to contain a try statement) this means the LSDA can be generated ahead
+of time.
+Both the LSDA and the personality function are set ahead of time using
+embedded assembly. This is handcrafted using C @asm@ statements and contains
+enough information for the single try statement the function represents.
 The three functions passed to try terminate are:
 \begin{itemize}
 \item The try function: This function is the try block, all the code inside
 the try block is placed inside the try function. It takes no parameters and
+has no return value. This function is called during regular execution to run
 the try block.
+\item The match function: This function decides if this try statement should
+handle any given termination exception. It takes a pointer to the exception
+and returns 0 if the exception is not handled here. Otherwise the return value
+is the id of the handler that should handle the exception. It is called
+during the search phase.
+It is constructed from the conditional part of each handler. It runs each
+check in turn, first checking to see if the object
+\item The catch function: This function handles the exception. It takes a
+pointer to the exception and the handler's id and returns nothing. It is
+called after the clean-up phase.
+It is constructed by stitching together the bodies of each handler
+\end{itemize}
+All three are created with GCC nested functions. GCC nested functions can be
+used to create closures, functions that can refer to the state of other
+functions on the stack. This allows the functions to refer to the main
+function and all the variables in scope.
+These nested functions and all other functions besides
+\codeC{__cfaehm_try_terminate} in \CFA use the GCC personality function and
+the \texttt{-fexceptions} flag to generate the LSDA. This allows destructors
 to be implemented with the cleanup attribute.
+\begin{description}
+\item[try function:] This function is the try block, all the code inside the
+try block is placed inside the try function. It takes no parameters and has no
+return value. This function is called during regular execution to run the try
+block.
+\item[match function:] This function is called during the search phase and
+decides if a catch clause matches the termination exception. It is constructed
+from the conditional part of each handler and runs each check, top to bottom,
+in turn, first checking to see if the exception type matches and then if the
+condition is true. It takes a pointer to the exception and returns 0 if the
+exception is not handled here. Otherwise the return value is the id of the
+handler that matches the exception.
+\item[handler function:] This function handles the exception. It takes a
+pointer to the exception and the handler's id and returns nothing. It is called
+after the cleanup phase. It is constructed by stitching together the bodies of
+each handler and dispatches to the selected handler.
+\end{description}
+All three functions are created with GCC nested functions. GCC nested functions
+can be used to create closures, functions that can refer to the state of other
+functions on the stack. This approach allows the functions to refer to all the
+variables in scope for the function containing the @try@ statement. These
+nested functions and all other functions besides @__cfaehm_try_terminate@ in
+\CFA use the GCC personality function and the @-fexceptions@ flag to generate
+the LSDA. This allows destructors to be implemented with the cleanup attribute.
 \section{Resumption}
 % The stack-local data, the linked list of nodes.
+Resumption uses a list of nodes for its stack traversal. The head of the list
+is stored in the exception context. The nodes in the list just have a pointer
+Resumption is simple to implement because there is no stack unwinding. The
+resumption raise uses a list of nodes for its stack traversal. The head of the
+list is stored in the exception context. The nodes in the list have a pointer
 to the next node and a pointer to the handler function.
+On a resumption throw this list is traversed. At each node the
+handler function is called and is passed the exception by pointer. It returns
+true if the exception was handled and false otherwise.
+The handler function does both the matching and catching. It tries the
+condition of each \codeCFA{catchResume} in order, top-to-bottom, until it
+finds a handler that matches. If no handler matches then the function returns
+false. Otherwise the matching handler is run, if it completes successfully
+the function returns true. Rethrows, through the \codeCFA{throwResume;}
+statement, cause the function to return true.
+\subsection{Libunwind Compatibility}
+Resumption does not use libunwind for two simple reasons. The first is that
+it does not have to unwind anything so would never need to use the clean-up
+phase. Still the search phase could be used to make it free to enter or exit
+a try statement with resumption handlers in the same way termination handlers
+are for the same trade off in the cost of the throw. This is where the second
+reason comes in, there is no way to return from a search without installing
+a handler or raising an error.
+Although work arounds could be created none seemed to be worth it for the
+prototype. This implementation has no difference in behaviour and is much
+simpler.
+A resumption raise traverses this list. At each node the handler function is
+called, passing the exception by pointer. It returns true if the exception is
+handled and false otherwise.
+The handler function does both the matching and handling. It computes the
+condition of each @catchResume@ in top-to-bottom order, until it finds a
+handler that matches. If no handler matches then the function returns
+false. Otherwise the matching handler is run; if it completes successfully, the
+function returns true. Rethrowing, through the @throwResume;@ statement,
+causes the function to return true.
+% Recursive Resumption Stuff:
+Search skipping \see{\VPageref{p:searchskip}}, which ignores parts of the stack
+already examined, is accomplished by updating the front of the list as the
+search continues. Before the handler at a node is called the head of the list
+is updated to the next node of the current node. After the search is complete,
+successful or not, the head of the list is reset.
+This mechanism means the current handler and every handler that has already
+been checked are not on the list while a handler is run. If a resumption is
+thrown during the handling of another resumption the active handlers and all
+the other handlers checked up to this point are not checked again.
+This structure also supports new handlers added while the resumption is being
+handled. These are added to the front of the list, pointing back along the
+stack -- the first one points over all the checked handlers -- and the ordering
+is maintained.
+\label{p:zero-cost}
+Note, the resumption implementation has a cost for entering/exiting a @try@
+statement with @catchResume@ clauses, whereas a @try@ statement with @catch@
+clauses has zero-cost entry/exit. While resumption does not need the stack
+unwinding and cleanup provided by libunwind, it could use the search phase to
+provide zero-cost enter/exit using the LSDA. Unfortunately, there is no way
+to return from a libunwind search without installing a handler or raising an
+error. Although workarounds might be possible, they are beyond the scope of
+this thesis. The current resumption implementation has simplicity in its
+favour.
 % Seriously, just compare the size of the two chapters and then consider
 % that unwind is required knowledge for that chapter.
 …
 \section{Finally}
 % Uses destructors and GCC nested functions.
+Finally clauses are a simple decomposition to some of the existing features.
+The code in the block is placed into a GCC nested function with a unique name,
+no arguments or return values. This nested function is then set as the
+clean-up function of an empty object that is declared at the beginning of a
+block placed around the contexts of the try statement.
+A finally clause is placed into a GCC nested-function with a unique name, and no
+arguments or return values. This nested function is then set as the cleanup
+function of an empty object that is declared at the beginning of a block placed
+around the context of the associated @try@ statement.
 The rest is handled by GCC. The try block and all handlers are inside the
 block. When they are complete control exits the block and the empty object
 is cleaned up, which runs the function that contains the finally code.
+block. At completion, control exits the block and the empty object is cleaned
+up, which runs the function that contains the finally code.
 \section{Cancellation}
 …
 Cancellation also uses libunwind to do its stack traversal and unwinding,
 however it uses a different primary function \codeC{_Unwind_ForcedUnwind}.
 Details of its interface can be found in the unwind section.
 The first step of cancellation is to find the stack that was cancelled and which
 type of stack it is. Luckily the threads library stores the main thread
 pointer and the current thread pointer and every thread stores a pointer to
+however it uses a different primary function @_Unwind_ForcedUnwind@. Details
+of its interface can be found in \VRef{s:ForcedUnwind}.
+The first step of cancellation is to find the cancelled stack and its type:
+coroutine or thread. Fortunately, the thread library stores the main thread
+pointer and the current thread pointer, and every thread stores a pointer to
 its main coroutine and the coroutine it is currently executing.
+So if the current thread's main and current coroutine do not match, it is
+a coroutine cancellation. Otherwise if the main and current thread do not
+match, it is a thread cancellation. Otherwise it is a main thread
+cancellation.
+However if the threading library is not linked then execution must be on the
+main stack as that is the only one that exists. So the entire check is skipped
+using the linker and weak symbols. Instead the main thread cancellation is
+unconditionally performed.
+Regardless of how they are chosen, afterwards the stop function and the stop
+parameter are passed to the forced unwind function. The general pattern of all
+three stop functions is the same, they continue unwinding until the end of
+stack when they do their primary work.
+Main stack cancellation is very simple. The ``transfer'' is just an abort,
+the program stops executing.
+The coroutine cancellation stores the exception on the coroutine and then
+does a coroutine context switch. The rest is handled inside resume. Every time
+control returns from a resumed thread there is a check to see if it is
+cancelled. If it is the exception is retrieved and the CoroutineCancelled
+exception is constructed and loaded. It is then thrown as a regular exception
+with the default handler coming from the context of the resumption call.
+The thread cancellation stores the exception on the thread's main stack and
+then returns to the scheduler. The rest is handled by the joiner. The wait
+for the joined thread to finish works the same but after that it checks
+to see if there was a cancellation. If there was the exception is retrieved
+and the ThreadCancelled exception is constructed. The default handler is
+passed in as a function pointer. If it is null (as it is for the
+auto-generated joins on destructor call) a default is used that simply
+calls abort; which gives the required handling on implicit join.
+First, check if the active thread's main and current coroutine are the same. If they
+are then the current stack is a thread stack, otherwise it is a coroutine
+stack. If it is a thread stack then an equality check with the stored main
+thread pointer and current thread pointer is enough to tell if the current
+thread is the main thread or not.
+However, if the threading library is not linked, the sequential execution is on
+the main stack. Hence, the entire check is skipped because the weak-symbol
+function is loaded. Therefore, a main thread cancellation is unconditionally
+performed.
+Regardless of how the stack is chosen, the stop function and parameter are
+passed to the forced-unwind function. The general pattern of all three stop
+functions is the same: they continue unwinding until the end of stack when they
+do their primary work.
+For main stack cancellation, the transfer is just a program abort.
+For coroutine cancellation, the exception is stored on the coroutine's stack,
+and the coroutine context switches to its last resumer. The rest is handled on
+the backside of the resume, which checks if the resumed coroutine is
+cancelled. If cancelled, the exception is retrieved from the resumed coroutine,
+and a @CoroutineCancelled@ exception is constructed and loaded with the
+cancelled exception. It is then resumed as a regular exception with the default
+handler coming from the context of the resumption call.
+For thread cancellation, the exception is stored on the thread's main stack and
+then context switched to the scheduler. The rest is handled by the thread
+joiner. When the join is complete, the joiner checks if the joined thread is
+cancelled. If cancelled, the exception is retrieved from the joined thread, and
+a @ThreadCancelled@ exception is constructed and loaded with the cancelled
+exception. The default handler is passed in as a function pointer. If it is
+null (as it is for the auto-generated joins on destructor call), the default is
+used, which is a program abort.
+%; which gives the required handling on implicit join.

doc/theses/andrew_beach_MMath/thesis-frontpgs.tex

-              r342af53
+              r8e4aa05
         A thesis \\
         presented to the University of Waterloo \\
+        presented to the University of Waterloo \\
         in fulfillment of the \\
         thesis requirement for the degree of \\
 …
 \cleardoublepage
 %----------------------------------------------------------------------
 % EXAMINING COMMITTEE (Required for Ph.D. theses only)
 …
 \begin{center}\textbf{Examining Committee Membership}\end{center}
   \noindent
+The following served on the Examining Committee for this thesis. The decision of the Examining Committee is by majority vote.
+  \bigskip
+  \noindent
+\begin{tabbing}
+Internal-External Member: \=  \kill % using longest text to define tab length
+External Examiner: \>  Bruce Bruce \\
+The following served on the Examining Committee for this thesis. The decision
+of the Examining Committee is by majority vote.
+  \bigskip
+  \noindent
+\begin{tabbing}
+Internal-External Member: \=  \kill % using longest text to define tab length
+External Examiner: \>  Bruce Bruce \\
 \> Professor, Dept. of Philosophy of Zoology, University of Wallamaloo \\
 \end{tabbing}
   \bigskip
+\end{tabbing}
+  \bigskip
   \noindent
 \begin{tabbing}
 …
 \end{tabbing}
   \bigskip
   \noindent
   \begin{tabbing}
 …
 \end{tabbing}
   \bigskip
   \noindent
 \begin{tabbing}
 …
 \end{tabbing}
   \bigskip
   \noindent
 \begin{tabbing}
 …
   % December 13th, 2006.  It is designed for an electronic thesis.
   \noindent
+I hereby declare that I am the sole author of this thesis. This is a true copy of the thesis, including any required final revisions, as accepted by my examiners.
+  \bigskip
+I hereby declare that I am the sole author of this thesis. This is a true copy
+of the thesis, including any required final revisions, as accepted by my
+examiners.
+  \bigskip
   \noindent
 I understand that my thesis may be made electronically available to the public.

doc/theses/andrew_beach_MMath/thesis.tex

-              r342af53
+              r8e4aa05
 % FRONT MATERIAL
 %----------------------------------------------------------------------
 \input{thesis-frontpgs}
+\input{thesis-frontpgs}
 %----------------------------------------------------------------------
 …
 A \gls{computer} could compute $\pi$ all day long. In fact, subsets of digits
 of $\pi$'s decimal approximation would make a good source for pseudo-random
 vectors, \gls{rvec} .
+vectors, \gls{rvec} .
 %----------------------------------------------------------------------
 …
 \begin{itemize}
 \item A well-prepared PDF should be
+\item A well-prepared PDF should be
   \begin{enumerate}
     \item Of reasonable size, {\it i.e.} photos cropped and compressed.
     \item Scalable, to allow enlargement of text and drawings.
   \end{enumerate}
+    \item Scalable, to allow enlargement of text and drawings.
+  \end{enumerate}
 \item Photos must be bit maps, and so are not scalable by definition. TIFF and
 BMP are uncompressed formats, while JPEG is compressed. Most photos can be
 compressed without losing their illustrative value.
 \item Drawings that you make should be scalable vector graphics, \emph{not}
+\item Drawings that you make should be scalable vector graphics, \emph{not}
 bit maps. Some scalable vector file formats are: EPS, SVG, PNG, WMF. These can
 all be converted into PNG or PDF, that pdflatex recognizes. Your drawing
 package probably can export to one of these formats directly. Otherwise, a
 common procedure is to print-to-file through a Postscript printer driver to
 create a PS file, then convert that to EPS (encapsulated PS, which has a
 bounding box to describe its exact size rather than a whole page).
+all be converted into PNG or PDF, that pdflatex recognizes. Your drawing
+package probably can export to one of these formats directly. Otherwise, a
+common procedure is to print-to-file through a Postscript printer driver to
+create a PS file, then convert that to EPS (encapsulated PS, which has a
+bounding box to describe its exact size rather than a whole page).
 Programs such as GSView (a Ghostscript GUI) can create both EPS and PDF from
 PS files. Appendix~\ref{AppendixA} shows how to generate properly sized Matlab
 plots and save them as PDF.
 \item It's important to crop your photos and draw your figures to the size that
 you want to appear in your thesis. Scaling photos with the
 includegraphics command will cause loss of resolution. And scaling down
+you want to appear in your thesis. Scaling photos with the
+includegraphics command will cause loss of resolution. And scaling down
 drawings may cause any text annotations to become too small.
 \end{itemize}
 For more information on \LaTeX\, see the uWaterloo Skills for the
 Academic Workplace \href{https://uwaterloo.ca/information-systems-technology/services/electronic-thesis-preparation-and-submission-support/ethesis-guide/creating-pdf-version-your-thesis/creating-pdf-files-using-latex/latex-ethesis-and-large-documents}{course notes}.
+Academic Workplace \href{https://uwaterloo.ca/information-systems-technology/services/electronic-thesis-preparation-and-submission-support/ethesis-guide/creating-pdf-version-your-thesis/creating-pdf-files-using-latex/latex-ethesis-and-large-documents}{course notes}.
 \footnote{
 Note that while it is possible to include hyperlinks to external documents,
 it is not wise to do so, since anything you can't control may change over time.
 It \emph{would} be appropriate and necessary to provide external links to
 additional resources for a multimedia ``enhanced'' thesis.
 But also note that if the \package{hyperref} package is not included,
 as for the print-optimized option in this thesis template, any \cmmd{href}
+it is not wise to do so, since anything you can't control may change over time.
+It \emph{would} be appropriate and necessary to provide external links to
+additional resources for a multimedia ``enhanced'' thesis.
+But also note that if the \package{hyperref} package is not included,
+as for the print-optimized option in this thesis template, any \cmmd{href}
 commands in your logical document are no longer defined.
 A work-around employed by this thesis template is to define a dummy
 \cmmd{href} command (which does nothing) in the preamble of the document,
 before the \package{hyperref} package is included.
+\cmmd{href} command (which does nothing) in the preamble of the document,
+before the \package{hyperref} package is included.
 The dummy definition is then redefined by the
 \package{hyperref} package when it is included.
 …
 The classic book by Leslie Lamport \cite{lamport.book}, author of \LaTeX , is
 worth a look too, and the many available add-on packages are described by
+worth a look too, and the many available add-on packages are described by
 Goossens \textit{et al} \cite{goossens.book}.
 …
 Export Setup button in the figure Property Editor.
 \section{From the Command Line}
+\section{From the Command Line}
 All figure properties can also be manipulated from the command line. Here's an
 example:
+example:
 \begin{verbatim}
 x=[0:0.1:pi];

doc/theses/andrew_beach_MMath/unwinding.tex

-              r342af53
+              r8e4aa05
 \chapter{Unwinding in \CFA}
+Stack unwinding is the process of removing things from the stack. Within
+functions and on function return this is handled directly by the code in the
+function itself as it knows exactly what is on the stack just from the
+current location in the function. Unwinding across stack frames means that it
+no longer knows exactly what is on the stack or even how much of the stack
+needs to be removed.
+Stack unwinding is the process of removing stack frames (activations) from the
+stack. On function entry and return, unwinding is handled directly by the code
+embedded in the function. Usually, the stack-frame size is known statically
+based on parameters and local variable declarations.  For dynamically-sized
+local variables, a runtime computation is necessary to know the frame
+size. Finally, a function's frame-size may change during execution as local
+variables (static or dynamic sized) go in and out of scope.
+Allocating/deallocating stack space is usually an $O(1)$ operation achieved by
+bumping the hardware stack-pointer up or down as needed.
+Even this is fairly simple if nothing needs to happen when the stack unwinds.
+Traditional C can unwind the stack by saving and restoring state (with
+\codeC{setjmp} \& \codeC{longjmp}). However many languages define actions that
+have to be taken when something is removed from the stack, such as running
+a variable's destructor or a \codeCFA{try} statement's \codeCFA{finally}
+clause. Handling this requires walking the stack going through each stack
 frame.
+Unwinding across multiple stack frames is more complex because individual stack
+management code associated with each frame is bypassed. That is, the location
+of a function's frame code is largely unknown and dispersed throughout the
+function, hence the current stack-frame size managed by that code is also
+unknown. Hence, code unwinding across frames does not have direct knowledge
+about what is on the stack, and hence, how much of the stack needs to be
+removed.
+For exceptions, this means everything from the point the exception is raised
+to the point it is caught, while checking each frame for handlers during the
+stack walk to find out where it should be caught. This is where the most of
+the expense and complexity of exception handling comes from.
+The traditional unwinding mechanism for C is implemented by saving a snap-shot
+of a function's state with @setjmp@ and restoring that snap-shot with
+@longjmp@. This approach bypasses the need to know stack details by simply
+resetting to a snap-shot of an arbitrary but existing function frame on the
+stack. It is up to the programmer to ensure the snap-shot is valid when it is
+reset, making the code fragile with potential errors that are difficult to
+debug because the stack becomes corrupted.
+To do all of this we use libunwind, a low level library that provides tools
+for stack walking and stack unwinding. What follows is an overview of all the
+relevant features of libunwind and then how \CFA uses them to implement its
+exception handling.
+However, many languages define cleanup actions that have to be taken when
+something is deallocated from the stack or blocks end, such as running a
+variable's destructor or a @try@ statement's @finally@ clause. Handling these
+mechanisms requires walking the stack and checking each stack frame for these
+potential actions.
+For exceptions, it must be possible to walk the stack frames in search of try
+statements with handlers to perform exception matching. For termination
+exceptions, it must be possible to unwind all stack frames from the throw to
+the matching catch, and each of these frames must be checked for cleanup
+actions. Stack walking is where most of the complexity and expense of
+exception handling comes from.
+One of the most popular tools for stack management is libunwind, a low level
+library that provides tools for stack walking and unwinding. What follows is an
+overview of all the relevant features of libunwind and how \CFA uses them to
+implement its exception handling.
 \section{libunwind Usage}
+\CFA uses two primary functions in libunwind to create most of its
+exceptional control-flow: \codeC{_Unwind_RaiseException} and
+\codeC{_Unwind_ForcedUnwind}.
+Their operation is divided into two phases: search and clean-up. The search
+phase -- phase 1 -- is used to scan the stack but not unwinding it. The
+clean-up phase -- phase 2 -- is used for unwinding.
+\CFA uses two primary functions in libunwind to create most of its exceptional
+control-flow: @_Unwind_RaiseException@ and @_Unwind_ForcedUnwind@.  Their
+operation is divided into two phases: search and clean-up. The search phase --
+phase 1 -- is used to scan the stack but not unwind it. The clean-up phase
+-- phase 2 -- is used for unwinding.
 The raise-exception function uses both phases. It starts by searching for a
 …
 A personality function performs three tasks, although not all have to be
 present. The tasks performed are decided by the actions provided.
+\codeC{_Unwind_Action} is a bitmask of possible actions and an argument of
 this type is passed into the personality function.
+@_Unwind_Action@ is a bitmask of possible actions and an argument of this type
+is passed into the personality function.
 \begin{itemize}
+\item\codeC{_UA_SEARCH_PHASE} is passed in search phase and tells the
+personality function to check for handlers. If there is a handler in this
+stack frame, as defined by the language, the personality function should
+return \codeC{_URC_HANDLER_FOUND}. Otherwise it should return
+\codeC{_URC_CONTINUE_UNWIND}.
+\item\codeC{_UA_CLEANUP_PHASE} is passed in during the clean-up phase and
+means part or all of the stack frame is removed. The personality function
+should do whatever clean-up the language defines
+(such as running destructors/finalizers) and then generally returns
+\codeC{_URC_CONTINUE_UNWIND}.
+\item\codeC{_UA_HANDLER_FRAME} means the personality function must install
+a handler. It is also passed in during the clean-up phase and is in addition
+to the clean-up action. libunwind provides several helpers for the personality
+function here. Once it is done, the personality function must return
+\codeC{_URC_INSTALL_CONTEXT}.
+\item
+\begin{sloppypar}
+@_UA_SEARCH_PHASE@ is passed in for the search phase and tells the personality
+function to check for handlers. If there is a handler in a stack frame, as
+defined by the language, the personality function returns @_URC_HANDLER_FOUND@;
+otherwise it returns @_URC_CONTINUE_UNWIND@.
+\end{sloppypar}
+\item
+@_UA_CLEANUP_PHASE@ is passed in during the clean-up phase and means part or
+all of the stack frame is removed. The personality function does whatever
+clean-up the language defines (such as running destructors/finalizers) and then
+generally returns @_URC_CONTINUE_UNWIND@.
+\item
+@_UA_HANDLER_FRAME@ means the personality function must install a handler. It
+is also passed in during the clean-up phase and is in addition to the clean-up
+action. libunwind provides several helpers for the personality function. Once
+it is done, the personality function returns @_URC_INSTALL_CONTEXT@.
 \end{itemize}
 The personality function is given a number of other arguments. Some are for
 compatibility and there is the \codeC{struct _Unwind_Context} pointer which
 is passed to many helpers to get information about the current stack frame.
+The personality function is given a number of other arguments. Some arguments
+are for compatibility, and there is the @struct _Unwind_Context@ pointer which
+is passed to many helpers to get information about the current stack frame.
+Forced-unwind only performs the clean-up phase. It takes three arguments:
+a pointer to the exception, a pointer to the stop function and a pointer to
+the stop parameter. It does most of the same things as phase two of
+raise-exception but with some extras.
+The first it passes in an extra action to the personality function on each
+stack frame, \codeC{_UA_FORCE_UNWIND}, which means a handler cannot be
+For cancellation, forced-unwind only performs the clean-up phase. It takes
+three arguments: a pointer to the exception, a pointer to the stop function and
+a pointer to the stop parameter. It does most of the same actions as phase two
+of raise-exception but passes in an extra action to the personality function on
+each stack frame, @_UA_FORCE_UNWIND@, which means a handler cannot be
 installed.
+The big change is that forced-unwind calls the stop function. Each time it
 steps into a frame, before calling the personality function, it calls the
 stop function. The stop function receives all the same arguments as the
 personality function will and the stop parameter supplied to forced-unwind.
+As well, forced-unwind calls the stop function each time it steps into a frame,
+before calling the personality function. The stop function receives all the
+same arguments as the personality function and the stop parameter supplied to
+forced-unwind.
 The stop function is called one more time at the end of the stack after all
 stack frames have been removed. By the standard API this is marked by setting
+stack frames have been removed. The standard API marks this frame by setting
 the stack pointer inside the context passed to the stop function. However both
 GCC and Clang add an extra action for this case \codeC{_UA_END_OF_STACK}.
+GCC and Clang add an extra action for this case @_UA_END_OF_STACK@.
+Each time the stop function is called it can do one of two things.
+When it is not the end of the stack it can return \codeC{_URC_NO_REASON} to
+continue unwinding.
+Each time the stop function is called, it can do one of two things.  When it is
+not the end of the stack it can return @_URC_NO_REASON@ to continue unwinding.
 % Is there a reason that NO_REASON is used instead of CONTINUE_UNWIND?
+Its only other option is to use its own means to transfer control elsewhere
+and never return to its caller. It may always do this and no additional tools
 are provided to do it.
+The other option is to use some other means to transfer control elsewhere and
+never return to its caller. libunwind provides no additional tools for
+alternate transfers of control.
 \section{\CFA Implementation}
 To use libunwind, \CFA provides several wrappers, its own storage,
 personality functions, and a stop function.
+To use libunwind, \CFA provides several wrappers, its own storage, personality
+functions, and a stop function.
 The wrappers perform three tasks: set-up, clean-up and controlling the
 …
 The core control code is called every time a throw -- after set-up -- or
 re-throw is run. It uses raise-exception to search for a handler and to run it
 if one is found. If no handler is found and raise-exception returns then
+if one is found. If no handler is found and raise-exception returns, then
 forced-unwind is called to run all destructors on the stack before terminating
 the process.
 The stop function is very simple. It checks the end of stack flag to see if
+it is finished unwinding. If so, it calls \codeC{exit} to end the process,
 otherwise it returns with no-reason to continue unwinding.
+The stop function is simple. It checks for the end of stack flag to see if
+unwinding is finished. If so, it calls @exit@ to end the process, otherwise it
+returns with no-reason to continue unwinding.
 % Yeah, this is going to have to change.
 The personality routine is more complex because it has to obtain information
 about the function by scanning the LSDA (Language Specific Data Area). This
+about the function by scanning the Language Specific Data Area (LSDA). This
 step allows a single personality function to be used for multiple functions and
 let that personality function figure out exactly where in the function
 execution was, what is currently in the stack frame and what handlers should
 be checked.
+lets that personality function figure out exactly where in the function
+execution is, what is currently in the stack frame, and what handlers should be
+checked.
 % Not that we do that yet.
+However, generating the LSDA is difficult. It requires knowledge about the
 location of the instruction pointer and stack layout, which varies with
+compiler and optimization levels. So for frames where there are only
+destructors, GCC's attribute cleanup with the \texttt{-fexceptions} flag is
 sufficient to handle unwinding.
+It is also necessary to generate the LSDA, which is difficult. It requires
+knowledge about the location of the instruction pointer and stack layout, which
+varies with compiler and optimization levels. Fortunately, for frames where
+there are only destructors, GCC's attribute cleanup with the @-fexceptions@ flag
+is sufficient to handle unwinding.
 The only functions that require more than that are those that contain
+\codeCFA{try} statements. A \codeCFA{try} statement has a \codeCFA{try}
+clause, some number of \codeCFA{catch} clauses and \codeCFA{catchResume}
 clauses and may have a \codeCFA{finally} clause. Of these only \codeCFA{try}
+statements with \codeCFA{catch} clauses need to be transformed and only they
 and the \codeCFA{try} clause are involved.
+The only functions that require more information are those containing @try@
+statements. Specifically, only @try@ statements with @catch@ clauses need to be
+transformed.  The @try@ statement is converted into a series of closures that
+can access other parts of the function according to scoping rules but can be
+passed around. The @catch@ clauses are converted into two functions: the match
+function and the handler function.
+The \codeCFA{try} statement is converted into a series of closures which can
+access other parts of the function according to scoping rules but can be
+passed around. The \codeCFA{try} clause is converted into the try functions,
+almost entirely unchanged. The \codeCFA{catch} clauses are converted into two
+functions; the match function and the catch function.
+Together the match function and the catch function form the code that runs when
+an exception passes out of the guarded block for a try statement. The match
+function is used during the search phase: it is passed an exception and checks
+each handler to see if the raised exception matches the handler exception. It
+returns an index that represents which handler matched or there is no
+match. The catch function is used during the clean-up phase, it is passed an
+exception and the index of a handler. It casts the exception to the exception
+type declared in that handler and then runs the handler's body.
+Together the match function and the catch function form the code that runs
+when an exception passes out of a try block. The match function is used during
+the search phase, it is passed an exception and checks each handler to see if
+it will handle the exception. It returns an index that represents which
+handler matched or that none of them did. The catch function is used during
+the clean-up phase, it is passed an exception and the index of a handler. It
+casts the exception to the exception type declared in that handler and then
+runs the handler's body.
+These three functions are passed to \codeC{try_terminate}. This is an
+These three functions are passed to @try_terminate@, which is an
 % Maybe I shouldn't quote that, it isn't its actual name.
 internal hand-written function that has its own personality function and
 custom assembly LSDA that does the exception handling in \CFA. During normal
 execution all this function does is call the try function and then return.
 It is only when exceptions are thrown that anything interesting happens.
+internal hand-written function that has its own personality function and custom
+assembly LSDA for doing the exception handling in \CFA. During normal
+execution, this function calls the try function and then returns.  It is only
+when exceptions are thrown that anything interesting happens.
 During the search phase the personality function gets the pointer to the match
 function and calls it. If the match function returns a handler index the
+function and calls it. If the match function returns a handler index, the
 personality function saves it and reports that the handler has been found,
 otherwise unwinding continues.
+During the clean-up phase the personality function only does anything if the
+handler was found in this frame. If it was then the personality function
 installs the handler, which is setting the instruction pointer in
+\codeC{try_terminate} to an otherwise unused section that calls the catch
+function, passing it the current exception and handler index.
 \codeC{try_terminate} returns as soon as the catch function returns.
+otherwise unwinding continues.  During the clean-up phase, the personality
+function only performs an action, when a handler is found in a frame. For each
+found frame, the personality function installs the handler, which sets the
+instruction pointer in @try_terminate@ to an otherwise unused section that
+calls the catch function, passing it the current exception and handler index.
+@try_terminate@ returns as soon as the catch function returns.  At this point
+control has returned to normal control flow.
+At this point control has returned to normal control flow.
+\PAB{Maybe a diagram would be helpful?}

doc/theses/fangren_yu_COOP_F20/Report.tex

-              r342af53
+              r8e4aa05
 \usepackage[usenames]{color}
 \input{common}                                          % common CFA document macros
 \usepackage[dvips,plainpages=false,pdfpagelabels,pdfpagemode=UseNone,colorlinks=true,pagebackref=true,linkcolor=blue,citecolor=blue,urlcolor=blue,pagebackref=true,breaklinks=true]{hyperref}
+\usepackage[dvips,plainpages=false,pdfpagelabels,pdfpagemode=UseNone,colorlinks=true,linkcolor=blue,citecolor=blue,urlcolor=blue,pagebackref=true,breaklinks=true]{hyperref}
 \usepackage{breakurl}
 \urlstyle{sf}
 …
 \renewcommand{\subsectionmark}[1]{\markboth{\thesubsection\quad #1}{\thesubsection\quad #1}}
 \pagenumbering{roman}
 \linenumbers                                            % comment out to turn off line numbering
+%\linenumbers                                            % comment out to turn off line numbering
 \maketitle
 \pdfbookmark[1]{Contents}{section}
+\tableofcontents
+\clearpage
 \thispagestyle{plain}
 \pagenumbering{arabic}
 \begin{abstract}
+\CFA is an evolutionary, non-object-oriented extension of the C programming language, featuring a parametric type-system, and is currently under active development. The reference compiler for the \CFA language, @cfa-cc@, has some of its major components dated back to the early 2000s, which are based on inefficient data structures and algorithms. This report introduces improvements targeting the expression resolution algorithm, suggested by a recent prototype experiment on a simplified model, which are implemented in @cfa-cc@ to support the full \CFA language. These optimizations speed up the compiler by a factor of 20 across the existing \CFA codebase, bringing the compilation time of a mid-sized \CFA source file down to the 10-second level. A few problem cases derived from realistic code examples are analyzed in detail, with proposed solutions. This work is a critical step in the \CFA project development to achieve its eventual goal of being used alongside C for large software systems.
 \end{abstract}
+\clearpage
+\section*{Acknowledgements}
+\begin{sloppypar}
+I would like to thank everyone in the \CFA team for their contribution towards this project. Programming language design and development is a tough subject and requires a lot of teamwork. Without the collaborative efforts from the team, this project could not have been a success. Specifically, I would like to thank Andrew Beach for introducing me to the \CFA codebase, Thierry Delisle for maintaining the test and build automation framework, Michael Brooks for providing example programs of various experimental language and type system features, and most importantly, Professor Martin Karsten for recommending me to the \CFA team, and my supervisor, Professor Peter Buhr for encouraging me to explore deeply into intricate compiler algorithms. Finally, I gratefully acknowledge the help from Aaron Moss, former graduate from the team and the author of the precedent thesis work, to participate in the \CFA team's virtual conferences and email correspondence, and provide many critical arguments and suggestions. 2020 had been an unusually challenging year for everyone and we managed to keep a steady pace.
+\end{sloppypar}
+\clearpage
+\tableofcontents
+\clearpage
 \section{Introduction}
+\section{Completed work}
+\CFA language, developed by the Programming Language Group at the University of Waterloo, has a long history, with the initial language design in 1992 by Glen Ditchfield~\cite{Ditchfield92} and the first proof-of-concept compiler built in 2003 by Richard Bilson~\cite{Bilson03}. Many new features have been added to the language over time, but the core of \CFA's type-system --- parametric functions introduced by the @forall@ clause (hence the name of the language) providing parametric overloading --- remains mostly unchanged.
+The current \CFA reference compiler, @cfa-cc@, is designed using the visitor pattern~\cite{vistorpattern} over an abstract syntax tree (AST), where multiple passes over the AST modify it for subsequent passes. @cfa-cc@ still includes many parts taken directly from the original Bilson implementation, which served as the starting point for this enhancement work to the type system. Unfortunately, the prior implementation did not provide the efficiency required for the language to be practical: a \CFA source file of approximately 1000 lines of code can take multiple minutes to compile. The cause of the problem is that the old compiler used inefficient data structures and algorithms for expression resolution, which involved significant copying and redundant work.
+This report presents a series of optimizations to the performance-critical parts of the resolver, with a major rework of the compiler data-structures using a functional-programming approach to reduce memory complexity. The improvements were suggested by running the compiler builds with a performance profiler against the \CFA standard-library source-code and a test suite to find the most underperforming components in the compiler algorithm.
+The \CFA team endorses a pragmatic philosophy that focuses on practical implications of language design and implementation rather than theoretical limits. In particular, the compiler is designed to be expressive with respect to code reuse while maintaining type safety, but compromises theoretical soundness in extreme corner cases. However, when these corner cases do appear in actual usage, they need to be thoroughly investigated. A case-by-case analysis is presented for several of these corner cases, some of which point to certain weaknesses in the language design with solutions proposed based on experimental results.
+\section{AST restructuring}
 \subsection{Memory model with sharing}
+A major rework of the abstract syntax tree (AST) data structure in the compiler is completed as the first step of the project. The majority of work were documented in the reference manual of the compiler~\cite{cfa-cc}. To summarize:
+\begin{itemize}
+\item
+AST nodes (and therefore subtrees) can be shared without copying when reused.
+\item
+Modifications apply the functional programming principle, making copies for local changes without affecting the original data shared by other owners. In-place mutations are permitted as a special case when sharing does not happen. The logic is implemented by reference counting.
+\item
+Memory allocation and freeing are performed automatically using smart pointers.
+\end{itemize}
+The resolver algorithm designed for overload resolution naturally introduces a significant amount of reused intermediate representations, especially in the following two places:
+\begin{itemize}
+\item
+Function overload candidates are computed by combining the argument candidates bottom-up, with many of them being a common term. For example, if $n$ overloads of a function @f@ all take an integer for the first parameter but different types for the second (@f( int, int )@, @f( int, double )@, etc.) the first term is reused $n$ times for each of the generated candidate expressions. This effect is particularly bad for deep expression trees.
+\item
+In the unification algorithm and candidate elimination step, actual types are obtained by substituting the type parameters by their bindings. Let $n$ be the complexity (\ie number of nodes in representation) of the original type, $m$ be the complexity of bound type for parameters, and $k$ be the number of occurrences of type parameters in the original type. If everything needs to be deep-copied, the substitution step takes $O(n+mk)$ time and memory, while using shared nodes it is reduced to $O(n)$ time and $O(k)$ memory.
+\end{itemize}
+One of the worst examples for the old compiler is a long chain of I/O operations
+\begin{cfa}
+sout | 1 | 2 | 3 | 4 | ...
+\end{cfa}
+The pipe operator is overloaded by \CFA I/O library for every primitive type in C language, as well as I/O manipulators defined by the library. In total there are around 50 overloads for the output stream operation. On resolving the $n$-th pipe operator in the sequence, the first term, which is the result of sub-expression containing $n-1$ pipe operators, is reused to resolve every overload. Therefore at least $O(n^2)$ copies of expression nodes are made during resolution, not even counting type unification cost; combined with two large factors from number of overloads of pipe operators, and that the ``output stream type'' in \CFA is a trait with 27 assertions (which adds to complexity of the pipe operator's type) this makes compiling a long output sequence extremely slow. In new AST representation only $O(n)$ copies are required and type of pipe operator is not copied at all.
+Reduction in space complexity is especially important, as preliminary profiling result on the old compiler build shows that over half of time spent in expression resolution are on memory allocations.
+A major rework of the AST data-structure in the compiler was completed as the first step of the project. The majority of this work is documented in my prior report documenting the compiler reference-manual~\cite{cfa-cc}. To summarize:
+\begin{itemize}
+\item
+AST nodes (and therefore subtrees) can be shared without copying.
+\item
+Modifications are performed using functional-programming principles, making copies for local changes without affecting the original data shared by other owners. In-place mutations are permitted as a special case when there is no sharing. The logic is implemented by reference counting.
+\item
+Memory allocation and freeing are performed automatically using smart pointers~\cite{smartpointers}.
+\end{itemize}
+The resolver algorithm, designed for overload resolution, allows a significant amount of code reuse, and hence copying, for the intermediate representations, especially in the following two places:
+\begin{itemize}
+\item
+Function overload candidates are computed by combining the argument candidates bottom-up, with many being a common term. For example, if $n$ overloads of a function @f@ all take an integer for the first parameter but different types for the second, \eg @f( int, int )@, @f( int, double )@, etc., the first term is copied $n$ times for each of the generated candidate expressions. This copying is particularly bad for deep expression trees.
+\item
+In the unification algorithm and candidate elimination step, actual types are obtained by substituting the type parameters by their bindings. Let $n$ be the complexity (\ie number of nodes in representation) of the original type, $m$ be the complexity of the bound type for parameters, and $k$ be the number of occurrences of type parameters in the original type. If every substitution needs to be deep-copied, this copy step takes $O(n+mk)$ time and memory, while using shared nodes it is reduced to $O(n)$ time and $O(k)$ memory.
+\end{itemize}
+One of the worst examples for the old compiler is a long chain of I/O operations:
+\begin{cfa}
+sout | 1 | 2 | 3 | 4 | ...;   // print integer constants
+\end{cfa}
+The pipe operator is overloaded by the \CFA I/O library for every primitive type in the C language, as well as I/O manipulators defined by the library. In total, there are around 50 overloads for the output stream operation. On resolving the $n$-th pipe operator in the sequence, the first term, which is the result of sub-expression containing $n-1$ pipe operators, is reused to resolve every overload. Therefore at least $O(n^2)$ copies of expression nodes are made during resolution, not even counting type unification cost; combined with the two large factors from number of overloads of pipe operators, and that the ``output stream type'' in \CFA is a trait with 27 assertions (which adds to complexity of the pipe operator's type) this makes compiling a long output sequence extremely slow. In the new AST representation, only $O(n)$ copies are required and the type of the pipe operator is not copied at all.
+Reduction in space complexity is especially important, as preliminary profiling results on the old compiler build showed over half of the time spent in expression resolution is on memory allocations.
+Since the compiler codebase is large and the new memory model mostly benefits expression resolution, some of the old data structures are still kept, and a conversion pass happens before and after the general resolve phase. Rewriting every compiler module will take longer, and whether the new model is correct was unknown when this project started, therefore only the resolver is currently implemented with the new data structure.
 \subsection{Merged resolver calls}
 The pre-resolve phase of compilation, inadequately called ``validate'' in the compiler source code, does more than just simple syntax validation, as it also normalizes input program. Some of them, however, requires type information on expressions and therefore needs to call the resolver before the general resolve phase. There are three notable places where the resolver is invoked:
 \begin{itemize}
 \item
+Attempt to generate default constructor, copy constructor and destructor for user-defined @struct@ types
 \item
 Resolve @with@ statements (the same as in Python, which introduces fields of a structure directly in scope)
+The pre-resolve phase of compilation, inappropriately called ``validate'' in the compiler source code, has a number of passes that do more than simple syntax and semantic validation; some passes also normalize the input program. A few of these passes require type information for expressions, and therefore, need to call the resolver before the general resolve phase. There are three notable places where the resolver is invoked:
+\begin{itemize}
+\item
+Generate default constructor, copy constructor and destructor for user-defined @struct@ types.
+\item
+Resolve @with@ statements (the same as in Pascal~\cite{pascal}), which introduces fields of a structure directly into a scope.
 \item
 Resolve @typeof@ expressions (cf. @decltype@ in \CC); note that this step may depend on symbols introduced by @with@ statements.
 \end{itemize}
+Since the compiler codebase is large and the new memory model mostly only benefits expression resolution, the old data structure is still kept, and a conversion pass happens before and after resolve phase. Rewriting every compiler module will take a long time, and whether the new model is correct is still unknown when started, therefore only the resolver is implemented with the new data structure.
+Since the constructor calls were one of the most expensive to resolve (reason will be shown in the next section), pre-resolve phase were taking more time after resolver moves to the more efficient new implementation. To better facilitate the new resolver, every step that requires type information are reintegrated as part of resolver.
+A by-product of this work is that the reversed dependence of @with@ statement and @typeof@ can now be handled. Previously, the compiler is unable to handle cases such as
+Since the constructor calls are one of the most expensive to resolve (reason given in~\VRef{s:SpecialFunctionLookup}), this pre-resolve phase was taking a large amount of time even after the resolver was changed to the more efficient new implementation. The problem is that multiple resolutions repeat a significant amount of work. Therefore, to better facilitate the new resolver, every step that requires type information should be integrated as part of the general resolver phase.
+A by-product of this work is that reversed dependence between @with@ statement and @typeof@ can now be handled. Previously, the compiler was unable to handle cases such as:
 \begin{cfa}
 struct S { int x; };
 S foo();
 typeof( foo() ) s; // type is S
 with (s) {
+with (s) {
         x; // refers to s.x
+}
 \end{cfa}
 since type of @s@ is still unresolved when handling @with@ expressions. Instead, the new (and correct) approach is to evaluate @typeof@ expressions when the declaration is first seen, and it suffices because of the declaration-before-use rule.
+since the type of @s@ is unresolved when handling @with@ expressions because the @with@ pass follows the @typeof@ pass (interchanging passes only interchanges the problem). Instead, the new (and correct) approach is to evaluate @typeof@ expressions when the declaration is first seen during resolution, and it suffices because of the declaration-before-use rule.
 \subsection{Special function lookup}
+Reducing the number of functions looked up for overload resolution is an effective way to gain performance when there are many overloads but most of them are trivially wrong. In practice, most functions have few (if any) overloads but there are notable exceptions. Most importantly, constructor @?{}@, destructor @^?{}@, and assignment @?=?@ are generated for every user-defined type, and in a large source file there can be hundreds of them. Furthermore, many calls to them are generated for initializing variables and passing arguments. This fact makes them the most overloaded and most called functions.
+In an object-oriented programming language, object has methods declared with their types, so a call such as @obj.f()@ only needs to perform lookup in the method table corresponding to type of @obj@. \CFA on the other hand, does not have methods, and all types are open (\ie new operations can be defined on them), so a similar approach will not work in general. However, the ``big 3'' operators have a unique property enforced by the language rules, such that the first parameter must have a reference type. Since \CFA does not have class inheritance, reference type must always match exactly. Therefore, argument-dependent lookup can be implemented for these operators, by using a dedicated symbol table.
+The lookup key used for the special functions is the mangled type name of the first parameter, which acts as the @this@ parameter in an object-oriented language. To handle generic types, the type parameters are stripped off, and only the base type is matched. Note that a constructor (destructor, assignment operator) taking arbitrary @this@ argument, for example @forall( dtype T ) void ?{}( T & );@ is not allowed, and it guarantees that if the @this@ type is known, all possible overloads can be found by searching with the given type. In case that the @this@ argument itself is overloaded, it is resolved first and all possible result types are used for lookup.
+Note that for the generated expressions, the particular variable for @this@ argument is fully known, without overloads, so the majority of constructor call resolutions only need to check for one given object type. Explicit constructor calls and assignment statements sometimes may require lookup for multiple types. In the extremely rare case that type of @this@ argument is yet unbound, everything will have to be checked, just like without the argument-dependent lookup algorithm; fortunately, this case almost never happens in practice. An example is found in the library function @new@:
+\label{s:SpecialFunctionLookup}
+Reducing the number of functions looked up for overload resolution is an effective way to gain performance when there are many overloads but most of them are trivially wrong. In practice, most functions have few (if any) overloads but there are notable exceptions. Most importantly, constructor @?{}@, destructor @^?{}@, and assignment @?=?@ are generated for every user-defined type (@struct@ and @union@ in C), and in a large source file there can be hundreds of them. Furthermore, many calls are generated for initializing variables, passing arguments and copying values. This fact makes them the most overloaded and most called functions.
+In an object-oriented programming language, the object-method types are scoped within a class, so a call such as @obj.f()@ only needs to perform lookup in the method table corresponding to the type of @obj@. \CFA on the other hand, does not have methods, and all types are open, \ie new operations can be defined on them without inheritance; at best a \CFA type can be constrained by a translation unit. However, the ``big 3'' operators have a unique property enforced by the language rules: the first parameter must be a reference to its associated type, which acts as the @this@ parameter in an object-oriented language. Since \CFA does not have class inheritance, the reference type must always match exactly. Therefore, argument-dependent lookup can be implemented for these operators by using a dedicated, fast symbol-table.
+The lookup key for the special functions is the mangled type name of the first parameter. To handle generic types, the type parameters are stripped off, and only the base type is matched. Note a constructor (destructor, assignment operator) may not take an arbitrary @this@ argument, \eg @forall( dtype T ) void ?{}( T & )@, thus guaranteeing that if the @this@ type is known, all possible overloads can be found by searching with this given type. In the case where the @this@ argument itself is overloaded, it is resolved first and all possible result types are used for lookup.
+Note that for a generated expression, the particular variable for the @this@ argument is fully known, without overloads, so the majority of constructor-call resolutions only need to check for one given object type. Explicit constructor calls and assignment statements sometimes require lookup for multiple types. In the extremely rare case that the @this@-argument type is unbound, all necessary types are guaranteed to be checked, as for the previous lookup without the argument-dependent lookup; fortunately, this complex case almost never happens in practice. An example is found in the library function @new@:
 \begin{cfa}
 forall( dtype T | sized( T ), ttype TT | { void ?{}( T &, TT ); } )
 T * new( TT p ) { return &(*malloc()){ p }; }
 \end{cfa}
 as @malloc@ may return a pointer to any type, depending on context.
 Interestingly, this particular line of code actually caused another complicated issue, where the unusually massive work of checking every constructor in presence makes the case even worse. Section~\ref{s:TtypeResolutionInfiniteRecursion} presents a detailed analysis for the problem.
 The ``callable'' operator @?()@ (cf. @operator()@ in \CC) could also be included in the special operator list, as it is usually only on user-defined types, and the restriction that first argument must be a reference seems reasonable in this case.
+as @malloc@ may return a pointer to any type, depending on context.
+Interestingly, this particular declaration actually causes another complicated issue, making the complex checking of every constructor even worse. \VRef[Section]{s:TtypeResolutionInfiniteRecursion} presents a detailed analysis of this problem.
+The ``callable'' operator @?()@ (cf. @operator()@ in \CC) can also be included in this special operator list, as it is usually only on user-defined types, and the restriction that the first argument must be a reference seems reasonable in this case.
 \subsection{Improvement of function type representation}
+Since substituting type parameters with their bound types is one fundamental operation in many parts of resolver algorithm (particularly unification and environment binding), making as few copies of type nodes as possible helps reducing memory complexity. Even with the new memory management model, allocation is still a significant factor of resolver performance. Conceptually, operations on type nodes of AST should be performed in functional programming style, treating the data structure as immutable and only copy when necessary. The in-place mutation is a mere optimization that does not change logic of operations.
+The model was broken on function types by an inappropriate design. Function types require some special treatment due to the existence of assertions. In particular, it must be able to distinguish two different kinds of type parameter usage:
+Since substituting type parameters with their bound types is a fundamental operation in many parts of the resolver algorithm (particularly unification and environment binding), making as few copies of type nodes as possible helps reduce memory complexity. Even with the new memory management model, allocation is still a significant factor in resolver performance. Conceptually, operations on type nodes of the AST should be performed in functional-programming style, treating the data structure as immutable and only copying when necessary. The in-place mutation is a mere optimization that does not change the logic for operations.
+However, the model was broken for function types by an inappropriate design. Function types require special treatment due to the existence of assertions that constrain the types it supports. Specifically, it must be possible to distinguish two different kinds of type parameter usage:
 \begin{cfa}
 forall( dtype T ) void foo( T * t ) {
         forall( dtype U ) void bar( T * t, U * u ) { ... }
+}
 \end{cfa}
 Here, only @U@ is a free parameter in declaration of @bar@, as it appears in the function's own forall clause; while @T@ is not free.
 Moreover, the resolution algorithm also has to distinguish type bindings of multiple calls to the same function, for example with
+        forall( dtype U ) void bar( @T@ * t, @U@ * u ) { ... }
+}
+\end{cfa}
+Here, only @U@ is a free parameter in the nested declaration of function @bar@, as @T@ must be bound at the call site when resolving @bar@.
+Moreover, the resolution algorithm also has to distinguish type bindings of multiple calls to the same function, \eg:
 \begin{cfa}
 forall( dtype T ) int foo( T x );
 foo( foo( 1.0 ) );
 \end{cfa}
 The inner call has binding (T: double) while the outer call has binding (T: int). Therefore a unique representation of free parameters in each expression is required. This was previously done by creating a copy of the parameter declarations inside function type, and fixing references afterwards. However, fixing references is an inherently deep operation that does not work well with functional programming model, as it must be evaluated eagerly on the entire syntax tree representing the function type.
 The revised approach generates a unique ID value for each function call expression instance and represents an occurrence of free parameter type with a pair of generated ID and the original parameter declaration, so that references do not need to be fixed, and a shallow copy of function type is possible.
 Note that after the change, all declaration nodes in syntax tree representation maps one-to-one with the actual declarations in the program, and therefore are guaranteed to be unique. Such property can potentially enable more optimizations, and some related ideas are presented after Section~\ref{s:SharedSub-ExpressionCaseUniqueExpressions}.
+int i = foo( foo( 1.0 ) );
+\end{cfa}
+The inner call has binding (T: double) while the outer call has binding (T: int). Therefore a unique representation for the free parameters is required in each expression. This type binding was previously done by creating a copy of the parameter declarations inside the function type and fixing references afterwards. However, fixing references is an inherently deep operation that does not work well with the functional-programming style, as it forces eager evaluation on the entire syntax tree representing the function type.
+The revised approach generates a unique ID value for each function call expression instance and represents an occurrence of a free-parameter type with a pair of generated ID and original parameter declaration, so references are unique and a shallow copy of the function type is possible.
+Note that after the change, all declaration nodes in the syntax-tree representation now map one-to-one with the actual declarations in the program, and therefore are guaranteed to be unique. This property can potentially enable more optimizations, and some related ideas are presented at the end of \VRef{s:SharedSub-ExpressionCaseUniqueExpressions}.
 \subsection{Improvement of pruning steps}
 A minor improvement for candidate elimination is to skip the step on the function overloads themselves and only perform on results of function application. As function calls are usually by name, the name resolution rule dictates that every function candidate necessarily has a different type; indirect function calls are rare, and when they do appear, they usually will not have many possible interpretations, and those rarely matches exactly in argument type. Since function types have a much more complex representation than data types (with multiple parameters and assertions), checking equality on them also takes longer.
 A brief test of this approach shows that the number of function overloads considered in expression resolution increases by a negligible amount of less than 1 percent, while type comparisons in candidate elimination are cut by more than half. Improvement is consistent over all \CFA source files in the test suite.
+A minor improvement for candidate elimination is to skip the step on the function overloads and only check the results of function application. As function calls are usually by name (versus pointers to functions), the name resolution rule dictates that every function candidate necessarily has a different type; indirect function calls are rare, and when they do appear, there are even fewer cases with multiple interpretations, and these rarely match exactly in argument type. Since function types have a much more complex representation (with multiple parameters and assertions) than data types, checking equality on them also takes longer.
+A brief test of this approach shows that the number of function overloads considered in expression resolution increases by an amount of less than 1 percent, while type comparisons in candidate elimination are reduced by more than half. This improvement is consistent over all \CFA source files in the test suite.
 …
 \label{s:SharedSub-ExpressionCaseUniqueExpressions}
 Unique expression denotes an expression that must be evaluated only once, to prevent unwanted side effects. It is currently only a compiler artifact, generated on tuple member expression of the form
+A unique expression denotes an expression evaluated only once to prevent unwanted side effects. It is currently only a compiler artifact, generated for a tuple-member expression of the form:
 \begin{cfa}
 struct S { int a; int b; };
 …
 s.[a, b]; // tuple member expression, type is [int, int]
 \end{cfa}
 If the aggregate expression contains function calls, it cannot be evaluated multiple times:
+If the aggregate expression is a function call, it cannot be evaluated multiple times:
 \begin{cfa}
 S makeS();
 makeS().[a, b]; // this should only make one S
+makeS().[a, b]; // this should only generate a unique S
 \end{cfa}
 Before code generation, the above expression is internally represented as
 …
 \end{cfa}
 at code generation, where @_unique_var@ and @_unique_var_evaluated@ are generated variables whose scope covers all appearances of the same expression.
+Note that although the unique expression is only used for tuple expansion now, it is a generally useful construction, and can be seen in other languages, such as Scala's @lazy val@~\cite{Scala}; therefore it could be worthwhile to introduce the unique expression to a broader context in \CFA and even make it directly available to programmers.
+In the compiler's visitor pattern, however, this creates a problem where multiple paths to a logically unique expression exist, so it may be modified more than once and become ill-formed; some specific intervention is required to ensure that unique expressions are only visited once. Furthermore, a unique expression appearing in more than one places will be copied on mutation so its representation is no longer unique. Some hacks are required to keep it in sync, and the methods are different when mutating the unique expression instance itself or its underlying expression.
+Example when mutating the underlying expression (visit-once guard)
+The conditional check ensures a single call to @makeS()@ even though there are logically multiple calls because of the tuple field expansion.
+Note that although the unique expression is only used for tuple expansion now, it is a generally useful construction, and is seen in other programming languages, such as Scala's @lazy val@~\cite{Scala}; therefore it may be worthwhile to introduce the unique expression to a broader context in \CFA and even make it directly available to programmers.
+In the compiler's visitor pattern, however, this creates a problem where multiple paths to a logically unique expression exist, so it may be modified more than once and become ill-formed; some specific intervention is required to ensure unique expressions are only visited once. Furthermore, a unique expression appearing in more than one place is copied on mutation so its representation is no longer unique.
+Currently, special cases are required to keep everything synchronized, and the methods are different when mutating the unique expression instance itself or its underlying expression:
+\begin{itemize}
+\item
+When mutating the underlying expression (visit-once guard)
 \begin{cfa}
 void InsertImplicitCalls::previsit( const ast::UniqueExpr * unqExpr ) {
         if ( visitedIds.count( unqExpr->id ) ) visit_children = false;
+        @if ( visitedIds.count( unqExpr->id ) ) visit_children = false;@
         else visitedIds.insert( unqExpr->id );
+}
 \end{cfa}
+Example when mutating the unique instance itself, which actually creates copies
+\item
+When mutating the unique instance itself, which actually creates copies
 \begin{cfa}
 auto mutExpr = mutate( unqExpr ); // internally calls copy when shared
+if ( ! unqMap.count( unqExpr->id ) ) {
+@if ( ! unqMap.count( unqExpr->id ) ) {@
         ...
 } else {
 …
+}
 \end{cfa}
+Such workaround seems difficult to be fit into a common visitor template. This suggests the memory model may need different kinds of nodes to accurately represent the syntax tree.
+Together with the fact that declaration nodes are always unique, it is possible that AST nodes can be classified by three different types:
+\begin{itemize}
+\item
+\textbf{Strictly unique} with only one owner (declarations);
+\item
+\textbf{Logically unique} with (possibly) many owners but should not be copied (unique expression example presented here);
+\item
+\textbf{Shared} by functional programming model, which assume immutable data structure and are copied on mutation.
+\end{itemize}
+Such workarounds are difficult to fit into the common visitor pattern, which suggests the memory model may need different kinds of nodes to accurately represent this feature in the AST.
+Given that declaration nodes are unique, it is possible for AST nodes to be divided into three different types:
+\begin{itemize}
+\item
+\textbf{Singleton} with only one owner (declarations);
+\item
+\textbf{No-copy} with multiple owners but cannot be copied (unique expression example presented here);
+\item
+\textbf{Copy} by functional-programming style, which assumes immutable data structures that are copied on mutation.
 \end{itemize}
 The boilerplate code can potentially handle these three cases differently.
 …
 \section{Analysis of resolver algorithm complexity}
 The focus of this chapter is to identify and analyze some realistic cases that cause resolver algorithm to have an exponential run time. As previous work has shown [3], the overload resolution problem in \CFA has worst-case exponential complexity; however, only few specific patterns can trigger the exponential complexity in practice. Implementing heuristic-based optimization for those selected cases is helpful to alleviate the problem.
+The focus of this section is to identify and analyze some realistic cases that cause the resolver algorithm to have an exponential runtime. As previous work has shown~\cite[\S~4.2.1]{Moss19}, the overload resolution problem in \CFA has worst-case exponential complexity; however, only a few specific patterns can trigger the exponential complexity in practice. Implementing heuristic-based optimization for those selected cases is helpful to alleviate the problem.
 …
 \label{s:UnboundReturnType}
 The interaction of return type overloading and polymorphic functions creates this problem of function calls with unbound return type, and is further complicated by the presence of assertions.
+The interaction of return-type overloading and polymorphic functions creates function calls with an unbound return-type, and is further complicated by the presence of assertions.
 The prime example of a function with unbound return type is the type-safe version of C @malloc@:
 \begin{cfa}
+// size deduced from type, so no need to provide the size argument
+forall( dtype T | sized( T ) ) T * malloc( void );
+\end{cfa}
+Unbound return type can be problematic in resolver algorithm complexity because a single match of function call with unbound return type may create multiple candidates. In the worst case, consider a function declared to return any @otype@:
+forall( dtype T | sized( T ) )
+T * malloc( void ) { return (T *)malloc( sizeof(T) ); } // call C malloc
+int * i = malloc();  // type deduced from left-hand side $\(\Rightarrow\)$ no size argument or return cast
+\end{cfa}
+An unbound return-type is problematic in resolver complexity because a single match of a function call with an unbound return type may create multiple candidates. In the worst case, consider a function declared to return any @otype@ (defined \VPageref{otype}):
 \begin{cfa}
 forall( otype T ) T anyObj( void );
 \end{cfa}
 As the resolver attempts to satisfy the otype constraint on @T@, a single call to @anyObj()@ without the result type known creates at least as many candidates as the number of complete types currently in scope; with generic types it becomes even worse, for example, assuming a declaration of generic pair is available at that point:
+As the resolver attempts to satisfy the otype constraint on @T@, a call to @anyObj()@ in an expression, without the result type known, creates at least as many candidates as the number of complete types currently in scope; with generic types it becomes even worse, \eg assuming a declaration of a generic @pair@ is available at that point:
 \begin{cfa}
 forall( otype T, otype U ) struct pair { T first; U second; };
 \end{cfa}
 Then an @anyObj()@ call can result in arbitrarily complex types, such as @pair( pair( int,int ), pair( int,int ) )@, and the depth can grow indefinitely until the specified parameter depth limit, thus creating exponentially many candidates. However, the expected types allowed by parent expressions are practically very few, so most of those interpretations are invalid; if the result type is never bound up to top level, by the semantic rules it is ambiguous if there are more than one valid bindings, and resolution can fail fast. It is therefore reasonable to delay resolving assertions on an unbound parameter in return type; however, with the current cost model, such behavior may further cause irregularities in candidate selection, such that the presence of assertions can change the preferred candidate, even when order of expression costs are supposed to stay the same. Detailed analysis of this issue will be presented later, in the correctness part.
+Then an @anyObj()@ call can result in arbitrarily complex types, such as @pair( pair( int, int ), pair( int, int ) )@, and the depth can grow indefinitely until a specified parameter-depth limit, thus creating exponentially many candidates. However, the expected types allowed by parent expressions are practically very few, so most of those interpretations are invalid; if the result type is never bound up to the top level, by the semantic rules it is ambiguous if there is more than one valid binding and resolution fails quickly. It is therefore reasonable to delay resolving assertions on an unbound parameter in a return type; however, with the current cost model, such behavior may further cause irregularities in candidate selection, such that the presence of assertions can change the preferred candidate, even when the order of expression costs is supposed to stay the same. A detailed analysis of this issue is presented in \VRef{s:AnalysisTypeSystemCorrectness}.
 …
 \label{s:TtypeResolutionInfiniteRecursion}
 @ttype@ (``tuple type'') is a relatively new addition to the language that attempts to provide type-safe variadic argument semantics. Unlike regular @dtype@ parameters, @ttype@ is only valid in function parameter list, and may only appear once as the type of last parameter. At the call site, a @ttype@ parameter is bound to the tuple type of all remaining function call arguments.
+@ttype@ (``tuple type'') is a relatively new addition to the language that attempts to provide type-safe variadic argument semantics. Unlike regular @dtype@ parameters, @ttype@ is only valid in a function parameter-list, and may only appear once as the last parameter type. At the call site, a @ttype@ parameter is bound to the tuple type of all remaining function-call arguments.
 There are two kinds of idiomatic @ttype@ usage: one is to provide flexible argument forwarding, similar to the variadic template in \CC (\lstinline[language=C++]|template<typename... args>|), as shown below in the implementation of @unique_ptr@
 …
         T * data;
 };
 forall( dtype T | sized( T ), ttype Args | { void ?{}( T &, Args ); })
 void ?{}( unique_ptr( T ) & this, Args args ) {
         this.data = new( args );
+}
 \end{cfa}
 the other is to implement structural recursion in the first-rest manner:
 \begin{cfa}
 forall( otype T, ttype Params | { void process( T ); void func( Params ); })
+forall( dtype T | sized( T ), @ttype Args@ | { void ?{}( T &, Args ); })
+void ?{}( unique_ptr( T ) & this, Args @args@ ) {
+        this.data = new( @args@ );  // forward constructor arguments to dynamic allocator
+}
+\end{cfa}
+The other usage is to implement structural recursion in the first-rest pattern:
+\begin{cfa}
+forall( otype T, @ttype Params@ | { void process( T ); void func( Params ); })
 void func( T arg1, Params p ) {
         process( arg1 );
+        func( p );
+}
+\end{cfa}
+For the second use case, it is important that the number of parameters in the recursive call go down, since the call site must deduce all assertion candidates, and that is only possible if by just looking at argument types (and not their values), the recursion is known to be completed in a finite number of steps.
+In recent experiments, however, some flaw in the type binding rules can lead to the first kind of @ttype@ use case produce an invalid candidate that the resolver enters an infinite loop.
+This bug was discovered in an attempt to raise assertion recursive depth limit and one of the library program takes exponentially longer time to compile. The cause of the problem is identified to be the following set of functions.
+File @memory.cfa@ contains
+\begin{cfa}
+#include "memory.hfa"
+#include "stdlib.hfa"
+\end{cfa}
+where file @memory.hfa@ contains the @unique_ptr@ declaration above, and two other similar functions with @ttype@ parameter:
+\begin{cfa}
+forall( dtype T | sized( T ), ttype Args | { void ?{}( T &, Args ); }) {
+        func( @p@ );  // recursive call until base case of one argument
+}
+\end{cfa}
+For the second use case, it is imperative that the number of parameters in the recursive call decreases, since the call site must deduce all assertion candidates, and that is only possible if, by observing the argument types (and not their values), the recursion is known to complete in a finite number of steps.
+In recent experiments, however, a flaw in the type-binding rules can lead to the first kind of @ttype@ use case producing an invalid candidate and the resolver entering an infinite loop.
+This bug was discovered in an attempt to raise the assertion recursive-depth limit and one of the library programs took exponentially longer to compile. The cause of the problem is the following set of functions:
+\begin{cfa}
+// unique_ptr  declaration from above
+forall( dtype T | sized( T ), ttype Args | { void ?{}( T &, Args ); } ) { // distribute forall clause
         void ?{}( counter_data( T ) & this, Args args );
         void ?{}( counter_ptr( T ) & this, Args args );
         void ?{}( unique_ptr( T ) & this, Args args );
+}
+\end{cfa}
+File @stdlib.hfa@ contains
+\begin{cfa}
 forall( dtype T | sized( T ), ttype TT | { void ?{}( T &, TT ); } )
+T * new( TT p ) { return &(*malloc()){ p }; }
+\end{cfa}
+In the expression @(*malloc()){p}@, the type of object being constructed is yet unknown, since the return type information is not immediately provided. That caused every constructor to be searched, and while normally a bound @ttype@ cannot be unified with any free parameter, it is possible with another free @ttype@. Therefore in addition to the correct option provided by assertion, 3 wrong options are examined, each of which again requires the same assertion, for an unknown base type T and @ttype@ arguments, and that becomes an infinite loop, until the specified recursion limit and resolution is forced to fail. Moreover, during the recursion steps, number of candidates grows exponentially, since there are always 3 options at each step.
+Unfortunately, @ttype@ to @ttype@ binding is necessary, to allow calling the function provided by assertion indirectly.
+\begin{cfa}
+forall( dtype T | sized( T ), ttype Args | { void ?{}( T &, Args ); })
+void ?{}( unique_ptr( T ) & this, Args args ) { this.data = (T * )new( args ); }
+\end{cfa}
+Here the constructor assertion is used for the @new( args )@ call.
+T * new( TT p ) { return @&(*malloc()){ p };@ }
+\end{cfa}
+In the expression @(*malloc()){p}@, the type of the object being constructed is unknown, since the return-type information is not immediately available. That causes every constructor to be searched, and while normally a bound @ttype@ cannot be unified with any free parameter, it is possible with another free @ttype@. Therefore, in addition to the correct option provided by the assertion, 3 wrong options are examined, each of which again requires the same assertion, for an unknown base-type @T@ and @ttype@ argument, which becomes an infinite loop until the specified recursion limit is reached and resolution fails. Moreover, during the recursion steps, the number of candidates grows exponentially, since there are always 3 options at each step.
+Unfortunately, @ttype@ to @ttype@ binding is necessary, to allow indirectly calling a function provided in an assertion.
+\begin{cfa}
+forall( dtype T | sized( T ), ttype Args | { @void ?{}( T &, Args );@ })
+void ?{}( unique_ptr( T ) & this, Args args ) { this.data = (T *)@new( args )@; } // constructor call
+\end{cfa}
+Here the constructor assertion is used by the @new( args )@ call to indirectly call the constructor on the allocated storage.
 Therefore, it is hard, perhaps impossible, to solve this problem by tweaking the type binding rules. An assertion caching algorithm can help improve this case by detecting cycles in recursion.
 Meanwhile, without the caching algorithm implemented, some changes in the \CFA source code are enough to eliminate this problem, at least in the current codebase. Note that the issue only happens with an overloaded variadic function, which rarely appears in practice, since the idiomatic use cases are for argument forwarding and self-recursion. The only overloaded @ttype@ function so far discovered in all of \CFA standard library code is the constructor, and by utilizing the argument-dependent lookup process described in Section~\ref{s:UnboundReturnType}, adding a cast before constructor call gets rid of the issue.
 \begin{cfa}
 T * new( TT p ) { return &(*(T * )malloc()){ p }; }
+Meanwhile, without a caching algorithm implemented, some changes in the \CFA source code are enough to eliminate this problem, at least in the current codebase. Note that the issue only happens with an overloaded variadic function, which rarely appears in practice, since the idiomatic use cases are for argument forwarding and self-recursion. The only overloaded @ttype@ function so far discovered in all of \CFA standard library is the constructor, and by utilizing the argument-dependent lookup process described in \VRef{s:UnboundReturnType}, adding a cast before the constructor call removes the issue.
+\begin{cfa}
+T * new( TT p ) { return &(*@(T * )@malloc()){ p }; }
 \end{cfa}
 …
 \subsection{Reused assertions in nested generic type}
 The following test of deeply nested dynamic generic type reveals that locally caching reused assertions is necessary, rather than just a resolver optimization, because recomputing assertions can result in bloated generated code size:
+The following test of deeply nested, dynamic generic type reveals that locally caching reused assertions is necessary, rather than just a resolver optimization, because recomputing assertions can result in bloated generated code size:
 \begin{cfa}
 struct nil {};
 …
 int main() {
         #if   N==0
         nil x;
+        nil @x@;
         #elif N==1
         cons( size_t, nil ) x;
+        cons( size_t, nil ) @x@;
         #elif N==2
         cons( size_t, cons( size_t, nil ) ) x;
+        cons( size_t, cons( size_t, nil ) ) @x@;
         #elif N==3
         cons( size_t, cons( size_t, cons( size_t, nil ) ) ) x;
+        cons( size_t, cons( size_t, cons( size_t, nil ) ) ) @x@;
         // similarly for N=4,5,6
         #endif
+}
 \end{cfa}
 At the declaration of @x@, it is implicitly initialized by generated constructor call, whose signature is given by
+At the declaration of @x@, it is implicitly initialized by a generated constructor call, with signature:
 \begin{cfa}
 forall( otype L, otype R ) void ?{}( cons( L, R ) & );
 \end{cfa}
+Note that the @otype@ constraint contains 4 assertions:
+where the @otype@ constraint contains the 4 assertions:\label{otype}
 \begin{cfa}
 void ?{}( L & ); // default constructor
 …
 L & ?=?( L &, L & ); // assignment
 \end{cfa}
+Now since the right hand side of outermost cons is again a cons, recursive assertions are required. When the compiler cannot cache and reuse already resolved assertions, it becomes a problem, as each of those 4 pending assertions again asks for 4 more assertions one level below. Without any caching, number of resolved assertions grows exponentially, while that is obviously unnecessary since there are only $n+1$ different types involved. Even worse, this causes exponentially many wrapper functions generated later at the codegen step, and results in huge compiled binary.
 \begin{table}[h]
+\begin{table}[htb]
+\centering
 \caption{Compilation results of nested cons test}
+\label{t:NestedConsTest}
 \begin{tabular}{|r|r|r|}
 \hline
 …
 \end{table}
+As the local functions are implemented by emitting executable code on the stack~\cite{gcc-nested-func}, it eventually means that compiled code also has exponential run time. This problem has evident practical implications, as nested collection types are frequently used in real production code.
+Now since the right hand side of outermost cons is again a cons, recursive assertions are required. \VRef[Table]{t:NestedConsTest} shows when the compiler does not cache and reuse already resolved assertions, it becomes a problem, as each of these 4 pending assertions again asks for 4 more assertions one level below. Without caching, the number of resolved assertions grows exponentially, which is unnecessary since there are only $n+1$ different types involved. Even worse, this problem causes exponentially many wrapper functions to be generated at the backend, resulting in a huge binary. As the local functions are implemented by emitting executable code on the stack~\cite{gcc-nested-func}, it means that compiled code also has exponential run time. This problem has practical implications, as nested collection types are frequently used in real production code.
 \section{Analysis of type system correctness}
+\label{s:AnalysisTypeSystemCorrectness}
 In Moss' thesis~\cite[\S~4.1.2,~p.~45]{Moss19}, the author presents the following example:
 …
 \begin{cfa}
 void f( int );
 double g$_1$( int );
 int g$_2$( long );
+double g$\(_1\)$( int );
+int g$\(_2\)$( long );
 f( g( 42 ) );
 \end{cfa}
 …
 From the set of candidates whose parameter and argument types have been unified and whose assertions have been satisfied, those whose sub-expression interpretations have the smallest total cost of conversion are selected ... The total cost of conversion for each of these candidates is then calculated based on the implicit conversions and polymorphism involved in adapting the types of the sub-expression interpretations to the formal parameter types.
 \end{quote}
+With this model, the algorithm picks @g1@ in resolving the @f( g( 42 ) )@ call, which seems to be undesirable.
+There are further evidence that shows the Bilson model is fundamentally incorrect, following the discussion of unbound return type in Section~\ref{s:UnboundReturnType}. By the conversion cost specification, a binding from a polymorphic type parameter to a concrete type incurs a polymorphic cost of 1. It remains unspecified \emph{when} the type parameters should become bound. When the parameterized types appear in the function parameters, they can be deduced from the argument type, and there is no ambiguity. In the unbound return case, however, the binding may happen at any stage in expression resolution, therefore it is impossible to define a unique local conversion cost. Note that type binding happens exactly once per parameter in resolving the entire expression, so the global binding cost is unambiguously 1.
+As per the current compiler implementation, it does have a notable inconsistency in handling such case. For any unbound parameter that does \emph{not} come with an associated assertion, it remains unbound to the parent expression; for those that does however, they are immediately bound in the assertion resolution step, and concrete result types are used in the parent expressions.
+With this model, the algorithm picks @g1@ in resolving the @f( g( 42 ) )@ call, which is undesirable.
+There is further evidence that shows the Bilson model is fundamentally incorrect, following the discussion of unbound return type in \VRef{s:UnboundReturnType}. By the conversion-cost specification, a binding from a polymorphic type-parameter to a concrete type incurs a polymorphic cost of 1. It remains unspecified \emph{when} the type parameters should become bound. When the parameterized types appear in function parameters, they can be deduced from the argument type, and there is no ambiguity. In the unbound return case, however, the binding may happen at any stage in expression resolution, therefore it is impossible to define a unique local conversion cost. Note that type binding happens exactly once per parameter in resolving the entire expression, so the global binding cost is unambiguously 1.
+In the current compiler implementation, there is a notable inconsistency in handling this case. For any unbound parameter that does \emph{not} come with an associated assertion, it remains unbound to the parent expression; for those that do, however, they are immediately bound in the assertion resolution step, and concrete result types are used in the parent expressions.
 Consider the following example:
 \begin{cfa}
 …
 void h( int * );
 \end{cfa}
 The expression @h( f() )@ eventually has a total cost of 1 from binding (T: int), but in the eager resolution model, the cost of 1 may occur either at call to @f@ or at call to @h@, and with the assertion resolution triggering a binding, the local cost of @f()@ is (0 poly, 0 spec) with no assertions, but (1 poly, -1 spec) with an assertion:
 \begin{cfa}
 forall( dtype T | { void g( T * ); } ) T * f( void );
+The expression @h( f() )@ eventually has a total cost of 1 from binding (T: int), but in the eager-resolution model, the cost of 1 may occur either at the call to @f@ or at the call to @h@, and with the assertion resolution triggering a binding, the local cost of @f()@ is (0 poly, 0 spec) with no assertions, but (1 poly, -1 spec) with an assertion:
+\begin{cfa}
+forall( dtype T | @{ void g( T * ); }@ ) T * f( void );
 void g( int * );
 void h( int * );
 \end{cfa}
 and that contradicts the principle that adding assertions should make expression cost lower. Furthermore, the time at which type binding and assertion resolution happens is an implementation detail of the compiler, but not a part of language definition. That means two compliant \CFA compilers, one performing immediate assertion resolution at each step, and one delaying assertion resolution on unbound types, can produce different expression costs and therefore different candidate selection, making the language rule itself partially undefined and therefore unsound. By the above reasoning, the updated cost model using global sum of costs should be accepted as the standard. It also allows the compiler to freely choose when to resolve assertions, as the sum of total costs is independent of that choice; more optimizations regarding assertion resolution can also be implemented.
+and that contradicts the principle that adding assertions should make expression cost lower. Furthermore, the time at which type binding and assertion resolution happens is an implementation detail of the compiler, not part of the language definition. That means two compliant \CFA compilers, one performing immediate assertion resolution at each step, and one delaying assertion resolution on unbound types, can produce different expression costs and therefore different candidate selection, making the language rule itself partially undefined, and therefore, unsound. By the above reasoning, the updated cost model using global sum of costs should be accepted as the standard. It also allows the compiler to freely choose when to resolve assertions, as the sum of total costs is independent of that choice; more optimizations regarding assertion resolution can also be implemented.
 \section{Timing results}
+For the timing results presented here, the \CFA compiler is built with gcc 9.3.0, and tested on a server machine running Ubuntu 20.04, 64GB RAM and 32-core 2.2 GHz CPU, results reported by the time command, and using only 8 cores in parallel such that the time is close to the case with 100% CPU utilization on a single thread.
+On the most recent build, the \CFA standard library (~1.3 MB of source code) compiles in 4 minutes 47 seconds total processor time (single thread equivalent), with the slowest file taking 13 seconds. The test suite (178 test cases, ~2.2MB of source code) completes within 25 minutes total processor time,\footnote{Including a few runtime tests; total time spent in compilation is approximately 21 minutes.} with the slowest file taking 23 seconds. In contrast, the library build on old compiler takes 85 minutes total, 5 minutes for the slowest file. Full test suite takes too long with old compiler build and is therefore not run, but the slowest test cases take approximately 5 minutes. Overall, the most recent build compared to old build in April 2020, before the project started, is consistently faster by a factor of 20.
+Additionally, 6 selected \CFA source files with distinct features from library and test suite are used to test compiler performance after each of the optimizations are implemented. Test files are from the most recent build and run through C preprocessor to eliminate the factor of header file changes. The selected tests are:
+\begin{itemize}
+\item
+@lib/fstream@ (112 KB)\footnote{File sizes are after preprocessing, with no line information (\lstinline|gcc -E -P|).}: implementation of I/O library
+For the timing results presented here, the \CFA compiler is built with gcc 9.3.0, and tested on a server machine running Ubuntu 20.04, 64GB RAM and 32-core 2.2 GHz CPU.
+Timing is reported by the @time@ command and an experiment is run using 8 cores, where each core is at 100\% CPU utilization.
+On the most recent build, the \CFA standard library ($\approx$1.3 MB of source code) compiles in 4 minutes 47 seconds total processor time (single thread equivalent), with the slowest file taking 13 seconds. The test suite (178 test cases, $\approx$2.2MB of source code) completes within 25 minutes total processor time,
+% PAB: I do not understand this footnote.
+%\footnote{Including a few runtime tests; total time spent in compilation is approximately 21 minutes.}
+with the slowest file taking 23 seconds. In contrast, the library build with the old compiler takes 85 minutes total, 5 minutes for the slowest file. The full test-suite takes too long with the old compiler build and is therefore not run, but the slowest test cases take approximately 5 minutes. Overall, the most recent build compared to an old build is consistently faster by a factor of 20.
+Additionally, 6 selected \CFA source files with distinct features from the library and test suite are used to illustrate the compiler performance change after each of the implemented optimizations. Test files are from the most recent build and run through the C preprocessor to expand header files and perform macro expansions, but without line-number information (@gcc -E -P@).
+\VRef[Table]{t:SelectedFileByCompilerBuild} shows the selected tests:
+\begin{itemize}
+\item
+@lib/fstream@ (112 KB)
 \item
 @lib/mutex@ (166 KB): implementation of concurrency primitive
 …
 @lib/stdlib@ (64 KB): type-safe wrapper to @void *@-based C standard library functions
 \item
 @test/ISO2@ (55 KB): application of I/O library
+@test/io2@ (55 KB): application of I/O library
 \item
 @test/thread@ (188 KB): application of threading library
 \end{itemize}
+The \CFA compiler builds are picked from git commit history that passed the test suite, and implement the optimizations incrementally:
+\begin{itemize}
+\item
+\#0 is the first working build of new AST data structure
+versus \CFA compiler builds picked from the git commit history that implement the optimizations incrementally:
+\begin{itemize}
+\item
+old resolver
+\item
+\#0 is the first working build of the new AST data structure
 \item
 \#1 implements special symbol table and argument-dependent lookup
 \item
+\#2 implements late assertion satisfaction
+\item
+\#3 implements revised function type representation
+\item
+\#4 skips pruning on expressions with function type (most recent build)
+\end{itemize}
+The old resolver with no memory sharing and none of the optimizations above is also tested.
+\begin{table}
+\#2 implements late assertion-satisfaction
+\item
+\#3 implements revised function-type representation
+\item
+\#4 skips pruning on expressions for function types (most recent build)
+\end{itemize}
+Reading left to right for a test shows the benefit of each optimization on the cost of compilation.
+\begin{table}[htb]
+\centering
 \caption{Compile time of selected files by compiler build, in seconds}
+\label{t:SelectedFileByCompilerBuild}
 \begin{tabular}{|l|r|r|r|r|r|r|}
 \hline
 …
 \end{table}
 \section{Conclusion}
 Over the course of 8 months of active research and development in \CFA type system and compiler algorithm, performance of the reference \CFA compiler, cfa-cc, has been greatly improved, allowing mid-sized \CFA programs to be compiled and built reasonably fast. As there are also ongoing efforts in the team on building a standard library, evaluating the runtime performance, and attempting to incorporate \CFA with existing software written in C, this project is especially meaningful for practical purposes.
 Analysis conducted in the project were based significantly on heuristics and practical evidence, as the theoretical bounds and average cases for the expression resolution problem differ. This approach was difficult at start to follow, with an unacceptably slow compiler, since running the program through debugger and validation tools (\eg @gdb@, @valgrind@) adds another order of magnitude to run time, which was already in minutes. However, near the end of the project, many significant improvements have already been made and new optimizations can be tested immediately. The positive feedback in development cycle benefits the \CFA team as a whole, more than just for the compiler optimizations.
 Some potential issues of the language that may happen frequently in practice have been identified. Due to the time constraint and complex nature of these problems, a handful of them remain unsolved, but some constructive proposals are made. Notably, introducing a local assertion cache in the resolver is a common solution for a few remaining problems, so that should be the focus of work soon.
 The \CFA team are planning on a public alpha release of the language as the compiler performance becomes promising, and other parts of the system, such as a standard library, are also being enhanced. Ideally, the remaining problems should be resolved before release, and the solutions will also be integral to drafting a formal specification.
+Over the course of 8 months of active research and development of the \CFA type system and compiler algorithms, performance of the reference \CFA compiler, cfa-cc, has been greatly improved. Now, mid-sized \CFA programs are compiled reasonably fast. Currently, there are ongoing efforts by the \CFA team to augment the standard library and evaluate its runtime performance, and incorporate \CFA with existing software written in C; therefore this project is especially meaningful for these practical purposes.
+Accomplishing this work was difficult. Analysis conducted in the project is based significantly on heuristics and practical evidence, as the theoretical bounds and average cases for the expression resolution problem differ. As well, the slowness of the initial compiler made attempts to understand why and where problems exist extremely difficult because both debugging and validation tools (\eg @gdb@, @valgrind@, @perf@) further slowed down compilation time. However, by the end of the project, I had found and fixed several significant problems and new optimizations are easier to introduce and test. The reduction in the development cycle benefits the \CFA team as a whole.
+Some potential issues of the language, which happen frequently in practice, have been identified. Due to the time constraint and complex nature of these problems, a handful of them remain unsolved, but some constructive proposals are made. Notably, introducing a local assertion cache in the resolver is a reasonable solution for a few remaining problems, so that should be the focus of future work.
+The \CFA team are planning on a public alpha release of the language as the compiler performance, given my recent improvements, is now useable. Other parts of the system, such as the standard library, have made significant gains due to the speed up in the development cycle. Ideally, the remaining problems should be resolved before release, and the solutions will also be integral to drafting a formal specification.
 \addcontentsline{toc}{section}{\refname}

doc/theses/fangren_yu_COOP_S20/Report.tex

r342af53	r8e4aa05
17	17	\usepackage[usenames]{color}
18	18	\input{common} % common CFA document macros
19		\usepackage[dvips,plainpages=false,pdfpagelabels,pdfpagemode=UseNone,colorlinks=true,~~pagebackref=true,~~linkcolor=blue,citecolor=blue,urlcolor=blue,pagebackref=true,breaklinks=true]{hyperref}
	19	\usepackage[dvips,plainpages=false,pdfpagelabels,pdfpagemode=UseNone,colorlinks=true,linkcolor=blue,citecolor=blue,urlcolor=blue,pagebackref=true,breaklinks=true]{hyperref}
20	20	\usepackage{breakurl}
21	21	\urlstyle{sf}

doc/theses/thierry_delisle_PhD/thesis/Makefile

-              r342af53
+              r8e4aa05
 BibTeX = BIBINPUTS=${TeXLIB} && export BIBINPUTS && bibtex
 MAKEFLAGS = --no-print-directory --silent
+MAKEFLAGS = --no-print-directory # --silent
 VPATH = ${Build} ${Figures}
 …
         emptytree \
         fairness \
+        io_uring \
+        pivot_ring \
         system \
+}
 …
 ## Define the documents that need to be made.
 all: thesis.pdf
 thesis.pdf: ${TEXTS} ${FIGURES} ${PICTURES} glossary.tex local.bib
+thesis.pdf: ${TEXTS} ${FIGURES} ${PICTURES} thesis.tex glossary.tex local.bib
 DOCUMENT = thesis.pdf
 …
 # Directives #
+.NOTPARALLEL:                                           # cannot make in parallel
 .PHONY : all clean                                      # not file names
 …
         ${LaTeX} $<
-build/fairness.svg : fig/fairness.py | ${Build}
-        python3 $< $@
 ## Define the default recipes.
 …
         sed -i 's/$@/${Build}\/$@/g' ${Build}/$@_t
+build/fairness.svg : fig/fairness.py | ${Build}
+        python3 $< $@
 ## pstex with inverted colors
 %.dark.pstex : fig/%.fig Makefile | ${Build}

doc/theses/thierry_delisle_PhD/thesis/local.bib

-              r342af53
+              r8e4aa05
+}
+@manual{MAN:bsd/kqueue,
+  title = {KQUEUE(2) - FreeBSD System Calls Manual},
+  url   = {https://www.freebsd.org/cgi/man.cgi?query=kqueue},
+  year  = {2020},
+  month = {may}
+}
 % Apple's MAC OS X
 @manual{MAN:apple/scheduler,
 …
 % --------------------------------------------------
+% Man Pages
+@manual{MAN:open,
+  key        = "open",
+  title      = "open(2) Linux User's Manual",
+  year       = "2020",
+  month      = "February",
+}
+@manual{MAN:accept,
+  key        = "accept",
+  title      = "accept(2) Linux User's Manual",
+  year       = "2019",
+  month      = "March",
+}
+@manual{MAN:select,
+  key        = "select",
+  title      = "select(2) Linux User's Manual",
+  year       = "2019",
+  month      = "March",
+}
+@manual{MAN:poll,
+  key        = "poll",
+  title      = "poll(2) Linux User's Manual",
+  year       = "2019",
+  month      = "July",
+}
+@manual{MAN:epoll,
+  key        = "epoll",
+  title      = "epoll(7) Linux User's Manual",
+  year       = "2019",
+  month      = "March",
+}
+@manual{MAN:aio,
+  key        = "aio",
+  title      = "aio(7) Linux User's Manual",
+  year       = "2019",
+  month      = "March",
+}
+@misc{MAN:io_uring,
+  title   = {Efficient IO with io\_uring},
+  author  = {Axboe, Jens},
+  year    = "2019",
+  month   = "March",
+  version = {0.4},
+  howpublished = {\url{https://kernel.dk/io_uring.pdf}}
+}
+% --------------------------------------------------
 % Wikipedia Entries
 @misc{wiki:taskparallel,
 …
   note = "[Online; accessed 2-January-2021]"
+}
+@misc{wiki:future,
+  author = "{Wikipedia contributors}",
+  title = "Futures and promises --- {W}ikipedia{,} The Free Encyclopedia",
+  year = "2020",
+  url = "https://en.wikipedia.org/wiki/Futures_and_promises",
+  note = "[Online; accessed 9-February-2021]"
+}

doc/theses/thierry_delisle_PhD/thesis/text/core.tex

-              r342af53
+              r8e4aa05
 \section{Design}
 In general, a na\"{i}ve \glsxtrshort{fifo} ready-queue does not scale with increased parallelism from \glspl{hthrd}, resulting in decreased performance. The problem is adding/removing \glspl{thrd} is a single point of contention. As shown in the evaluation sections, most production schedulers do scale when adding \glspl{hthrd}. The common solution to the single point of contention is to shard the ready-queue so each \gls{hthrd} can access the ready-queue without contention, increasing performance though lack of contention.
+In general, a na\"{i}ve \glsxtrshort{fifo} ready-queue does not scale with increased parallelism from \glspl{hthrd}, resulting in decreased performance. The problem is adding/removing \glspl{thrd} is a single point of contention. As shown in the evaluation sections, most production schedulers do scale when adding \glspl{hthrd}. The common solution to the single point of contention is to shard the ready-queue so each \gls{hthrd} can access the ready-queue without contention, increasing performance.
 \subsection{Sharding} \label{sec:sharding}
 An interesting approach to sharding a queue is presented in \cit{Trevors paper}. This algorithm presents a queue with a relaxed \glsxtrshort{fifo} guarantee using an array of strictly \glsxtrshort{fifo} sublists as shown in Figure~\ref{fig:base}. Each \emph{cell} of the array has a timestamp for the last operation and a pointer to a linked-list with a lock and each node in the list is marked with a timestamp indicating when it is added to the list. A push operation is done by picking a random cell, acquiring the list lock, and pushing to the list. If the cell is locked, the operation is simply retried on another random cell until a lock is acquired. A pop operation is done in a similar fashion except two random cells are picked. If both cells are unlocked with non-empty lists, the operation pops the node with the oldest cell timestamp. If one of the cells is unlocked and non-empty, the operation pops from that cell. If both cells are either locked or empty, the operation picks two new random cells and tries again.
+An interesting approach to sharding a queue is presented in \cit{Trevors paper}. This algorithm presents a queue with a relaxed \glsxtrshort{fifo} guarantee using an array of strictly \glsxtrshort{fifo} sublists as shown in Figure~\ref{fig:base}. Each \emph{cell} of the array has a timestamp for the last operation and a pointer to a linked-list with a lock. Each node in the list is marked with a timestamp indicating when it is added to the list. A push operation is done by picking a random cell, acquiring the list lock, and pushing to the list. If the cell is locked, the operation is simply retried on another random cell until a lock is acquired. A pop operation is done in a similar fashion except two random cells are picked. If both cells are unlocked with non-empty lists, the operation pops the node with the oldest timestamp. If one of the cells is unlocked and non-empty, the operation pops from that cell. If both cells are either locked or empty, the operation picks two new random cells and tries again.
 \begin{figure}
 …
 \paragraph{Local Information} Figure~\ref{fig:emptytls} shows an approach using dense information, similar to the bitmap, but each \gls{hthrd} keeps its own independent copy. While this approach can offer good scalability \emph{and} low latency, the liveliness and discovery of the information can become a problem. This case is made worse in systems with few processors where even blind random picks can find \glspl{thrd} in a few tries.
 I built a prototype of these approaches and none of these techniques offer satisfying performance when few threads are present. All of these approach hit the same 2 problems. First, randomly picking sub-queues is very fast but means any improvement to the hit rate can easily be countered by a slow-down in look-up speed when there are empty lists. Second, the array is already as sharded to avoid contention bottlenecks, so any denser data structure tends to become a bottleneck. In all cases, these factors meant the best cases scenario, \ie many threads, would get worst throughput, and the worst-case scenario, few threads, would get a better hit rate, but an equivalent poor throughput. As a result I tried an entirely different approach.
+I built a prototype of these approaches and none of these techniques offer satisfying performance when few threads are present. All of these approaches hit the same 2 problems. First, randomly picking sub-queues is very fast. That speed means any improvement to the hit rate can easily be countered by a slow-down in look-up speed, whether or not there are empty lists. Second, the array is already sharded to avoid contention bottlenecks, so any denser data structure tends to become a bottleneck. In all cases, these factors meant the best-case scenario, \ie many threads, would get worse throughput, and the worst-case scenario, few threads, would get a better hit rate, but an equivalently poor throughput. As a result I tried an entirely different approach.
 \subsection{Dynamic Entropy}\cit{https://xkcd.com/2318/}
 In the worst-case scenario there are only few \glspl{thrd} ready to run, or more precisely given $P$ \glspl{proc}\footnote{For simplicity, this assumes there is a one-to-one match between \glspl{proc} and \glspl{hthrd}.}, $T$ \glspl{thrd} and $\epsilon$ a very small number, than the worst case scenario can be represented by $\epsilon \ll P$, than $T = P + \epsilon$. It is important to note in this case that fairness is effectively irrelevant. Indeed, this case is close to \emph{actually matching} the model of the ``Ideal multi-tasking CPU'' on page \pageref{q:LinuxCFS}. In this context, it is possible to use a purely internal-locality based approach and still meet the fairness requirements. This approach simply has each \gls{proc} running a single \gls{thrd} repeatedly. Or from the shared ready-queue viewpoint, each \gls{proc} pushes to a given sub-queue and then popes from the \emph{same} subqueue. In cases where $T \gg P$, the scheduler should also achieves similar performance without affecting the fairness guarantees.
+In the worst-case scenario there are only a few \glspl{thrd} ready to run, or more precisely given $P$ \glspl{proc}\footnote{For simplicity, this assumes there is a one-to-one match between \glspl{proc} and \glspl{hthrd}.}, $T$ \glspl{thrd} and $\epsilon$ a very small number, then the worst-case scenario can be represented by $T = P + \epsilon$, with $\epsilon \ll P$. It is important to note in this case that fairness is effectively irrelevant. Indeed, this case is close to \emph{actually matching} the model of the ``Ideal multi-tasking CPU'' on page \pageref{q:LinuxCFS}. In this context, it is possible to use a purely internal-locality based approach and still meet the fairness requirements. This approach simply has each \gls{proc} running a single \gls{thrd} repeatedly. Or from the shared ready-queue viewpoint, each \gls{proc} pushes to a given sub-queue and then pops from the \emph{same} subqueue. The challenge is for the scheduler to achieve good performance in both the $T = P + \epsilon$ case and the $T \gg P$ case, without affecting the fairness guarantees in the latter.
 To handle this case, I use a pseudo random-number generator, \glsxtrshort{prng} in a novel way. When the scheduler uses a \glsxtrshort{prng} instance per \gls{proc} exclusively, the random-number seed effectively starts an encoding that produces a list of all accessed subqueues, from latest to oldest. The novel approach is to be able to ``replay'' the \glsxtrshort{prng} backwards and there exist \glsxtrshort{prng}s that are fast, compact \emph{and} can be run forward and backwards. Linear congruential generators~\cite{wiki:lcg} are an example of \glsxtrshort{prng}s that match these requirements.
+To handle this case, I use a \glsxtrshort{prng}\todo{Fix missing long form} in a novel way. There exist \glsxtrshort{prng}s that are fast, compact and can be run forward \emph{and} backwards.  Linear congruential generators~\cite{wiki:lcg} are an example of such \glsxtrshort{prng}s. The novel approach is to use the ability to run backwards to ``replay'' the \glsxtrshort{prng}. When the scheduler uses an exclusive \glsxtrshort{prng} instance per \gls{proc}, the random-number seed effectively starts an encoding that produces a list of all accessed subqueues, from latest to oldest. Replaying the \glsxtrshort{prng} identifies cells that were accessed recently and probably still have data cached.
 The algorithm works as follows:

doc/theses/thierry_delisle_PhD/thesis/text/intro.tex

r342af53	r8e4aa05
7	7	While previous work on the concurrent package of \CFA focused on features and interfaces, this thesis focuses on performance, introducing \glsxtrshort{api} changes only when required by performance considerations. More specifically, this thesis concentrates on scheduling and \glsxtrshort{io}. Prior to this work, the \CFA runtime used a strictly \glsxtrshort{fifo} \gls{rQ}.
8	8
9		This work exclusively concentrates on Linux as it's operating system since the existing \CFA runtime and compiler does not already support other operating systems. Furthermore, as \CFA is yet to be released, supporting version of Linux older that the latest version is not a goal of this work.
	9	This work exclusively concentrates on Linux as its operating system since the existing \CFA runtime and compiler do not already support other operating systems. Furthermore, as \CFA is yet to be released, supporting versions of Linux older than the latest version is not a goal of this work.

doc/theses/thierry_delisle_PhD/thesis/text/io.tex

-              r342af53
+              r8e4aa05
 \chapter{User Level \glsxtrshort{io}}
 As mentionned in Section~\ref{prev:io}, User-Level \glsxtrshort{io} requires multiplexing the \glsxtrshort{io} operations of many \glspl{thrd} onto fewer \glspl{proc} using asynchronous \glsxtrshort{io} operations. Various operating systems offer various forms of asynchronous operations and as mentioned in Chapter~\ref{intro}, this work is exclusively focuesd on Linux.
+\chapter{User Level \io}
+As mentioned in Section~\ref{prev:io}, User-Level \io requires multiplexing the \io operations of many \glspl{thrd} onto fewer \glspl{proc} using asynchronous \io operations. Different operating systems offer various forms of asynchronous operations and as mentioned in Chapter~\ref{intro}, this work is exclusively focused on the Linux operating-system.
 \section{Existing options}
 Since \glsxtrshort{io} operations are generally handled by the
+\section{Kernel Interface}
+Since this work fundamentally depends on operating-system support, the first step of any design is to discuss the available interfaces and pick one (or more) as the foundations of the non-blocking \io subsystem.
+\subsection{\lstinline|epoll|, \lstinline|poll| and \lstinline|select|}
+\subsection{\lstinline{O_NONBLOCK}}
+In Linux, files can be opened with the flag @O_NONBLOCK@~\cite{MAN:open} (or @SO_NONBLOCK@~\cite{MAN:accept}, the equivalent for sockets) to use the file descriptors in ``nonblocking mode''. In this mode, ``Neither the @open()@ nor any subsequent \io operations on the [opened file descriptor] will cause the calling
+process to wait''~\cite{MAN:open}. This feature can be used as the foundation for the non-blocking \io subsystem. However, for the subsystem to know when an \io operation completes, @O_NONBLOCK@ must be used in conjunction with a system call that monitors when a file descriptor becomes ready, \ie, the next \io operation on it does not cause the process to wait\footnote{In this context, ready means \emph{some} operation can be performed without blocking. It does not mean an operation returning \lstinline{EAGAIN} succeeds on the next try. For example, a ready read may only return a subset of bytes and the read must be issued again for the remaining bytes, at which point it may return \lstinline{EAGAIN}.}.
+This mechanism is also crucial in determining when all \glspl{thrd} are blocked and the application \glspl{kthrd} can now block.
+\subsection{Linux's AIO}
+There are three options to monitor file descriptors in Linux\footnote{For simplicity, this section omits \lstinline{pselect} and \lstinline{ppoll}. The difference between these system calls and \lstinline{select} and \lstinline{poll}, respectively, is not relevant for this discussion.}, @select@~\cite{MAN:select}, @poll@~\cite{MAN:poll} and @epoll@~\cite{MAN:epoll}. All three of these options offer a system call that blocks a \gls{kthrd} until at least one of many file descriptors becomes ready. The group of file descriptors being waited is called the \newterm{interest set}.
+\paragraph{\lstinline{select}} is the oldest of these options; it takes as an input a contiguous array of bits, where each bit represents a file descriptor of interest. On return, it modifies the set in place to identify which of the file descriptors changed status. This destructive change means that calling select in a loop requires re-initializing the array each time and the number of file descriptors supported has a hard limit. Another limit of @select@ is that once the call is started, the interest set can no longer be modified. Monitoring a new file descriptor generally requires aborting any in-progress call to @select@\footnote{Starting a new call to \lstinline{select} is possible but requires a distinct kernel thread, and as a result is not an acceptable multiplexing solution when the interest set is large and highly dynamic unless the number of parallel calls to \lstinline{select} can be strictly bounded.}.
+\paragraph{\lstinline{poll}} is an improvement over select, which removes the hard limit on the number of file descriptors and the need to re-initialize the input on every call. It works using an array of structures as an input rather than an array of bits, thus allowing a more compact input for small interest sets. Like @select@, @poll@ suffers from the limitation that the interest set cannot be changed while the call is blocked.
+\paragraph{\lstinline{epoll}} further improves these two functions by allowing the interest set to be dynamically added to and removed from while a \gls{kthrd} is blocked on an @epoll@ call. This dynamic capability is accomplished by creating an \emph{epoll instance} with a persistent interest set, which is used across multiple calls. This capability significantly reduces synchronization overhead on the part of the caller (in this case the \io subsystem), since the interest set can be modified when adding or removing file descriptors without having to synchronize with other \glspl{kthrd} potentially calling @epoll@.
+However, all three of these system calls have limitations. The @man@ page for @O_NONBLOCK@ mentions that ``[@O_NONBLOCK@] has no effect for regular files and block devices'', which means none of these three system calls are viable multiplexing strategies for these types of \io operations. Furthermore, @epoll@ has been shown to have problems with pipes and ttys~\cit{Peter's examples in some fashion}. Finally, none of these are useful solutions for multiplexing \io operations that do not have a corresponding file descriptor and can be awkward for operations using multiple file descriptors.
+\subsection{POSIX asynchronous I/O (AIO)}
+An alternative to @O_NONBLOCK@ is the AIO interface. Its interface lets programmers enqueue operations to be performed asynchronously by the kernel. Completions of these operations can be communicated in various ways: either by spawning a new \gls{kthrd}, sending a Linux signal, or by polling for completion of one or more operations. For this work, spawning a new \gls{kthrd} is counter-productive but a related solution is discussed in Section~\ref{io:morethreads}. Using interrupt handlers can also lead to fairly complicated interactions between subsystems. This leaves polling for completion, which is similar to the previous system calls. While AIO only supports read and write operations to file descriptors, it does not have the same limitation as @O_NONBLOCK@, \ie, the file descriptors can be regular files and block devices. It also supports batching multiple operations in a single system call.
+AIO offers two different approaches to polling: @aio_error@ can be used as a spinning form of polling, returning @EINPROGRESS@ until the operation is completed, and @aio_suspend@ can be used similarly to @select@, @poll@ or @epoll@, to wait until one or more requests have completed. For the purpose of \io multiplexing, @aio_suspend@ is the best interface. However, even if AIO requests can be submitted concurrently, @aio_suspend@ suffers from the same limitation as @select@ and @poll@, \ie, the interest set cannot be dynamically changed while a call to @aio_suspend@ is in progress. AIO also suffers from the limitation of not specifying which requests have completed, \ie programmers have to poll each request in the interest set using @aio_error@ to identify the completed requests. This limitation means that, like @select@ and @poll@ but not @epoll@, the time needed to examine polling results increases based on the total number of requests monitored, not the number of completed requests.
+Finally, AIO does not seem to be a popular interface, which I believe is due in part to this poor polling interface. Linus Torvalds talks about this interface as follows:
 \begin{displayquote}
         AIO is a horrible ad-hoc design, with the main excuse being "other,
+        AIO is a horrible ad-hoc design, with the main excuse being ``other,
         less gifted people, made that design, and we are implementing it for
         compatibility because database people - who seldom have any shred of
         taste - actually use it".
+        taste - actually use it''.
         But AIO was always really really ugly.
 …
 \end{displayquote}
 Interestingly, in this e-mail answer, Linus goes on to describe
+Interestingly, in this e-mail, Linus goes on to describe
 ``a true \textit{asynchronous system call} interface''
 that does
 …
 in
 ``some kind of arbitrary \textit{queue up asynchronous system call} model''.
 This description is actually quite close to the interface of the interface described in the next section.
+This description is actually quite close to the interface described in the next section.
+\subsection{\texttt{io\_uring}}
+A very recent addition to Linux, @io_uring@\cit{io\_uring} is a framework that aims to solve many of the problems listed with the above mentioned solutions.
+\subsection{\lstinline{io_uring}}
+A very recent addition to Linux, @io_uring@~\cite{MAN:io_uring}, is a framework that aims to solve many of the problems listed in the above interfaces. Like AIO, it represents \io operations as entries added to a queue. But like @epoll@, new requests can be submitted while a blocking call waiting for requests to complete is already in progress. The @io_uring@ interface uses two ring buffers (referred to simply as rings) at its core: a submit ring to which programmers push \io requests and a completion ring from which programmers poll for completion.
+One of the big advantages over the prior interfaces is that @io_uring@ also supports a much wider range of operations. In addition to supporting reads and writes to any file descriptor like AIO, it supports other operations like @open@, @close@, @fsync@, @accept@, @connect@, @send@, @recv@, @splice@, \etc.
+On top of these, @io_uring@ adds many extras like avoiding copies between the kernel and user-space using shared memory, allowing different mechanisms to communicate with device drivers, and supporting chains of requests, \ie, requests that automatically trigger followup requests on completion.
 \subsection{Extra Kernel Threads}\label{io:morethreads}
 Finally, if the operating system does not offer any satisfying forms of asynchronous \glsxtrshort{io} operations, a solution is to fake it by creating a pool of \glspl{kthrd} and delegating operations to them in order to avoid blocking \glspl{proc}.
+Finally, if the operating system does not offer a satisfactory form of asynchronous \io operations, an ad-hoc solution is to create a pool of \glspl{kthrd} and delegate operations to it to avoid blocking \glspl{proc}, which is a compromise for multiplexing. In the worst case, where all \glspl{thrd} are consistently blocking on \io, it devolves into 1-to-1 threading. However, regardless of the frequency of \io operations, it achieves the fundamental goal of not blocking \glspl{proc} when \glspl{thrd} are ready to run. This approach is used by languages like Go\cit{Go} and frameworks like libuv\cit{libuv}, since it has the advantage that it can easily be used across multiple operating systems. This advantage is especially relevant for languages like Go, which offer a homogeneous \glsxtrshort{api} across all platforms. This is in contrast to C, which has a very limited standard \glsxtrshort{api} for \io, \eg, the C standard library has no networking.
 \subsection{Discussion}
+These options effectively fall into two broad camps: waiting for \io to be ready versus waiting for \io to complete. All operating systems that support asynchronous \io must offer an interface along one of these lines, but the details vary drastically. For example, FreeBSD offers @kqueue@~\cite{MAN:bsd/kqueue}, which behaves similarly to @epoll@, but with some small quality of use improvements, while Windows (Win32)~\cit{https://docs.microsoft.com/en-us/windows/win32/fileio/synchronous-and-asynchronous-i-o} offers ``overlapped I/O'', which handles submissions similarly to @O_NONBLOCK@ with extra flags on the synchronous system call, but waits for completion events, similarly to @io_uring@.
+For this project, I selected @io_uring@, in large part because of its generality. While @epoll@ has been shown to be a good solution for socket \io (\cite{DBLP:journals/pomacs/KarstenB20}), @io_uring@'s transparent support for files, pipes, and more complex operations, like @splice@ and @tee@, makes it a better choice as the foundation for a general \io subsystem.
 \section{Event-Engine}
+An event engine's responsibility is to use the kernel interface to multiplex many \io operations onto few \glspl{kthrd}. In concrete terms, this means \glspl{thrd} enter the engine through an interface, the event engine then starts the operation and parks the calling \glspl{thrd}, returning control to the \gls{proc}. The parked \glspl{thrd} are then rescheduled by the event engine once the desired operation has completed.
+\subsection{\lstinline{io_uring} in depth}
+Before going into details on the design of my event engine, more details on @io_uring@ usage are presented, each important in the design of the engine.
+Figure~\ref{fig:iouring} shows an overview of an @io_uring@ instance.
+Two ring buffers are used to communicate with the kernel: one for submissions~(left) and one for completions~(right).
+The submission ring contains entries, \newterm{Submit Queue Entries} (SQE), produced (appended) by the application when an operation starts and then consumed by the kernel.
+The completion ring contains entries, \newterm{Completion Queue Entries} (CQE), produced (appended) by the kernel when an operation completes and then consumed by the application.
+The submission ring contains indexes into the SQE array (denoted \emph{S}) containing entries describing the I/O operation to start;
+the completion ring contains entries for the completed I/O operation.
+Multiple @io_uring@ instances can be created, in which case they each have a copy of the data structures in the figure.
+\begin{figure}
+        \centering
+        \input{io_uring.pstex_t}
+        \caption{Overview of \lstinline{io_uring}}
+%       \caption[Overview of \lstinline{io_uring}]{Overview of \lstinline{io_uring} \smallskip\newline Two ring buffer are used to communicate with the kernel, one for completions~(right) and one for submissions~(left). The completion ring contains entries, \newterm{CQE}s: Completion Queue Entries, that are produced by the kernel when an operation completes and then consumed by the application. On the other hand, the application produces \newterm{SQE}s: Submit Queue Entries, which it appends to the submission ring for the kernel to consume. Unlike the completion ring, the submission ring does not contain the entries directly, it indexes into the SQE array (denoted \emph{S}) instead.}
+        \label{fig:iouring}
+\end{figure}
+New \io operations are submitted to the kernel following 4 steps, which use the components shown in the figure.
+\begin{enumerate}
+\item
+An SQE is allocated from the pre-allocated array (denoted \emph{S} in Figure~\ref{fig:iouring}). This array is created at the same time as the @io_uring@ instance, is in kernel-locked memory visible by both the kernel and the application, and has a fixed size determined at creation. How these entries are allocated is not important for the functioning of @io_uring@, the only requirement is that no entry is reused before the kernel has consumed it.
+\item
+The SQE is filled according to the desired operation. This step is straightforward; the only detail worth mentioning is that SQEs have a @user_data@ field that must be filled in order to match submission and completion entries.
+\item
+The SQE is submitted to the submission ring by appending the index of the SQE to the ring following regular ring buffer steps: \lstinline{buffer[head] = item; head++}. Since the head is visible to the kernel, some memory barriers may be required to prevent the compiler from reordering these operations. Since the submission ring is a regular ring buffer, more than one SQE can be added at once and the head is updated only after all entries are updated.
+\item
+The kernel is notified of the change to the ring using the system call @io_uring_enter@. The number of elements appended to the submission ring is passed as a parameter and the number of elements consumed is returned. The @io_uring@ instance can be constructed so this step is not required, but this requires elevated privilege.% and an early version of @io_uring@ had additional restrictions.
+\end{enumerate}
+\begin{sloppypar}
+The completion side is simpler: applications call @io_uring_enter@ with the flag @IORING_ENTER_GETEVENTS@ to wait on a desired number of operations to complete. The same call can be used to both submit SQEs and wait for operations to complete. When operations do complete, the kernel appends a CQE to the completion ring and advances the head of the ring. Each CQE contains the result of the operation as well as a copy of the @user_data@ field of the SQE that triggered the operation. It is not necessary to call @io_uring_enter@ to get new events because the kernel can directly modify the completion ring. The system call is only needed if the application wants to block waiting for operations to complete.
+\end{sloppypar}
+The @io_uring_enter@ system call is protected by a lock inside the kernel. This protection means that concurrent calls to @io_uring_enter@ using the same instance are possible, but there is no performance gained from parallel calls to @io_uring_enter@. It is possible to do the first three submission steps in parallel, however, doing so requires careful synchronization.
+@io_uring@ also introduces constraints on the number of simultaneous operations that can be ``in flight''. Obviously, SQEs are allocated from a fixed-size array, meaning that there is a hard limit to how many SQEs can be submitted at once. In addition, the @io_uring_enter@ system call can fail because ``The  kernel [...] ran out of resources to handle [a request]'' or ``The application is attempting to overcommit the number of requests it can  have  pending.''. This restriction means \io request bursts may have to be subdivided and submitted in chunks at a later time.
+\subsection{Multiplexing \io: Submission}
+The submission side is the most complicated aspect of @io_uring@ and its design largely dictates the completion side.
+While it is possible to do the first steps of submission in parallel, the duration of the system call scales with the number of entries submitted. The consequence is that the amount of parallelism used to prepare submissions for the next system call is limited. Beyond this limit, the length of the system call is the throughput limiting factor. I concluded from early experiments that preparing submissions seems to take about as long as the system call itself, which means that with a single @io_uring@ instance, there is no benefit in terms of \io throughput to having more than two \glspl{hthrd}. Therefore the design of the submission engine must manage multiple instances of @io_uring@ running in parallel, effectively sharding @io_uring@ instances. Similarly to scheduling, this sharding can be done privately, \ie, one instance per \gls{proc}, or in decoupled pools, \ie, a pool of \glspl{proc} uses a pool of @io_uring@ instances without one-to-one coupling between any given instance and any given \gls{proc}.
+\subsubsection{Pool of Instances}
+One approach is to have multiple shared instances. \Glspl{thrd} attempting \io operations pick one of the available instances and submit operations to that instance. Since the completion will be sent to the same instance, all instances with pending operations must be polled continuously\footnote{As will be described in Chapter~\ref{practice}, this does not translate into constant CPU usage.}. Since there is no coupling between \glspl{proc} and @io_uring@ instances in this approach, \glspl{thrd} running on more than one \gls{proc} can attempt to submit to the same instance concurrently. Since @io_uring@ effectively sets the amount of sharding needed to avoid contention on its internal locks, performance in this approach is based on two aspects: the synchronization needed to submit does not induce more contention than @io_uring@ already does and the scheme to route \io requests to specific @io_uring@ instances does not introduce contention. This second aspect has an oversized importance because it comes into play before the sharding of instances, and as such, all \glspl{hthrd} can contend on the routing algorithm.
+Allocation in this scheme can be handled fairly easily. Free SQEs, \ie, SQEs that aren't currently being used to represent a request, can be written to safely and have a field called @user_data@ which the kernel only reads to copy to CQEs. Allocation also requires no ordering guarantee as all free SQEs are interchangeable. This requires a simple concurrent bag. The only added complexity is that the number of SQEs is fixed, which means allocation can fail. This failure needs to be pushed up to the routing algorithm, \glspl{thrd} attempting \io operations must not be directed to @io_uring@ instances without any available SQEs. Ideally, the routing algorithm would block operations up-front if none of the instances have available SQEs.
+Once an SQE is allocated, \glspl{thrd} can fill it normally; they simply need to keep track of the SQE index and which instance it belongs to.
+Once an SQE is filled in, what needs to happen is that the SQE must be added to the submission ring buffer, an operation that is not thread-safe by itself, and the kernel must be notified using the @io_uring_enter@ system call. The submission ring buffer is the same size as the pre-allocated SQE buffer, therefore pushing to the ring buffer cannot fail\footnote{This is because it is invalid to have the same \lstinline{sqe} multiple times in the ring buffer.}. However, as mentioned, the system call itself can fail with the expectation that it will be retried once some of the already submitted operations complete. Since multiple SQEs can be submitted to the kernel at once, it is important to strike a balance between batching and latency. Operations that are ready to be submitted should be batched together in few system calls, but at the same time, operations should not be left pending for long periods of time before being submitted. This can be handled by either designating one of the submitting \glspl{thrd} as being responsible for the system call for the current batch of SQEs or by having some other party regularly submitting all ready SQEs, \eg, the poller \gls{thrd} mentioned later in this section.
+In the case of designating a \gls{thrd}, ideally, when multiple \glspl{thrd} attempt to submit operations to the same @io_uring@ instance, all requests would be batched together and one of the \glspl{thrd} would do the system call on behalf of the others, referred to as the \newterm{submitter}. In practice however, it is important that the \io requests are not left pending indefinitely and as such, it may be required to have a current submitter and a next submitter. Indeed, as long as there is a ``next'' submitter, \glspl{thrd} submitting new \io requests can move on, knowing that some future system call will include their request. Once the system call is done, the submitter must also free SQEs so that the allocator can reuse them.
+Finally, the completion side is much simpler since the @io_uring@ system call enforces a natural synchronization point. Polling simply needs to regularly do the system call, go through the produced CQEs and communicate the result back to the originating \glspl{thrd}. Since CQEs only own a signed 32 bit result, in addition to the copy of the @user_data@ field, all that is needed to communicate the result is a simple future~\cite{wiki:future}. If the submission side does not designate submitters, polling can also submit all SQEs as it is polling events.  A simple approach to polling is to allocate a \gls{thrd} per @io_uring@ instance and simply let the poller \glspl{thrd} poll their respective instances when scheduled. This design is especially convenient for reasons explained in Chapter~\ref{practice}.
+With this pool of instances approach, the big advantage is that it is fairly flexible. It does not impose restrictions on what \glspl{thrd} submitting \io operations can and cannot do between allocations and submissions. It also can gracefully handle running out of resources, SQEs or the kernel returning @EBUSY@. The down side to this is that many of the steps used for submitting need complex synchronization to work properly. The routing and allocation algorithm needs to keep track of which ring instances have available SQEs, block incoming requests if no instance is available, prevent barging if \glspl{thrd} are already queued up waiting for SQEs and handle SQEs being freed. The submission side needs to safely append SQEs to the ring buffer, make sure no SQE is dropped or left pending forever, notify the allocation side when SQEs can be reused and handle the kernel returning @EBUSY@. Sharding the @io_uring@ instances should alleviate much of the contention caused by this, but all this synchronization may still have non-zero cost.
+\subsubsection{Private Instances}
+Another approach is to simply create one ring instance per \gls{proc}. This alleviates the need for synchronization on the submissions, requiring only that \glspl{thrd} are not interrupted in between two submission steps. This is effectively the same requirement as using @thread_local@ variables. Since SQEs that are allocated must be submitted to the same ring, on the same \gls{proc}, this effectively forces the application to submit SQEs in allocation order\footnote{The actual requirement is that \glspl{thrd} cannot context switch between allocation and submission. This requirement means that from the subsystem's point of view, the allocation and submission are sequential. To remove this requirement, a \gls{thrd} would need the ability to ``yield to a specific \gls{proc}'', \ie, park with the promise that it will be run next on a specific \gls{proc}, the \gls{proc} attached to the correct ring. This is not a current or planned feature of \CFA.}, greatly simplifying both allocation and submission. In this design, allocation and submission form a partitioned ring buffer as shown in Figure~\ref{fig:pring}. Once added to the ring buffer, the attached \gls{proc} has a significant amount of flexibility with regards to when to do the system call. Possible options are: when the \gls{proc} runs out of \glspl{thrd} to run, after running a given number of \glspl{thrd}, \etc.
+\begin{figure}
+        \centering
+        \input{pivot_ring.pstex_t}
+        \caption[Partitioned ring buffer]{Partitioned ring buffer \smallskip\newline Allocated sqes are appended to the first partition. When submitting, the partition is simply advanced to include all the sqes that should be submitted. The kernel considers the partition as the head of the ring.}
+        \label{fig:pring}
+\end{figure}
+This approach has the advantage that it does not require much of the synchronization needed in the shared approach. This comes at the cost that \glspl{thrd} submitting \io operations have less flexibility: they cannot park or yield, and several exceptional cases are handled poorly. Instances running out of SQEs cannot run \glspl{thrd} wanting to do \io operations. In such a case the \gls{thrd} needs to be moved to a different \gls{proc}; the only current way of achieving this would be to @yield()@ hoping to be scheduled on a different \gls{proc}, which is not guaranteed. Another problematic case is that \glspl{thrd} that do not park for long periods of time will delay the submission of any SQE not already submitted. This issue is similar to the fairness issues of schedulers that use work-stealing, mentioned in the previous chapter.
 \section{Interface}
+Finally, the last important part of the \io subsystem is its interface. There are multiple approaches that can be offered to programmers, each with advantages and disadvantages. The new \io subsystem can replace the C runtime's API or extend it. And in the latter case the interface can go from very similar to vastly different. The following sections discuss some useful options using @read@ as an example. The standard Linux interface for C is:
+@ssize_t read(int fd, void *buf, size_t count);@.
+\subsection{Replacement}
+Replacing the C \glsxtrshort{api}
+\subsection{Synchronous Extension}
+\subsection{Asynchronous Extension}
+\subsection{Interface directly to \lstinline{io_uring}}

doc/theses/thierry_delisle_PhD/thesis/text/runtime.tex

-              r342af53
+              r8e4aa05
 \section{Clusters}
 \CFA allows the option to group user-level threading, in the form of clusters. Both \glspl{thrd} and \glspl{proc} belong to a specific cluster. \Glspl{thrd} are only be scheduled onto \glspl{proc} in the same cluster and scheduling is done independently of other clusters. Figure~\ref{fig:system} shows an overview of the \CFA runtime, which allows programmers to tightly control parallelism. It also opens the door to handling effects like NUMA, by pining clusters to a specific NUMA node\footnote{This is not currently implemented in \CFA, but the only hurdle left is creating a generic interface for cpu masks.}.
+\CFA allows the option to group user-level threading, in the form of clusters. Both \glspl{thrd} and \glspl{proc} belong to a specific cluster. \Glspl{thrd} are only scheduled onto \glspl{proc} in the same cluster and scheduling is done independently of other clusters. Figure~\ref{fig:system} shows an overview of the \CFA runtime, which allows programmers to tightly control parallelism. It also opens the door to handling effects like NUMA, by pinning clusters to a specific NUMA node\footnote{This is not currently implemented in \CFA, but the only hurdle left is creating a generic interface for cpu masks.}.
 \begin{figure}
 …
 \section{\glsxtrshort{io}}\label{prev:io}
 Prior to this work, the \CFA runtime did not add any particular support for \glsxtrshort{io} operations. %\CFA being built on C, this means that,
+While all I/O operations available in C are available in \CFA, \glsxtrshort{io} operations are designed for the POSIX threading model~\cite{pthreads}. Using these 1:1 threading operations in an M:N threading model means I/O operations block \glspl{proc} instead of \glspl{thrd}. While this can work in certain cases, it limits the number of concurrent operations to the number of \glspl{proc} rather than \glspl{thrd}. It also means deadlock can occur because all \glspl{proc} are blocked even if at least one \gls{thrd} is ready to run. A simple example of this type of deadlock would be as follows:
+Prior to this work, the \CFA runtime did not add any particular support for \glsxtrshort{io} operations. While all \glsxtrshort{io} operations available in C are available in \CFA, \glsxtrshort{io} operations are designed for the POSIX threading model~\cite{pthreads}. Using these 1:1 threading operations in an M:N threading model means \glsxtrshort{io} operations block \glspl{proc} instead of \glspl{thrd}. While this can work in certain cases, it limits the number of concurrent operations to the number of \glspl{proc} rather than \glspl{thrd}. It also means deadlock can occur because all \glspl{proc} are blocked even if at least one \gls{thrd} is ready to run. A simple example of this type of deadlock would be as follows:
 \begin{quote}
 Given a simple network program with 2 \glspl{thrd} and a single \gls{proc}, one \gls{thrd} sends network requests to a server and the other \gls{thrd} waits for a response from the server. If the second \gls{thrd} races ahead, it may wait for responses to requests that have not been sent yet. In theory, this should not be a problem, even if the second \gls{thrd} waits, because the first \gls{thrd} is still ready to run and should be able to get CPU time to send the request. With M:N threading, while the first \gls{thrd} is ready, the lone \gls{proc} \emph{cannot} run the first \gls{thrd} if it is blocked in the \glsxtrshort{io} operation of the second \gls{thrd}. If this happens, the system is in a synchronization deadlock\footnote{In this example, the deadlock could be resolved if the server sends unprompted messages to the client. However, this solution is not general and may not be appropriate even in this simple case.}.
 \end{quote}
-Therefore, one of the objective of this work is to introduce \emph{User-Level \glsxtrshort{io}}, like \glslink{uthrding}{User-Level \emph{Threading}} blocks \glspl{thrd} rather than \glspl{proc} when doing \glsxtrshort{io} operations, which entails multiplexing the \glsxtrshort{io} operations of many \glspl{thrd} onto fewer \glspl{proc}. This multiplexing requires that a single \gls{proc} be able to execute multiple I/O operations in parallel. This requirement cannot be done with operations that block \glspl{proc}, \ie \glspl{kthrd}, since the first operation would prevent starting new operations for its blocking duration. Executing I/O operations in parallel requires \emph{asynchronous} \glsxtrshort{io}, sometimes referred to as \emph{non-blocking}, since the \gls{kthrd} does not block.
+\section{Interoperating with C}
+Therefore, one of the objectives of this work is to introduce \emph{User-Level \glsxtrshort{io}}, which, like \glslink{uthrding}{User-Level \emph{Threading}}, blocks \glspl{thrd} rather than \glspl{proc} when doing \glsxtrshort{io} operations, which entails multiplexing the \glsxtrshort{io} operations of many \glspl{thrd} onto fewer \glspl{proc}. This multiplexing requires that a single \gls{proc} be able to execute multiple \glsxtrshort{io} operations in parallel. This requirement cannot be done with operations that block \glspl{proc}, \ie \glspl{kthrd}, since the first operation would prevent starting new operations for its blocking duration. Executing \glsxtrshort{io} operations in parallel requires \emph{asynchronous} \glsxtrshort{io}, sometimes referred to as \emph{non-blocking}, since the \gls{kthrd} does not block.
+\section{Interoperating with \texttt{C}}
 While \glsxtrshort{io} operations are the classical example of operations that block \glspl{kthrd}, the non-blocking challenge extends to all blocking system-calls. The POSIX standard states~\cite[\S~2.9.1]{POSIX17}:
 \begin{quote}
 …
 \begin{enumerate}
         \item Precisely identifying blocking C calls is difficult.
         \item Introducing new code can have a significant impact on general performance.
+        \item Introducing control-point code can have a significant impact on general performance.
 \end{enumerate}
 Because of these consequences, this work does not attempt to ``sandbox'' calls to C. Therefore, it is possible for an unidentified library calls to block a \gls{kthrd} leading to deadlocks in \CFA's M:N threading model, which would not occur in a traditional 1:1 threading model. Currently, all M:N thread systems interacting with UNIX without sandboxing suffer from this problem but manage to work very well in the majority of applications. Therefore, a complete solution to this problem is outside the scope of this thesis.
+Because of these consequences, this work does not attempt to ``sandbox'' calls to C. Therefore, it is possible calls from an unidentified library will block a \gls{kthrd} leading to deadlocks in \CFA's M:N threading model, which would not occur in a traditional 1:1 threading model. Currently, all M:N thread systems interacting with UNIX without sandboxing suffer from this problem but manage to work very well in the majority of applications. Therefore, a complete solution to this problem is outside the scope of this thesis.

doc/theses/thierry_delisle_PhD/thesis/thesis.tex

-              r342af53
+              r8e4aa05
+% uWaterloo Thesis Template for LaTeX
+% Last Updated June 14, 2017 by Stephen Carr, IST Client Services
+% FOR ASSISTANCE, please send mail to rt-IST-CSmathsci@ist.uwaterloo.ca
+% Effective October 2006, the University of Waterloo
+% requires electronic thesis submission. See the uWaterloo thesis regulations at
+%======================================================================
+% University of Waterloo Thesis Template for LaTeX
+% Last Updated November, 2020
+% by Stephen Carr, IST Client Services,
+% University of Waterloo, 200 University Ave. W., Waterloo, Ontario, Canada
+% FOR ASSISTANCE, please send mail to request@uwaterloo.ca
+% DISCLAIMER
+% To the best of our knowledge, this template satisfies the current uWaterloo thesis requirements.
+% However, it is your responsibility to assure that you have met all requirements of the University and your particular department.
+% Many thanks for the feedback from many graduates who assisted the development of this template.
+% Also note that there are explanatory comments and tips throughout this template.
+%======================================================================
+% Some important notes on using this template and making it your own...
+% The University of Waterloo has required electronic thesis submission since October 2006.
+% See the uWaterloo thesis regulations at
 % https://uwaterloo.ca/graduate-studies/thesis.
+% DON'T FORGET TO ADD YOUR OWN NAME AND TITLE in the "hyperref" package
+% configuration below. THIS INFORMATION GETS EMBEDDED IN THE FINAL PDF DOCUMENT.
+% You can view the information if you view Properties of the PDF document.
+% Many faculties/departments also require one or more printed
+% copies. This template attempts to satisfy both types of output.
+% It is based on the standard "book" document class which provides all necessary
+% sectioning structures and allows multi-part theses.
+% DISCLAIMER
+% To the best of our knowledge, this template satisfies the current uWaterloo requirements.
+% However, it is your responsibility to assure that you have met all
+% requirements of the University and your particular department.
+% Many thanks for the feedback from many graduates that assisted the development of this template.
+% -----------------------------------------------------------------------
+% By default, output is produced that is geared toward generating a PDF
+% version optimized for viewing on an electronic display, including
+% hyperlinks within the PDF.
+% This thesis template is geared towards generating a PDF version optimized for viewing on an electronic display, including hyperlinks within the PDF.
+% DON'T FORGET TO ADD YOUR OWN NAME AND TITLE in the "hyperref" package configuration below.
+% THIS INFORMATION GETS EMBEDDED IN THE FINAL PDF DOCUMENT.
+% You can view the information if you view properties of the PDF document.
+% Many faculties/departments also require one or more printed copies.
+% This template attempts to satisfy both types of output.
+% See additional notes below.
+% It is based on the standard "book" document class which provides all necessary sectioning structures and allows multi-part theses.
+% If you are using this template in Overleaf (cloud-based collaboration service), then it is automatically processed and previewed for you as you edit.
+% For people who prefer to install their own LaTeX distributions on their own computers, and process the source files manually, the following notes provide the sequence of tasks:
 % E.g. to process a thesis called "mythesis.tex" based on this template, run:
 % pdflatex mythesis     -- first pass of the pdflatex processor
 % bibtex mythesis       -- generates bibliography from .bib data file(s)
 % makeindex         -- should be run only if an index is used
+% makeindex         -- should be run only if an index is used
 % pdflatex mythesis     -- fixes numbering in cross-references, bibliographic references, glossaries, index, etc.
+% pdflatex mythesis     -- fixes numbering in cross-references, bibliographic references, glossaries, index, etc.
+% If you use the recommended LaTeX editor, Texmaker, you would open the mythesis.tex
+% file, then click the PDFLaTeX button. Then run BibTeX (under the Tools menu).
+% Then click the PDFLaTeX button two more times. If you have an index as well,
+% you'll need to run MakeIndex from the Tools menu as well, before running pdflatex
+% pdflatex mythesis     -- it takes a couple of passes to completely process all cross-references
+% If you use the recommended LaTeX editor, Texmaker, you would open the mythesis.tex file, then click the PDFLaTeX button. Then run BibTeX (under the Tools menu).
+% Then click the PDFLaTeX button two more times.
+% If you have an index as well, you'll need to run MakeIndex from the Tools menu as well, before running pdflatex
 % the last two times.
+% N.B. The "pdftex" program allows graphics in the following formats to be
+% included with the "\includegraphics" command: PNG, PDF, JPEG, TIFF
+% Tip 1: Generate your figures and photos in the size you want them to appear
+% in your thesis, rather than scaling them with \includegraphics options.
+% Tip 2: Any drawings you do should be in scalable vector graphic formats:
+% SVG, PNG, WMF, EPS and then converted to PNG or PDF, so they are scalable in
+% the final PDF as well.
+% Tip 3: Photographs should be cropped and compressed so as not to be too large.
+% To create a PDF output that is optimized for double-sided printing:
+%
+% 1) comment-out the \documentclass statement in the preamble below, and
+% un-comment the second \documentclass line.
+%
+% 2) change the value assigned below to the boolean variable
+% "PrintVersion" from "false" to "true".
+% --------------------- Start of Document Preamble -----------------------
+% Specify the document class, default style attributes, and page dimensions
+% N.B. The "pdftex" program allows graphics in the following formats to be included with the "\includegraphics" command: PNG, PDF, JPEG, TIFF
+% Tip: Generate your figures and photos in the size you want them to appear in your thesis, rather than scaling them with \includegraphics options.
+% Tip: Any drawings you do should be in scalable vector graphic formats: SVG, PNG, WMF, EPS and then converted to PNG or PDF, so they are scalable in the final PDF as well.
+% Tip: Photographs should be cropped and compressed so as not to be too large.
+% To create a PDF output that is optimized for double-sided printing:
+% 1) comment-out the \documentclass statement in the preamble below, and un-comment the second \documentclass line.
+% 2) change the value assigned below to the boolean variable "PrintVersion" from "false" to "true".
+%======================================================================
+%   D O C U M E N T   P R E A M B L E
+% Specify the document class, default style attributes, and page dimensions, etc.
 % For hyperlinked PDF, suitable for viewing on a computer, use this:
 \documentclass[letterpaper,12pt,titlepage,oneside,final]{book}
+% For PDF, suitable for double-sided printing, change the PrintVersion variable below
+% to "true" and use this \documentclass line instead of the one above:
+% For PDF, suitable for double-sided printing, change the PrintVersion variable below to "true" and use this \documentclass line instead of the one above:
 %\documentclass[letterpaper,12pt,titlepage,openright,twoside,final]{book}
+\newcommand{\href}[1]{#1} % does nothing, but defines the command so the
+    % print-optimized version will ignore \href tags (redefined by hyperref pkg).
+% Some LaTeX commands I define for my own nomenclature.
+% If you have to, it's easier to make changes to nomenclature once here than in a million places throughout your thesis!
+\newcommand{\package}[1]{\textbf{#1}} % package names in bold text
+\newcommand{\cmmd}[1]{\textbackslash\texttt{#1}} % command name in tt font
+\newcommand{\href}[1]{#1} % does nothing, but defines the command so the print-optimized version will ignore \href tags (redefined by hyperref pkg).
+%\newcommand{\texorpdfstring}[2]{#1} % does nothing, but defines the command
+% Anything defined here may be redefined by packages added below...
 % This package allows if-then-else control structures.
 …
 \newboolean{PrintVersion}
 \setboolean{PrintVersion}{false}
+% CHANGE THIS VALUE TO "true" as necessary, to improve printed results for hard copies
+% by overriding some options of the hyperref package below.
+% CHANGE THIS VALUE TO "true" as necessary, to improve printed results for hard copies by overriding some options of the hyperref package, called below.
 %\usepackage{nomencl} % For a nomenclature (optional; available from ctan.org)
 \usepackage{amsmath,amssymb,amstext} % Lots of math symbols and environments
+\usepackage{xcolor}
 \usepackage{graphicx} % For including graphics
 % Hyperlinks make it very easy to navigate an electronic document.
+% In addition, this is where you should specify the thesis title
+% and author as they appear in the properties of the PDF document.
+% In addition, this is where you should specify the thesis title and author as they appear in the properties of the PDF document.
 % Use the "hyperref" package
 % N.B. HYPERREF MUST BE THE LAST PACKAGE LOADED; ADD ADDITIONAL PKGS ABOVE
 \usepackage[pagebackref=false]{hyperref} % with basic options
+                % N.B. pagebackref=true provides links back from the References to the body text. This can cause trouble for printing.
+%\usepackage[pdftex,pagebackref=true]{hyperref}
+% N.B. pagebackref=true provides links back from the References to the body text. This can cause trouble for printing.
 \hypersetup{
         plainpages=false,       % needed if Roman numbers in frontpages
         unicode=false,          % non-Latin characters in Acrobat’s bookmarks
         pdftoolbar=true,        % show Acrobat’s toolbar?
         pdfmenubar=true,        % show Acrobat’s menu?
+        unicode=false,          % non-Latin characters in Acrobat's bookmarks
+        pdftoolbar=true,        % show Acrobat's toolbar?
+        pdfmenubar=true,        % show Acrobat's menu?
         pdffitwindow=false,     % window fit to page when opened
         pdfstartview={FitH},    % fits the width of the page to the window
 …
 \ifthenelse{\boolean{PrintVersion}}{   % for improved print quality, change some hyperref options
 \hypersetup{    % override some previously defined hyperref options
         citecolor=black,
         filecolor=black,
         linkcolor=black,
+        citecolor=black,%
+        filecolor=black,%
+        linkcolor=black,%
         urlcolor=black
 }}{} % end of ifthenelse (no else)
 …
 % although it's supposed to be in both the TeX Live and MikTeX distributions. There are also documentation and
 % installation instructions there.
+\renewcommand*{\glstextformat}[1]{\textsf{#1}}
+\makeatletter
+\newcommand*{\glsplainhyperlink}[2]{%
+  \colorlet{currenttext}{.}% store current text color
+  \colorlet{currentlink}{\@linkcolor}% store current link color
+  \hypersetup{linkcolor=currenttext}% set link color
+  \hyperlink{#1}{#2}%
+  \hypersetup{linkcolor=currentlink}% reset to default
+}
+\let\@glslink\glsplainhyperlink
+\makeatother
 \usepackage{csquotes}
 …
 % Setting up the page margins...
+\setlength{\textheight}{9in}\setlength{\topmargin}{-0.45in}\setlength{\headsep}{0.25in}
+\setlength{\textheight}{9in}
+\setlength{\topmargin}{-0.45in}
+\setlength{\headsep}{0.25in}
 % uWaterloo thesis requirements specify a minimum of 1 inch (72pt) margin at the
+% top, bottom, and outside page edges and a 1.125 in. (81pt) gutter
+% margin (on binding side). While this is not an issue for electronic
+% viewing, a PDF may be printed, and so we have the same page layout for
+% both printed and electronic versions, we leave the gutter margin in.
+% top, bottom, and outside page edges and a 1.125 in. (81pt) gutter margin (on binding side).
+% While this is not an issue for electronic viewing, a PDF may be printed, and so we have the same page layout for both printed and electronic versions, we leave the gutter margin in.
 % Set margins to minimum permitted by uWaterloo thesis regulations:
 \setlength{\marginparwidth}{0pt} % width of margin notes
 …
 \setlength{\evensidemargin}{0.125in} % Adds 1/8 in. to binding side of all
 % even-numbered pages when the "twoside" printing option is selected
+\setlength{\oddsidemargin}{0.125in} % Adds 1/8 in. to the left of all pages
+% when "oneside" printing is selected, and to the left of all odd-numbered
+% pages when "twoside" printing is selected
+\setlength{\textwidth}{6.375in} % assuming US letter paper (8.5 in. x 11 in.) and
+% side margins as above
+\setlength{\oddsidemargin}{0.125in} % Adds 1/8 in. to the left of all pages when "oneside" printing is selected, and to the left of all odd-numbered pages when "twoside" printing is selected
+\setlength{\textwidth}{6.375in} % assuming US letter paper (8.5 in. x 11 in.) and side margins as above
 \raggedbottom
+% The following statement specifies the amount of space between
+% paragraphs. Other reasonable specifications are \bigskipamount and \smallskipamount.
+% The following statement specifies the amount of space between paragraphs. Other reasonable specifications are \bigskipamount and \smallskipamount.
 \setlength{\parskip}{\medskipamount}
+% The following statement controls the line spacing.  The default
+% spacing corresponds to good typographic conventions and only slight
+% changes (e.g., perhaps "1.2"), if any, should be made.
+% The following statement controls the line spacing.
+% The default spacing corresponds to good typographic conventions and only slight changes (e.g., perhaps "1.2"), if any, should be made.
 \renewcommand{\baselinestretch}{1} % this is the default line space setting
+% By default, each chapter will start on a recto (right-hand side)
+% page.  We also force each section of the front pages to start on
+% a recto page by inserting \cleardoublepage commands.
+% In many cases, this will require that the verso page be
+% blank and, while it should be counted, a page number should not be
+% printed.  The following statements ensure a page number is not
+% printed on an otherwise blank verso page.
+% By default, each chapter will start on a recto (right-hand side) page.
+% We also force each section of the front pages to start on a recto page by inserting \cleardoublepage commands.
+% In many cases, this will require that the verso (left-hand) page be blank, and while it should be counted, a page number should not be printed.
+% The following statements ensure a page number is not printed on an otherwise blank verso page.
 \let\origdoublepage\cleardoublepage
 \newcommand{\clearemptydoublepage}{%
 …
 \input{common}
 \CFAStyle                                               % CFA code-style for all languages
 \lstset{basicstyle=\linespread{0.9}\tt}
+\lstset{language=CFA,basicstyle=\linespread{0.9}\tt}    % CFA default language
 % glossary of terms to use
 …
 \makeindex
+%======================================================================
+%   L O G I C A L    D O C U M E N T -- the content of your thesis
+\newcommand\io{\glsxtrshort{io}\xspace}%
+%======================================================================
+%   L O G I C A L    D O C U M E N T
+% The logical document contains the main content of your thesis.
+% Being a large document, it is a good idea to divide your thesis into several files, each one containing one chapter or other significant chunk of content, so you can easily shuffle things around later if desired.
 %======================================================================
 \begin{document}
-% For a large document, it is a good idea to divide your thesis
-% into several files, each one containing one chapter.
-% To illustrate this idea, the "front pages" (i.e., title page,
-% declaration, borrowers' page, abstract, acknowledgements,
-% dedication, table of contents, list of tables, list of figures,
-% nomenclature) are contained within the file "uw-ethesis-frontpgs.tex" which is
-% included into the document by the following statement.
 %----------------------------------------------------------------------
 % FRONT MATERIAL
+% title page, declaration, borrowers' page, abstract, acknowledgements,
+% dedication, table of contents, list of tables, list of figures, nomenclature, etc.
 %----------------------------------------------------------------------
 \input{text/front.tex}
 %----------------------------------------------------------------------
 % MAIN BODY
 %----------------------------------------------------------------------
 % Because this is a short document, and to reduce the number of files
 % needed for this template, the chapters are not separate
 % documents as suggested above, but you get the idea. If they were
 % separate documents, they would each start with the \chapter command, i.e,
+% do not contain \documentclass or \begin{document} and \end{document} commands.
+% We suggest using a separate file for each chapter of your thesis.
+% Start each chapter file with the \chapter command.
+% Only use \documentclass or \begin{document} and \end{document} commands in this master document.
+% Tip: Putting each sentence on a new line is a way to simplify later editing.
+%----------------------------------------------------------------------
 \part{Introduction}
 \input{text/intro.tex}
 …
 \part{Design}
 \input{text/core.tex}
+\input{text/io.tex}
 \input{text/practice.tex}
-\input{text/io.tex}
 \part{Evaluation}
 \label{Evaluation}
 …
 %----------------------------------------------------------------------
 % END MATERIAL
+%----------------------------------------------------------------------
+% B I B L I O G R A P H Y
+% -----------------------
+% The following statement selects the style to use for references.  It controls the sort order of the entries in the bibliography and also the formatting for the in-text labels.
+% Bibliography, Appendices, Index, etc.
+%----------------------------------------------------------------------
+% Bibliography
+% The following statement selects the style to use for references.
+% It controls the sort order of the entries in the bibliography and also the formatting for the in-text labels.
 \bibliographystyle{plain}
 % This specifies the location of the file containing the bibliographic information.
+% It assumes you're using BibTeX (if not, why not?).
+\cleardoublepage % This is needed if the book class is used, to place the anchor in the correct page,
+                 % because the bibliography will start on its own page.
+                 % Use \clearpage instead if the document class uses the "oneside" argument
+% It assumes you're using BibTeX to manage your references (if not, why not?).
+\cleardoublepage % This is needed if the "book" document class is used, to place the anchor in the correct page, because the bibliography will start on its own page.
+% Use \clearpage instead if the document class uses the "oneside" argument
 \phantomsection  % With hyperref package, enables hyperlinking from the table of contents to bibliography
 % The following statement causes the title "References" to be used for the bibliography section:
 …
 \bibliography{local,pl}
 % Tip 5: You can create multiple .bib files to organize your references.
+% Tip: You can create multiple .bib files to organize your references.
 % Just list them all in the \bibliography command, separated by commas (no spaces).
 % % The following statement causes the specified references to be added to the bibliography% even if they were not
 % % cited in the text. The asterisk is a wildcard that causes all entries in the bibliographic database to be included (optional).
+% The following statement causes the specified references to be added to the bibliography even if they were not cited in the text.
+% The asterisk is a wildcard that causes all entries in the bibliographic database to be included (optional).
 % \nocite{*}
+%----------------------------------------------------------------------
+% Appendices
 % The \appendix statement indicates the beginning of the appendices.
 \appendix
 % Add a title page before the appendices and a line in the Table of Contents
+% Add an un-numbered title page before the appendices and a line in the Table of Contents
 \chapter*{APPENDICES}
 \addcontentsline{toc}{chapter}{APPENDICES}
+% Appendices are just more chapters, with different labeling (letters instead of numbers).
 %======================================================================
 \chapter[PDF Plots From Matlab]{Matlab Code for Making a PDF Plot}
 …
 %\input{thesis.ind}                             % index
+\phantomsection
+\end{document}
+\phantomsection         % allows hyperref to link to the correct page
+%----------------------------------------------------------------------
+\end{document} % end of logical document

doc/user/figures/Cdecl.fig

-              r342af53
+              r8e4aa05
 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
 1200 3600 1200 3600 1350 2850 1350 2850 1200
 1 0 50 -1 4 10 0.0000 2 120 90 2925 1325 0\001
 1 0 50 -1 4 10 0.0000 2 120 90 3075 1325 1\001
 1 0 50 -1 4 10 0.0000 2 120 90 3225 1325 2\001
 1 0 50 -1 4 10 0.0000 2 120 90 3375 1325 3\001
 1 0 50 -1 4 10 0.0000 2 120 90 3525 1325 4\001
+1 0 50 -1 4 11 0.0000 2 120 90 2925 1325 0\001
+1 0 50 -1 4 11 0.0000 2 120 90 3075 1325 1\001
+1 0 50 -1 4 11 0.0000 2 120 90 3225 1325 2\001
+1 0 50 -1 4 11 0.0000 2 120 90 3375 1325 3\001
+1 0 50 -1 4 11 0.0000 2 120 90 3525 1325 4\001
 -6
 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
 …
 1 1.00 45.00 60.00
 1275 2850 1275
 1 0 50 -1 4 10 0.0000 2 120 90 1350 1650 0\001
 1 0 50 -1 4 10 0.0000 2 120 90 1500 1650 1\001
 1 0 50 -1 4 10 0.0000 2 120 90 1650 1650 2\001
 1 0 50 -1 4 10 0.0000 2 120 90 1800 1650 3\001
 1 0 50 -1 4 10 0.0000 2 120 90 1950 1650 4\001
 1 0 50 -1 4 10 0.0000 2 90 90 1200 1325 x\001
 1 0 50 -1 4 10 0.0000 2 90 90 2400 1325 x\001
+1 0 50 -1 4 11 0.0000 2 120 90 1350 1650 0\001
+1 0 50 -1 4 11 0.0000 2 120 90 1500 1650 1\001
+1 0 50 -1 4 11 0.0000 2 120 90 1650 1650 2\001
+1 0 50 -1 4 11 0.0000 2 120 90 1800 1650 3\001
+1 0 50 -1 4 11 0.0000 2 120 90 1950 1650 4\001
+1 0 50 -1 4 11 0.0000 2 90 90 1200 1325 x\001
+1 0 50 -1 4 11 0.0000 2 90 90 2400 1325 x\001

doc/user/user.tex

-              r342af53
+              r8e4aa05
 %% Created On       : Wed Apr  6 14:53:29 2016
 %% Last Modified By : Peter A. Buhr
 %% Last Modified On : Mon Oct  5 08:57:29 2020
 %% Update Count     : 3998
+%% Last Modified On : Mon Feb 15 13:48:53 2021
+%% Update Count     : 4452
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 …
 \usepackage{mathptmx}                                   % better math font with "times"
 \usepackage[usenames]{color}
+\usepackage[dvips,plainpages=false,pdfpagelabels,pdfpagemode=UseNone,colorlinks=true,pagebackref=true,linkcolor=blue,citecolor=blue,urlcolor=blue,pagebackref=true,breaklinks=true]{hyperref}
+\usepackage{breakurl}
+\renewcommand\footnoterule{\kern -3pt\rule{0.3\linewidth}{0.15pt}\kern 2pt}
+\usepackage[pagewise]{lineno}
+\renewcommand{\linenumberfont}{\scriptsize\sffamily}
+\usepackage[firstpage]{draftwatermark}
+\SetWatermarkLightness{0.9}
+% Default underscore is too low and wide. Cannot use lstlisting "literate" as replacing underscore
+% removes it as a variable-name character so keywords in variables are highlighted. MUST APPEAR
+% AFTER HYPERREF.
+\renewcommand{\textunderscore}{\leavevmode\makebox[1.2ex][c]{\rule{1ex}{0.075ex}}}
+\setlength{\topmargin}{-0.45in}                                                 % move running title into header
+\setlength{\headsep}{0.25in}
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \newcommand{\CFALatin}{}
 % inline code ©...© (copyright symbol) emacs: C-q M-)
 …
 % math escape $...$ (dollar symbol)
 \input{common}                                          % common CFA document macros
-\usepackage[dvips,plainpages=false,pdfpagelabels,pdfpagemode=UseNone,colorlinks=true,pagebackref=true,linkcolor=blue,citecolor=blue,urlcolor=blue,pagebackref=true,breaklinks=true]{hyperref}
-\usepackage{breakurl}
-\renewcommand\footnoterule{\kern -3pt\rule{0.3\linewidth}{0.15pt}\kern 2pt}
-\usepackage[pagewise]{lineno}
-\renewcommand{\linenumberfont}{\scriptsize\sffamily}
-\usepackage[firstpage]{draftwatermark}
-\SetWatermarkLightness{0.9}
-% Default underscore is too low and wide. Cannot use lstlisting "literate" as replacing underscore
-% removes it as a variable-name character so keywords in variables are highlighted. MUST APPEAR
-% AFTER HYPERREF.
-\renewcommand{\textunderscore}{\leavevmode\makebox[1.2ex][c]{\rule{1ex}{0.075ex}}}
-\setlength{\topmargin}{-0.45in}                                                 % move running title into header
-\setlength{\headsep}{0.25in}
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \CFAStyle                                                                                               % use default CFA format-style
+\lstset{language=CFA}                                                                   % CFA default language
 \lstnewenvironment{C++}[1][]                            % use C++ style
 {\lstset{language=C++,moredelim=**[is][\protect\color{red}]{®}{®},#1}}
+{\lstset{language=C++,moredelim=**[is][\protect\color{red}]{@}{@},#1}}
 {}
 …
 \newcommand{\Emph}[2][red]{{\color{#1}\textbf{\emph{#2}}}}
 \newcommand{\R}[1]{\Textbf{#1}}
+\newcommand{\RC}[1]{\Textbf{\LstBasicStyle{#1}}}
 \newcommand{\B}[1]{{\Textbf[blue]{#1}}}
 \newcommand{\G}[1]{{\Textbf[OliveGreen]{#1}}}
 …
 \author{
+\huge \CFA Team \medskip \\
+\Large Andrew Beach, Richard Bilson, Peter A. Buhr, Thierry Delisle, \smallskip \\
+\Large Glen Ditchfield, Rodolfo G. Esteves, Aaron Moss, Rob Schluntz
+\huge \CFA Team (past and present) \medskip \\
+\Large Andrew Beach, Richard Bilson, Michael Brooks, Peter A. Buhr, Thierry Delisle, \smallskip \\
+\Large Glen Ditchfield, Rodolfo G. Esteves, Aaron Moss, Colby Parsons, Rob Schluntz, \smallskip \\
+\Large Fangren Yu, Mubeen Zulfiqar
 }% author
 …
 \vspace*{\fill}
 \noindent
 \copyright\,2016 \CFA Project \\ \\
+\copyright\,2016, 2018, 2021 \CFA Project \\ \\
 \noindent
 This work is licensed under the Creative Commons Attribution 4.0 International License.
 …
 \section{Introduction}
 \CFA{}\index{cforall@\CFA}\footnote{Pronounced ``\Index*{C-for-all}'', and written \CFA, CFA, or \CFL.} is a modern general-purpose programming-language, designed as an evolutionary step forward for the C programming language.
+\CFA{}\index{cforall@\CFA}\footnote{Pronounced ``\Index*{C-for-all}'', and written \CFA, CFA, or \CFL.} is a modern general-purpose concurrent programming-language, designed as an evolutionary step forward for the C programming language.
 The syntax of \CFA builds from C and should look immediately familiar to C/\Index*[C++]{\CC{}} programmers.
 % Any language feature that is not described here can be assumed to be using the standard \Celeven syntax.
 \CFA adds many modern programming-language features that directly lead to increased \emph{\Index{safety}} and \emph{\Index{productivity}}, while maintaining interoperability with existing C programs and achieving similar performance.
+\CFA adds many modern features that directly lead to increased \emph{\Index{safety}} and \emph{\Index{productivity}}, while maintaining interoperability with existing C programs and achieving similar performance.
 Like C, \CFA is a statically typed, procedural (non-\Index{object-oriented}) language with a low-overhead runtime, meaning there is no global \Index{garbage-collection}, but \Index{regional garbage-collection}\index{garbage-collection!regional} is possible.
 The primary new features include polymorphic routines and types, exceptions, concurrency, and modules.
 …
 instead, a programmer evolves a legacy program into \CFA by incrementally incorporating \CFA features.
 As well, new programs can be written in \CFA using a combination of C and \CFA features.
+In many ways, \CFA is to C as \Index{Scala}~\cite{Scala} is to Java, providing a vehicle for new typing and control-flow capabilities on top of a highly popular programming language allowing immediate dissemination.
 \Index*[C++]{\CC{}}~\cite{c++:v1} had a similar goal 30 years ago, allowing object-oriented programming to be incrementally added to C.
 …
 For example, the following programs compare the C, \CFA, and \CC I/O mechanisms, where the programs output the same result.
 \begin{center}
 \begin{tabular}{@{}l@{\hspace{1.5em}}l@{\hspace{1.5em}}l@{}}
 \multicolumn{1}{c@{\hspace{1.5em}}}{\textbf{C}} & \multicolumn{1}{c}{\textbf{\CFA}}     & \multicolumn{1}{c}{\textbf{\CC}}      \\
 \begin{cfa}
 #include <stdio.h>§\indexc{stdio.h}§
+\begin{tabular}{@{}l@{\hspace{1em}}l@{\hspace{1em}}l@{}}
+\multicolumn{1}{c@{\hspace{1em}}}{\textbf{C}}   & \multicolumn{1}{c}{\textbf{\CFA}}     & \multicolumn{1}{c}{\textbf{\CC}}      \\
+\begin{cfa}
+#include <stdio.h>$\indexc{stdio.h}$
 int main( void ) {
         int x = 0, y = 1, z = 2;
         ®printf( "%d %d %d\n", x, y, z );®
+        @printf( "%d %d %d\n", x, y, z );@
+}
 \end{cfa}
+&
 \begin{cfa}
 #include <fstream>§\indexc{fstream}§
+#include <fstream>$\indexc{fstream}$
 int main( void ) {
         int x = 0, y = 1, z = 2;
         ®sout | x | y | z;®§\indexc{sout}§
+        @sout | x | y | z;@$\indexc{sout}$
+}
 \end{cfa}
+&
 \begin{cfa}
 #include <iostream>§\indexc{iostream}§
+#include <iostream>$\indexc{iostream}$
 using namespace std;
 int main() {
         int x = 0, y = 1, z = 2;
         ®cout<<x<<" "<<y<<" "<<z<<endl;®
+        @cout<<x<<" "<<y<<" "<<z<<endl;@
+}
 \end{cfa}
 \end{tabular}
 \end{center}
 While the \CFA I/O looks similar to the \Index*[C++]{\CC{}} output style, there are important differences, such as automatic spacing between variables as in \Index*{Python} (see~\VRef{s:IOLibrary}).
+While \CFA I/O \see{\VRef{s:StreamIOLibrary}} looks similar to \Index*[C++]{\CC{}}, there are important differences, such as automatic spacing between variables and an implicit newline at the end of the expression list, similar to \Index*{Python}~\cite{Python}.
 …
 \section{Why fix C?}
 The C programming language is a foundational technology for modern computing with millions of lines of code implementing everything from hobby projects to commercial operating-systems.
+The C programming language is a foundational technology for modern computing with billions of lines of code implementing everything from hobby projects to commercial operating-systems.
 This installation base and the programmers producing it represent a massive software-engineering investment spanning decades and likely to continue for decades more.
 Even with all its problems, C continues to be popular because it allows writing software at virtually any level in a computer system without restriction.
 For system programming, where direct access to hardware, storage management, and real-time issues are a requirement, C is usually the only language of choice.
 The TIOBE index~\cite{TIOBE} for February 2020 ranks the top six most \emph{popular} programming languages as \Index*{Java} 17.4\%, C 16.8\%, Python 9.3\%, \Index*[C++]{\CC{}} 6.2\%, \Csharp 5.9\%, Visual Basic 5.9\% = 61.5\%, where the next 50 languages are less than 2\% each, with a long tail.
+For system programming, where direct access to hardware, storage management, and real-time issues are a requirement, C is the only language of choice.
+The TIOBE index~\cite{TIOBE} for February 2021 ranks the top six most \emph{popular} programming languages as C 17.4\%, \Index*{Java} 12\%, Python 12\%, \Index*[C++]{\CC{}} 7.6\%, \Csharp 4\%, Visual Basic 3.8\% = 56.8\%, where the next 50 languages are less than 2\% each, with a long tail.
 The top 4 rankings over the past 35 years are:
 \begin{center}
 \setlength{\tabcolsep}{10pt}
 \begin{tabular}{@{}rcccccccc@{}}
                 & 2020  & 2015  & 2010  & 2005  & 2000  & 1995  & 1990  & 1985  \\ \hline
 Java    & 1             & 2             & 1             & 2             & 3             & -             & -             & -             \\
 \R{C}   & \R{2} & \R{1} & \R{2} & \R{1} & \R{1} & \R{2} & \R{1} & \R{1} \\
 Python  & 3             & 7             & 6             & 6             & 22    & 21    & -             & -             \\
 \CC             & 4             & 4             & 4             & 3             & 2             & 1             & 2             & 12    \\
+                & 2021  & 2016  & 2011  & 2006  & 2001  & 1996  & 1991  & 1986  \\ \hline
+\R{C}   & \R{1} & \R{2} & \R{2} & \R{1} & \R{1} & \R{1} & \R{1} & \R{1} \\
+Java    & 2             & 1             & 1             & 2             & 3             & 28    & -             & -             \\
+Python  & 3             & 5             & 6             & 7             & 23    & 13    & -             & -             \\
+\CC             & 4             & 3             & 3             & 3             & 2             & 2             & 2             & 8             \\
 \end{tabular}
 \end{center}
 …
 As stated, the goal of the \CFA project is to engineer modern language-features into C in an evolutionary rather than revolutionary way.
 \CC~\cite{C++14,C++} is an example of a similar project;
 however, it largely extended the C language, and did not address most of C's existing problems.\footnote{%
+however, it largely extended the C language, and did not address many of C's existing problems.\footnote{%
 Two important existing problems addressed were changing the type of character literals from ©int© to ©char© and enumerator from ©int© to the type of its enumerators.}
 \Index*{Fortran}~\cite{Fortran08}, \Index*{Ada}~\cite{Ada12}, and \Index*{Cobol}~\cite{Cobol14} are examples of programming languages that took an evolutionary approach, where modern language-features (\eg objects, concurrency) are added and problems fixed within the framework of the existing language.
 …
 The result of this project is a language that is largely backwards compatible with \Index*[C11]{\Celeven{}}~\cite{C11}, but fixes many of the well known C problems while adding modern language-features.
+To achieve these goals required a significant engineering exercise, where we had to ``think inside the existing C box''.
+Without these significant extensions to C, it is unable to cope with the needs of modern programming problems and programmers;
+as a result, it will fade into disuse.
+Considering the large body of existing C code and programmers, there is significant impetus to ensure C is transformed into a modern programming language.
+To achieve these goals required a significant engineering exercise, \ie ``thinking \emph{inside} the C box''.
+Considering the large body of existing C code and programmers, there is significant impetus to ensure C is transformed into a modern language.
 While \Index*[C11]{\Celeven{}} made a few simple extensions to the language, nothing was added to address existing problems in the language or to augment the language with modern language-features.
 While some may argue that modern language-features may make C complex and inefficient, it is clear a language without modern capabilities is insufficient for the advanced programming problems existing today.
 …
 \section{History}
+The \CFA project started with \Index*{Dave Till}\index{Till, Dave}'s \Index*{K-W C}~\cite{Buhr94a,Till89}, which extended C with new declaration syntax, multiple return values from routines, and advanced assignment capabilities using the notion of tuples.
+(See~\cite{Werther96} for similar work in \Index*[C++]{\CC{}}.)
+The \CFA project started with \Index*{Dave Till}\index{Till, Dave}'s \Index*{K-W C}~\cite{Buhr94a,Till89}, which extended C with new declaration syntax, multiple return values from routines, and advanced assignment capabilities using the notion of tuples \see{\cite{Werther96} for similar work in \Index*[C++]{\CC{}}}.
 The first \CFA implementation of these extensions was by \Index*{Rodolfo Esteves}\index{Esteves, Rodolfo}~\cite{Esteves04}.
 The signature feature of \CFA is \emph{\Index{overload}able} \Index{parametric-polymorphic} functions~\cite{forceone:impl,Cormack90,Duggan96} with functions generalized using a ©forall© clause (giving the language its name):
 \begin{cfa}
 ®forall( otype T )® T identity( T val ) { return val; }
 int forty_two = identity( 42 ); §\C{// T is bound to int, forty\_two == 42}§
+@forall( otype T )@ T identity( T val ) { return val; }
+int forty_two = identity( 42 ); $\C{// T is bound to int, forty\_two == 42}$
 \end{cfa}
 % extending the C type system with parametric polymorphism and overloading, as opposed to the \Index*[C++]{\CC{}} approach of object-oriented extensions.
 \CFA{}\hspace{1pt}'s polymorphism was originally formalized by \Index*{Glen Ditchfield}\index{Ditchfield, Glen}~\cite{Ditchfield92}, and first implemented by \Index*{Richard Bilson}\index{Bilson, Richard}~\cite{Bilson03}.
 However, at that time, there was little interest in extending C, so work did not continue.
 As the saying goes, ``\Index*{What goes around, comes around.}'', and there is now renewed interest in the C programming language because of legacy code-bases, so the \CFA project has been restarted.
+As the saying goes, ``\Index*{What goes around, comes around.}'', and there is now renewed interest in the C programming language because of the legacy code-base, so the \CFA project was restarted in 2015.
 …
 This feature allows \CFA programmers to take advantage of the existing panoply of C libraries to access thousands of external software features.
 Language developers often state that adequate \Index{library support} takes more work than designing and implementing the language itself.
 Fortunately, \CFA, like \Index*[C++]{\CC{}}, starts with immediate access to all existing C libraries, and in many cases, can easily wrap library routines with simpler and safer interfaces, at very low cost.
+Fortunately, \CFA, like \Index*[C++]{\CC{}}, starts with immediate access to all existing C libraries, and in many cases, can easily wrap library routines with simpler and safer interfaces, at zero or very low cost.
 Hence, \CFA begins by leveraging the large repository of C libraries, and then allows programmers to incrementally augment their C programs with modern \Index{backward-compatible} features.
 …
 double key = 5.0, vals[10] = { /* 10 sorted floating values */ };
 double * val = (double *)bsearch( &key, vals, 10, sizeof(vals[0]), comp ); §\C{// search sorted array}§
+double * val = (double *)bsearch( &key, vals, 10, sizeof(vals[0]), comp ); $\C{// search sorted array}$
 \end{cfa}
 which can be augmented simply with polymorphic, type-safe, \CFA-overloaded wrappers:
 …
 forall( otype T | { int ?<?( T, T ); } ) unsigned int bsearch( T key, const T * arr, size_t size ) {
         T * result = bsearch( key, arr, size ); §\C{// call first version}§
         return result ? result - arr : size; } §\C{// pointer subtraction includes sizeof(T)}§
 double * val = bsearch( 5.0, vals, 10 ); §\C{// selection based on return type}§
+        T * result = bsearch( key, arr, size ); $\C{// call first version}$
+        return result ? result - arr : size; } $\C{// pointer subtraction includes sizeof(T)}$
+double * val = bsearch( 5.0, vals, 10 ); $\C{// selection based on return type}$
 int posn = bsearch( 5.0, vals, 10 );
 \end{cfa}
 …
 \begin{cfa}
 forall( dtype T | sized(T) ) T * malloc( void ) { return (T *)malloc( sizeof(T) ); }
 int * ip = malloc(); §\C{// select type and size from left-hand side}§
+int * ip = malloc(); $\C{// select type and size from left-hand side}$
 double * dp = malloc();
 struct S {...} * sp = malloc();
 …
 However, it is necessary to differentiate between C and \CFA code because of name \Index{overload}ing, as for \CC.
 For example, the C math-library provides the following routines for computing the absolute value of the basic types: ©abs©, ©labs©, ©llabs©, ©fabs©, ©fabsf©, ©fabsl©, ©cabsf©, ©cabs©, and ©cabsl©.
+Whereas, \CFA wraps each of these routines into ones with the overloaded name ©abs©:
+\begin{cfa}
+char ®abs®( char );
+extern "C" { int ®abs®( int ); } §\C{// use default C routine for int}§
+long int ®abs®( long int );
+long long int ®abs®( long long int );
+float ®abs®( float );
+double ®abs®( double );
+long double ®abs®( long double );
+float _Complex ®abs®( float _Complex );
+double _Complex ®abs®( double _Complex );
+long double _Complex ®abs®( long double _Complex );
+\end{cfa}
+The problem is the name clash between the library routine ©abs© and the \CFA names ©abs©.
+Hence, names appearing in an ©extern "C"© block have \newterm*{C linkage}.
+Then overloading polymorphism uses a mechanism called \newterm{name mangling}\index{mangling!name} to create unique names that are different from C names, which are not mangled.
+Hence, there is the same need, as in \CC, to know if a name is a C or \CFA name, so it can be correctly formed.
+There is no way around this problem, other than C's approach of creating unique names for each pairing of operation and types.
+This example strongly illustrates a core idea in \CFA: \emph{the \Index{power of a name}}.
+Whereas, \CFA wraps each of these routines into one overloaded name ©abs©:
+\begin{cfa}
+char @abs@( char );
+extern "C" { int @abs@( int ); } $\C{// use default C routine for int}$
+long int @abs@( long int );
+long long int @abs@( long long int );
+float @abs@( float );
+double @abs@( double );
+long double @abs@( long double );
+float _Complex @abs@( float _Complex );
+double _Complex @abs@( double _Complex );
+long double _Complex @abs@( long double _Complex );
+\end{cfa}
+The problem is a \Index{name clash} between the C name ©abs© and the \CFA names ©abs©, resulting in two name linkages\index{C linkage}: ©extern "C"© and ©extern "Cforall"© (default).
+Overloaded names must use \newterm{name mangling}\index{mangling!name} to create unique names that are different from unmangled C names.
+Hence, there is the same need as in \CC to know if a name is a C or \CFA name, so it can be correctly formed.
+The only way around this problem is C's approach of creating unique names for each pairing of operation and type.
+This example illustrates a core idea in \CFA: \emph{the \Index{power of a name}}.
 The name ``©abs©'' evokes the notion of absolute value, and many mathematical types provide the notion of absolute value.
 Hence, knowing the name ©abs© is sufficient to apply it to any type where it is applicable.
 …
 \section[Compiling a CFA Program]{Compiling a \CFA Program}
+\section{\CFA Compilation}
 The command ©cfa© is used to compile a \CFA program and is based on the \Index{GNU} \Indexc{gcc} command, \eg:
 \begin{cfa}
+cfa§\indexc{cfa}\index{compilation!cfa@©cfa©}§ [ gcc-options ] [ C/§\CFA{}§ source-files ] [ assembler/loader files ]
+\end{cfa}
+\CFA programs have the following ©gcc© flags turned on:
+\begin{description}
+cfa$\indexc{cfa}\index{compilation!cfa@©cfa©}$ [ gcc/$\CFA{}$-options ] [ C/$\CFA{}$ source-files ] [ assembler/loader files ]
+\end{cfa}
+There is no ordering among options (flags) and files, unless an option has an argument, which must appear immediately after the option, with or without a space separating option and argument.
+\CFA has the following ©gcc© flags turned on:
+\begin{description}[topsep=0pt]
 \item
 \Indexc{-std=gnu11}\index{compilation option!-std=gnu11@{©-std=gnu11©}}
 …
 Use the traditional GNU semantics for inline routines in C11 mode, which allows inline routines in header files.
 \end{description}
+The following new \CFA options are available:
+\begin{description}
+\CFA has the following new options:
+\begin{description}[topsep=0pt]
 \item
 \Indexc{-CFA}\index{compilation option!-CFA@©-CFA©}
 Only the C preprocessor and the \CFA translator steps are performed and the transformed program is written to standard output, which makes it possible to examine the code generated by the \CFA translator.
+Only the C preprocessor (flag ©-E©) and the \CFA translator steps are performed and the transformed program is written to standard output, which makes it possible to examine the code generated by the \CFA translator.
 The generated code starts with the standard \CFA \Index{prelude}.
+\item
+\Indexc{-XCFA}\index{compilation option!-XCFA@©-XCFA©}
+Pass next flag as-is to the ©cfa-cpp© translator (see details below).
 \item
 \Indexc{-debug}\index{compilation option!-debug@©-debug©}
 The program is linked with the debugging version of the runtime system.
 The debug version performs runtime checks to help during the debugging phase of a \CFA program, but can substantially slow program execution.
+The debug version performs runtime checks to aid the debugging phase of a \CFA program, but can substantially slow program execution.
 The runtime checks should only be removed after the program is completely debugged.
 \textbf{This option is the default.}
 …
 \item
 \Indexc{-no-include-stdhdr}\index{compilation option!-no-include-stdhdr@©-no-include-stdhdr©}
 Do not supply ©extern "C"© wrappers for \Celeven standard include files (see~\VRef{s:StandardHeaders}).
+Do not supply ©extern "C"© wrappers for \Celeven standard include files \see{\VRef{s:StandardHeaders}}.
 \textbf{This option is \emph{not} the default.}
 \end{comment}
 …
 \begin{cfa}
 #ifndef __CFORALL__
 #include <stdio.h>§\indexc{stdio.h}§ §\C{// C header file}§
+#include <stdio.h>$\indexc{stdio.h}$ $\C{// C header file}$
 #else
 #include <fstream>§\indexc{fstream}§ §\C{// \CFA header file}§
+#include <fstream>$\indexc{fstream}$ $\C{// \CFA header file}$
 #endif
 \end{cfa}
 …
 The \CFA translator has multiple steps.
 The following flags control how the tranlator works, the stages run, and printing within a stage.
+The following flags control how the translator works, the stages run, and printing within a stage.
 The majority of these flags are used by \CFA developers, but some are occasionally useful to programmers.
+Each option must be escaped with \Indexc{-XCFA}\index{translator option!-XCFA@{©-XCFA©}} to direct it to the compiler step, similar to the ©-Xlinker© flag for the linker, \eg:
+\begin{lstlisting}[language=sh]
+cfa $test$.cfa -CFA -XCFA -p # print translated code without printing the standard prelude
+cfa $test$.cfa -XCFA -P -XCFA parse -XCFA -n # show program parse without prelude
+\end{lstlisting}
 \begin{description}[topsep=5pt,itemsep=0pt,parsep=0pt]
 \item
+\Indexc{-h}\index{translator option!-h@{©-h©}}, \Indexc{--help}\index{translator option!--help@{©--help©}} \, print help message
+\item
+\Indexc{-l}\index{translator option!-l@{©-l©}}, \Indexc{--libcfa}\index{translator option!--libcfa@{©--libcfa©}} \, generate libcfa.c
+\Indexc{-c}\index{translator option!-c@{©-c©}}, \Indexc{--colors}\index{translator option!--colors@{©--colors©}} \, diagnostic color: ©never©, ©always©, \lstinline[deletekeywords=auto]{auto}
+\item
+\Indexc{-g}\index{translator option!-g@{©-g©}}, \Indexc{--gdb}\index{translator option!--gdb@{©--gdb©}} \, wait for gdb to attach
+\item
+\Indexc{-h}\index{translator option!-h@{©-h©}}, \Indexc{--help}\index{translator option!--help@{©--help©}} \, print translator help message
+\item
+\Indexc{-l}\index{translator option!-l@{©-l©}}, \Indexc{--libcfa}\index{translator option!--libcfa@{©--libcfa©}} \, generate ©libcfa.c©
 \item
 \Indexc{-L}\index{translator option!-L@{©-L©}}, \Indexc{--linemarks}\index{translator option!--linemarks@{©--linemarks©}} \, generate line marks
 …
 \Indexc{-n}\index{translator option!-n@{©-n©}}, \Indexc{--no-prelude}\index{translator option!--no-prelude@{©--no-prelude©}} \, do not read prelude
 \item
+\Indexc{-p}\index{translator option!-p@{©-p©}}, \Indexc{--prototypes}\index{translator option!--prototypes@{©--prototypes©}} \, generate prototypes for prelude functions
+\Indexc{-p}\index{translator option!-p@{©-p©}}, \Indexc{--prototypes}\index{translator option!--prototypes@{©--prototypes©}} \, do not generate prelude prototypes $\Rightarrow$ prelude not printed
+\item
+\Indexc{-d}\index{translator option!-d@{©-d©}}, \Indexc{--deterministic-out}\index{translator option!--deterministic-out@{©--deterministic-out©}} \, only print deterministic output
 \item
 \Indexc{-P}\index{translator option!-P@{©-P©}}, \Indexc{--print}\index{translator option!--print@{©--print©}} \, one of:
 \begin{description}[topsep=0pt,itemsep=0pt,parsep=0pt]
 \item
+\Indexc{ascodegen}\index{translator option!-P@{©-P©}!©ascodegen©}\index{translator option!--print@{©-print©}!©ascodegen©} \, as codegen rather than AST
+\item
+\Indexc{asterr}\index{translator option!-P@{©-P©}!©asterr©}\index{translator option!--print@{©-print©}!©asterr©} \, AST on error
+\item
+\Indexc{declstats}\index{translator option!-P@{©-P©}!©declstats©}\index{translator option!--print@{©-print©}!©declstats©} \, code property statistics
+\item
+\Indexc{parse}\index{translator option!-P@{©-P©}!©parse©}\index{translator option!--print@{©-print©}!©parse©} \, yacc (parsing) debug information
+\item
+\Indexc{pretty}\index{translator option!-P@{©-P©}!©pretty©}\index{translator option!--print@{©-print©}!©pretty©} \, prettyprint for ©ascodegen© flag
+\item
+\Indexc{rproto}\index{translator option!-P@{©-P©}!©rproto©}\index{translator option!--print@{©-print©}!©rproto©} \, resolver-proto instance
+\item
+\Indexc{rsteps}\index{translator option!-P@{©-P©}!©rsteps©}\index{translator option!--print@{©-print©}!©rsteps©} \, resolver steps
+\item
+\Indexc{tree}\index{translator option!-P@{©-P©}!©tree©}\index{translator option!--print@{©-print©}!©tree©} \, parse tree
+\item
+\Indexc{ast}\index{translator option!-P@{©-P©}!©ast©}\index{translator option!--print@{©-print©}!©ast©} \, AST after parsing
+\item
+\Indexc{symevt}\index{translator option!-P@{©-P©}!©symevt©}\index{translator option!--print@{©-print©}!©symevt©} \, symbol table events
+\item
 \Indexc{altexpr}\index{translator option!-P@{©-P©}!©altexpr©}\index{translator option!--print@{©-print©}!©altexpr©} \, alternatives for expressions
 \item
-\Indexc{ascodegen}\index{translator option!-P@{©-P©}!©ascodegen©}\index{translator option!--print@{©-print©}!©ascodegen©} \, as codegen rather than AST
-\item
-\Indexc{ast}\index{translator option!-P@{©-P©}!©ast©}\index{translator option!--print@{©-print©}!©ast©} \, AST after parsing
-\item
 \Indexc{astdecl}\index{translator option!-P@{©-P©}!©astdecl©}\index{translator option!--print@{©-print©}!©astdecl©} \, AST after declaration validation pass
 \item
 \Indexc{asterr}\index{translator option!-P@{©-P©}!©asterr©}\index{translator option!--print@{©-print©}!©asterr©} \, AST on error
+\Indexc{resolver}\index{translator option!-P@{©-P©}!©resolver©}\index{translator option!--print@{©-print©}!©resolver©} \, before resolver step
 \item
 \Indexc{astexpr}\index{translator option!-P@{©-P©}!©astexpr©}\index{translator option!--print@{©-print©}!©astexpr©} \, AST after expression analysis
 \item
+\Indexc{ctordtor}\index{translator option!-P@{©-P©}!©ctordtor©}\index{translator option!--print@{©-print©}!©ctordtor©} \, after ctor/dtor are replaced
+\item
+\Indexc{tuple}\index{translator option!-P@{©-P©}!©tuple©}\index{translator option!--print@{©-print©}!©tuple©} \, after tuple expansion
+\item
 \Indexc{astgen}\index{translator option!-P@{©-P©}!©astgen©}\index{translator option!--print@{©-print©}!©astgen©} \, AST after instantiate generics
 \item
 \Indexc{box}\index{translator option!-P@{©-P©}!©box©}\index{translator option!--print@{©-print©}!©box©} \, before box step
 \item
-\Indexc{ctordtor}\index{translator option!-P@{©-P©}!©ctordtor©}\index{translator option!--print@{©-print©}!©ctordtor©} \, after ctor/dtor are replaced
-\item
 \Indexc{codegen}\index{translator option!-P@{©-P©}!©codegen©}\index{translator option!--print@{©-print©}!©codegen©} \, before code generation
-\item
-\Indexc{declstats}\index{translator option!-P@{©-P©}!©declstats©}\index{translator option!--print@{©-print©}!©declstats©} \, code property statistics
-\item
-\Indexc{parse}\index{translator option!-P@{©-P©}!©parse©}\index{translator option!--print@{©-print©}!©parse©} \, yacc (parsing) debug information
-\item
-\Indexc{pretty}\index{translator option!-P@{©-P©}!©pretty©}\index{translator option!--print@{©-print©}!©pretty©} \, prettyprint for ascodegen flag
-\item
-\Indexc{resolver}\index{translator option!-P@{©-P©}!©resolver©}\index{translator option!--print@{©-print©}!©resolver©} \, before resolver step
-\item
-\Indexc{rproto}\index{translator option!-P@{©-P©}!©rproto©}\index{translator option!--print@{©-print©}!©rproto©} \, resolver-proto instance
-\item
-\Indexc{rsteps}\index{translator option!-P@{©-P©}!©rsteps©}\index{translator option!--print@{©-print©}!©rsteps©} \, resolver steps
-\item
-\Indexc{symevt}\index{translator option!-P@{©-P©}!©symevt©}\index{translator option!--print@{©-print©}!©symevt©} \, symbol table events
-\item
-\Indexc{tree}\index{translator option!-P@{©-P©}!©tree©}\index{translator option!--print@{©-print©}!©tree©} \, parse tree
-\item
-\Indexc{tuple}\index{translator option!-P@{©-P©}!©tuple©}\index{translator option!--print@{©-print©}!©tuple©} \, after tuple expansion
 \end{description}
 \item
 \Indexc{--prelude-dir} <directory> \, prelude directory for debug/nodebug
 \item
+\Indexc{-S}\index{translator option!-S@{©-S©}!©counters,heap,time,all,none©}, \Indexc{--statistics}\index{translator option!--statistics@{©--statistics©}!©counters,heap,time,all,none©} <option-list> \, enable profiling information:
+\begin{description}[topsep=0pt,itemsep=0pt,parsep=0pt]
+\item
+\Indexc{counters,heap,time,all,none}
+\end{description}
+\Indexc{-S}\index{translator option!-S@{©-S©}!©counters,heap,time,all,none©}, \Indexc{--statistics}\index{translator option!--statistics@{©--statistics©}!©counters,heap,time,all,none©} <option-list> \, enable profiling information: ©counters©, ©heap©, ©time©, ©all©, ©none©
 \item
 \Indexc{-t}\index{translator option!-t@{©-t©}}, \Indexc{--tree}\index{translator option!--tree@{©--tree©}} \, build in tree
 …
 \label{s:BackquoteIdentifiers}
 \CFA introduces several new keywords (see \VRef{s:CFAKeywords}) that can clash with existing C variable-names in legacy code.
+\CFA introduces several new keywords \see{\VRef{s:CFAKeywords}} that can clash with existing C variable-names in legacy code.
 Keyword clashes are accommodated by syntactic transformations using the \CFA backquote escape-mechanism:
 \begin{cfa}
 int ®``®otype = 3; §\C{// make keyword an identifier}§
 double ®``®forall = 3.5;
+int @``@otype = 3; $\C{// make keyword an identifier}$
+double @``@forall = 3.5;
 \end{cfa}
 Existing C programs with keyword clashes can be converted by enclosing keyword identifiers in backquotes, and eventually the identifier name can be changed to a non-keyword name.
 \VRef[Figure]{f:HeaderFileInterposition} shows how clashes in existing C header-files (see~\VRef{s:StandardHeaders}) can be handled using preprocessor \newterm{interposition}: ©#include_next© and ©-I filename©.
+\VRef[Figure]{f:HeaderFileInterposition} shows how clashes in existing C header-files \see{\VRef{s:StandardHeaders}} can be handled using preprocessor \newterm{interposition}: ©#include_next© and ©-I filename©.
 Several common C header-files with keyword clashes are fixed in the standard \CFA header-library, so there is a seamless programming-experience.
 …
 \begin{cfa}
 // include file uses the CFA keyword "with".
 #if ! defined( with ) §\C{// nesting ?}§
 #define with ®``®with §\C{// make keyword an identifier}§
+#if ! defined( with )                                                   $\C{// nesting ?}$
+#define with @``@with                                                   $\C{// make keyword an identifier}$
 #define __CFA_BFD_H__
 #endif
+§{\color{red}\#\textbf{include\_next} <bfdlink.h>}§ §\C{// must have internal check for multiple expansion}§
 #if defined( with ) && defined( __CFA_BFD_H__ ) §\C{// reset only if set}§
+$\R{\#include\_next} <bfdlink.h>$                               $\C{// must have internal check for multiple expansion}$
+#if defined( with ) && defined( __CFA_BFD_H__ ) $\C{// reset only if set}$
 #undef with
 #undef __CFA_BFD_H__
 …
 \section{Constant Underscores}
 Numeric constants are extended to allow \Index{underscore}s\index{constant!underscore}, \eg:
 \begin{cfa}
 ®_®147®_®483®_®648; §\C{// decimal constant}§
 ®_®ul; §\C{// decimal unsigned long constant}§
 ®_®377; §\C{// octal constant}§
 x®_®ff®_®ff; §\C{// hexadecimal constant}§
 x®_®ef3d®_®aa5c; §\C{// hexadecimal constant}§
 .141®_®592®_®654; §\C{// floating constant}§
 ®_®e®_®+1®_®00; §\C{// floating constant}§
 x®_®ff®_®ff®_®p®_®3; §\C{// hexadecimal floating}§
 x®_®1.ffff®_®ffff®_®p®_®128®_®l; §\C{// hexadecimal floating long constant}§
 L®_®§"\texttt{\textbackslash{x}}§®_®§\texttt{ff}§®_®§\texttt{ee}"§; §\C{// wide character constant}§
+Numeric constants are extended to allow \Index{underscore}s\index{constant!underscore} as a separator, \eg:
+\begin{cfa}
+@_@147@_@483@_@648; $\C{// decimal constant}$
+@_@ul; $\C{// decimal unsigned long constant}$
+@_@377; $\C{// octal constant}$
+x@_@ff@_@ff; $\C{// hexadecimal constant}$
+x@_@ef3d@_@aa5c; $\C{// hexadecimal constant}$
+.141@_@592@_@654; $\C{// floating constant}$
+@_@e@_@+1@_@00; $\C{// floating constant}$
+x@_@ff@_@ff@_@p@_@3; $\C{// hexadecimal floating}$
+x@_@1.ffff@_@ffff@_@p@_@128@_@l; $\C{// hexadecimal floating long constant}$
+L@_@$"\texttt{\textbackslash{x}}$@_@$\texttt{ff}$@_@$\texttt{ee}"$; $\C{// wide character constant}$
 \end{cfa}
 The rules for placement of underscores are:
 …
 It is significantly easier to read and enter long constants when they are broken up into smaller groupings (many cultures use comma and/or period among digits for the same purpose).
 This extension is backwards compatible, matches with the use of underscore in variable names, and appears in \Index*{Ada} and \Index*{Java} 8.
+\CC uses the single quote (©'©) as a separator, restricted within a sequence of digits, \eg ©0xaa©©'©©ff©, ©3.141©©'©©592E1©©'©©1©.
 \section{Exponentiation Operator}
 C, \CC, and Java (and many other programming languages) have no exponentiation operator\index{exponentiation!operator}\index{operator!exponentiation}, \ie $x^y$, and instead use a routine, like \Indexc{pow(x,y)}, to perform the exponentiation operation.
 \CFA extends the basic operators with the exponentiation operator ©?®\®?©\index{?\\?@©?®\®?©} and ©?\=?©\index{?\\=?@©®\®=?©}, as in, ©x ®\® y© and ©x ®\®= y©, which means $x^y$ and $x \leftarrow x^y$.
 The priority of the exponentiation operator is between the cast and multiplicative operators, so that ©w * (int)x \ (int)y * z© is parenthesized as ©((w * (((int)x) \ ((int)y))) * z)©.
+C, \CC, and Java (and other programming languages) have no exponentiation operator\index{exponentiation!operator}\index{operator!exponentiation}, \ie $x^y$, and instead use a routine, like \Indexc{pow(x,y)}, to perform the exponentiation operation.
+\CFA extends the basic operators with the exponentiation operator ©?©\R{©\\©}©?©\index{?\\?@©?@\@?©} and ©?©\R{©\\©}©=?©\index{?\\=?@©@\@=?©}, as in, ©x ©\R{©\\©}© y© and ©x ©\R{©\\©}©= y©, which means $x^y$ and $x \leftarrow x^y$.
+The priority of the exponentiation operator is between the cast and multiplicative operators, so that ©w * (int)x \ (int)y * z© is parenthesized as ©(w * (((int)x) \ ((int)y))) * z©.
 There are exponentiation operators for integral and floating types, including the builtin \Index{complex} types.
 …
 Floating exponentiation\index{exponentiation!floating} is performed using \Index{logarithm}s\index{exponentiation!logarithm}, so the exponent cannot be negative.
 \begin{cfa}
 sout | 1 ®\® 0 | 1 ®\® 1 | 2 ®\® 8 | -4 ®\® 3 | 5 ®\® 3 | 5 ®\® 32 | 5L ®\® 32 | 5L ®\® 64 | -4 ®\® -3 | -4.0 ®\® -3 | 4.0 ®\® 2.1
            | (1.0f+2.0fi) ®\® (3.0f+2.0fi);
 1 256 -64 125 ®0® 3273344365508751233 ®0® ®0® -0.015625 18.3791736799526 0.264715-1.1922i
+sout | 1 @\@ 0 | 1 @\@ 1 | 2 @\@ 8 | -4 @\@ 3 | 5 @\@ 3 | 5 @\@ 32 | 5L @\@ 32 | 5L @\@ 64 | -4 @\@ -3 | -4.0 @\@ -3 | 4.0 @\@ 2.1
+           | (1.0f+2.0fi) @\@ (3.0f+2.0fi);
+1 256 -64 125 @0@ 3273344365508751233 @0@ @0@ -0.015625 18.3791736799526 0.264715-1.1922i
 \end{cfa}
 Note, ©5 \ 32© and ©5L \ 64© overflow, and ©-4 \ -3© is a fraction but stored in an integer so all three computations generate an integral zero.
+Parenthesis are necessary for complex constants or the expression is parsed as ©1.0f+®(®2.0fi \ 3.0f®)®+2.0fi©.
+Because exponentiation has higher priority than ©+©, parentheses are necessary for exponentiation of \Index{complex constant}s or the expression is parsed as ©1.0f+©\R{©(©}©2.0fi \ 3.0f©\R{©)©}©+2.0fi©, requiring \R{©(©}©1.0f+2.0fi©\R{©)©}© \ ©\R{©(©}©3.0f+2.0fi©\R{©)©}.
 The exponentiation operator is available for all the basic types, but for user-defined types, only the integral-computation version is available.
 \begin{cfa}
 forall( otype OT | { void ?{}( OT & this, one_t ); OT ?*?( OT, OT ); } )
 OT ?®\®?( OT ep, unsigned int y );
 forall( otype OT | { void ?{}( OT & this, one_t ); OT ?*?( OT, OT ); } )
 OT ?®\®?( OT ep, unsigned long int y );
+forall( otype T | { void ?{}( T & this, one_t ); T ?*?( T, T ); } )
+T ?@\@?( T ep, unsigned int y );
+forall( otype T | { void ?{}( T & this, one_t ); T ?*?( T, T ); } )
+T ?@\@?( T ep, unsigned long int y );
 \end{cfa}
 The user type ©T© must define construction from one (©1©) and multiplication (©*©).
 …
 %\subsection{\texorpdfstring{\protect\lstinline@if@/\protect\lstinline@while@ Statement}{if Statement}}
+\subsection{\texorpdfstring{\LstKeywordStyle{if}/\LstKeywordStyle{while} Statement}{if/while Statement}}
+The ©if©/©while© expression allows declarations, similar to ©for© declaration expression.
+(Does not make sense for ©do©-©while©.)
+\begin{cfa}
+if ( ®int x = f()® ) ... §\C{// x != 0}§
+if ( ®int x = f(), y = g()® ) ... §\C{// x != 0 \&\& y != 0}§
+if ( ®int x = f(), y = g(); x < y® ) ... §\C{// relational expression}§
+if ( ®struct S { int i; } x = { f() }; x.i < 4® ) §\C{// relational expression}§
+while ( ®int x = f()® ) ... §\C{// x != 0}§
+while ( ®int x = f(), y = g()® ) ... §\C{// x != 0 \&\& y != 0}§
+while ( ®int x = f(), y = g(); x < y® ) ... §\C{// relational expression}§
+while ( ®struct S { int i; } x = { f() }; x.i < 4® ) ... §\C{// relational expression}§
+\end{cfa}
+Unless a relational expression is specified, each variable is compared not equal to 0, which is the standard semantics for the ©if©/©while© expression, and the results are combined using the logical ©&&© operator.\footnote{\CC only provides a single declaration always compared not equal to 0.}
+The scope of the declaration(s) is local to the @if@ statement but exist within both the ``then'' and ``else'' clauses.
+\subsection{\texorpdfstring{\LstKeywordStyle{if} / \LstKeywordStyle{while} Statement}{if / while Statement}}
+The ©if©/©while© expression allows declarations, similar to the ©for© declaration expression.\footnote{
+Declarations in the ©do©-©while© condition are not useful because they appear after the loop body.}
+\begin{cfa}
+if ( @int x = f()@ ) ... $\C{// x != 0}$
+if ( @int x = f(), y = g()@ ) ... $\C{// x != 0 \&\& y != 0}$
+if ( @int x = f(), y = g(); x < y@ ) ... $\C{// relational expression}$
+if ( @struct S { int i; } x = { f() }; x.i < 4@ ) ... $\C{// relational expression}$
+while ( @int x = f()@ ) ... $\C{// x != 0}$
+while ( @int x = f(), y = g()@ ) ... $\C{// x != 0 \&\& y != 0}$
+while ( @int x = f(), y = g(); x < y@ ) ... $\C{// relational expression}$
+while ( @struct S { int i; } x = { f() }; x.i < 4@ ) ... $\C{// relational expression}$
+\end{cfa}
+Unless a relational expression is specified, each variable is compared not equal to 0, which is the standard semantics for the ©if©/©while© expression, and the results are combined using the logical ©&&© operator.
+The scope of the declaration(s) is local to the ©if© statement but exists within both the \emph{then} and \emph{else} clauses.
+\CC only provides a single declaration always compared ©!=© to 0.
 %\section{\texorpdfstring{\protect\lstinline@case@ Clause}{case Clause}}
 \subsection{\texorpdfstring{\LstKeywordStyle{case} Clause}{case Clause}}
+\label{s:caseClause}
 C restricts the ©case© clause of a ©switch© statement to a single value.
 …
 \begin{cfa}
 switch ( i ) {
   case ®1, 3, 5®:
+  case @1, 3, 5@:
         ...
   case ®2, 4, 6®:
+  case @2, 4, 6@:
         ...
+}
 …
 \begin{cfa}
 switch ( i ) {
   case ®1~5:® §\C{// 1, 2, 3, 4, 5}§
+  case @1~5:@ $\C{// 1, 2, 3, 4, 5}$
         ...
   case ®10~15:® §\C{// 10, 11, 12, 13, 14, 15}§
+  case @10~15:@ $\C{// 10, 11, 12, 13, 14, 15}$
         ...
+}
 …
 Lists of subranges are also allowed.
 \begin{cfa}
 case ®1~5, 12~21, 35~42®:
+case @1~5, 12~21, 35~42@:
 \end{cfa}
 …
 if ( argc == 3 ) {
         // open output file
         ®// open input file
 ®} else if ( argc == 2 ) {
         ®// open input file (duplicate)
 ®} else {
+        @// open input file
+@} else if ( argc == 2 ) {
+        @// open input file (duplicate)
+@} else {
         // usage message
+}
 …
 \end{cquote}
 In this example, case 2 is always done if case 3 is done.
 This control flow is difficult to simulate with if statements or a ©switch© statement without fall-through as code must be duplicated or placed in a separate routine.
+This control flow is difficult to simulate with ©if© statements or a ©switch© statement without fall-through as code must be duplicated or placed in a separate routine.
 C also uses fall-through to handle multiple case-values resulting in the same action:
 \begin{cfa}
 switch ( i ) {
   ®case 1: case 3: case 5:®     // odd values
+  @case 1: case 3: case 5:@     // odd values
         // odd action
         break;
   ®case 2: case 4: case 6:®     // even values
+  @case 2: case 4: case 6:@     // even values
         // even action
         break;
+}
 \end{cfa}
 However, this situation is handled in other languages without fall-through by allowing a list of case values.
 While fall-through itself is not a problem, the problem occurs when fall-through is the default, as this semantics is unintuitive to many programmers and is different from virtually all other programming languages with a ©switch© statement.
+This situation is better handled without fall-through by allowing a list of case values \see{\VRef{s:caseClause}}.
+While fall-through itself is not a problem, the problem occurs when fall-through is the default, as this semantics is unintuitive to many programmers and is different from most programming languages with a ©switch© statement.
 Hence, default fall-through semantics results in a large number of programming errors as programmers often \emph{forget} the ©break© statement at the end of a ©case© clause, resulting in inadvertent fall-through.
 …
         if ( j < k ) {
                 ...
           ®case 1:®             // transfer into "if" statement
+          @case 1:@             // transfer into "if" statement
                 ...
         } // if
 …
         while ( j < 5 ) {
                 ...
           ®case 3:®             // transfer into "while" statement
+          @case 3:@             // transfer into "while" statement
                 ...
         } // while
 } // switch
 \end{cfa}
 The problem with this usage is branching into control structures, which is known to cause both comprehension and technical difficulties.
 The comprehension problem occurs from the inability to determine how control reaches a particular point due to the number of branches leading to it.
+This usage branches into control structures, which is known to cause both comprehension and technical difficulties.
+The comprehension problem results from the inability to determine how control reaches a particular point due to the number of branches leading to it.
 The technical problem results from the inability to ensure declaration and initialization of variables when blocks are not entered at the beginning.
 There are no positive arguments for this kind of control flow, and therefore, there is a strong impetus to eliminate it.
+There are few arguments for this kind of control flow, and therefore, there is a strong impetus to eliminate it.
 Nevertheless, C does have an idiom where this capability is used, known as ``\Index*{Duff's device}''~\cite{Duff83}:
 \begin{cfa}
 …
 \item
 It is possible to place the ©default© clause anywhere in the list of labelled clauses for a ©switch© statement, rather than only at the end.
 Virtually all programming languages with a ©switch© statement require the ©default© clause to appear last in the case-clause list.
+Most programming languages with a ©switch© statement require the ©default© clause to appear last in the case-clause list.
 The logic for this semantics is that after checking all the ©case© clauses without success, the ©default© clause is selected;
 hence, physically placing the ©default© clause at the end of the ©case© clause list matches with this semantics.
 …
 \begin{cfa}
 switch ( x ) {
         ®int y = 1;® §\C{// unreachable initialization}§
         ®x = 7;® §\C{// unreachable code without label/branch}§
+        @int y = 1;@ $\C{// unreachable initialization}$
+        @x = 7;@ $\C{// unreachable code without label/branch}$
   case 0: ...
         ...
         ®int z = 0;® §\C{// unreachable initialization, cannot appear after case}§
+        @int z = 0;@ $\C{// unreachable initialization, cannot appear after case}$
         z = 2;
   case 1:
         ®x = z;® §\C{// without fall through, z is uninitialized}§
+        @x = z;@ $\C{// without fall through, z is uninitialized}$
+}
 \end{cfa}
 While the declaration of the local variable ©y© is useful with a scope across all ©case© clauses, the initialization for such a variable is defined to never be executed because control always transfers over it.
 Furthermore, any statements before the first ©case© clause can only be executed if labelled and transferred to using a ©goto©, either from outside or inside of the ©switch©, both of which are problematic.
 As well, the declaration of ©z© cannot occur after the ©case© because a label can only be attached to a statement, and without a fall through to case 3, ©z© is uninitialized.
 The key observation is that the ©switch© statement branches into control structure, \ie there are multiple entry points into its statement body.
+Furthermore, any statements before the first ©case© clause can only be executed if labelled and transferred to using a ©goto©, either from outside or inside of the ©switch©, where both are problematic.
+As well, the declaration of ©z© cannot occur after the ©case© because a label can only be attached to a statement, and without a fall-through to case 1, ©z© is uninitialized.
+The key observation is that the ©switch© statement branches into a control structure, \ie there are multiple entry points into its statement body.
 \end{enumerate}
 …
 Therefore, to preserve backwards compatibility, it is necessary to introduce a new kind of ©switch© statement, called ©choose©, with no implicit fall-through semantics and an explicit fall-through if the last statement of a case-clause ends with the new keyword ©fallthrough©/©fallthru©, \eg:
 \begin{cfa}
 ®choose® ( i ) {
+@choose@ ( i ) {
   case 1:  case 2:  case 3:
         ...
         ®// implicit end of switch (break)
   ®case 5:
+        @// implicit end of switch (break)
+  @case 5:
         ...
         ®fallthru®; §\C{// explicit fall through}§
+        @fallthru@; $\C{// explicit fall through}$
   case 7:
         ...
         ®break® §\C{// explicit end of switch (redundant)}§
+        @break@; $\C{// explicit end of switch (redundant)}$
   default:
         j = 3;
+}
 \end{cfa}
 Like the ©switch© statement, the ©choose© statement retains the fall-through semantics for a list of ©case© clauses;
+Like the ©switch© statement, the ©choose© statement retains the fall-through semantics for a list of ©case© clauses.
 An implicit ©break© is applied only at the end of the \emph{statements} following a ©case© clause.
 An explicit ©fallthru© is retained because it is a C-idiom most C programmers expect, and its absence might discourage programmers from using the ©choose© statement.
 …
 \begin{cfa}
 switch ( x ) {
         ®int i = 0;® §\C{// allowed only at start}§
+        @int i = 0;@ $\C{// allowed only at start}$
   case 0:
         ...
         ®int j = 0;® §\C{// disallowed}§
+        @int j = 0;@ $\C{// disallowed}$
   case 1:
+        {
                 ®int k = 0;® §\C{// allowed at different nesting levels}§
+                @int k = 0;@ $\C{// allowed at different nesting levels}$
                 ...
           ®case 2:® §\C{// disallow case in nested statements}§
+          @case 2:@ $\C{// disallow case in nested statements}$
+        }
   ...
 …
   case 3:
         if ( ... ) {
                 ... ®fallthru;® // goto case 4
+                ... @fallthru;@ // goto case 4
         } else {
                 ...
 …
 choose ( ... ) {
   case 3:
         ... ®fallthrough common;®
+        ... @fallthrough common;@
   case 4:
         ... ®fallthrough common;®
   ®common:® // below fallthrough
+        ... @fallthrough common;@
+  @common:@ // below fallthrough
                           // at case-clause level
         ...     // common code for cases 3/4
 …
                 for ( ... ) {
                         // multi-level transfer
                         ... ®fallthru common;®
+                        ... @fallthru common;@
+                }
                 ...
+        }
         ...
   ®common:® // below fallthrough
+  @common:@ // below fallthrough
                           // at case-clause level
 \end{cfa}
 …
 \begin{figure}
 \begin{tabular}{@{}l|l@{}}
 \multicolumn{1}{c|}{loop control} & \multicolumn{1}{c}{output} \\
+\begin{tabular}{@{}l@{\hspace{25pt}}|l@{}}
+\multicolumn{1}{@{}c@{\hspace{25pt}}|}{loop control} & \multicolumn{1}{c@{}}{output} \\
 \hline
 \begin{cfa}[xleftmargin=0pt]
 while ®()® { sout | "empty"; break; }
 do { sout | "empty"; break; } while ®()®;
 for ®()® { sout | "empty"; break; }
 for ( ®0® ) { sout | "A"; } sout | "zero";
 for ( ®1® ) { sout | "A"; }
 for ( ®10® ) { sout | "A"; }
 for ( ®= 10® ) { sout | "A"; }
 for ( ®1 ~= 10 ~ 2® ) { sout | "B"; }
 for ( ®10 -~= 1 ~ 2® ) { sout | "C"; }
 for ( ®0.5 ~ 5.5® ) { sout | "D"; }
 for ( ®5.5 -~ 0.5® ) { sout | "E"; }
 for ( ®i; 10® ) { sout | i; }
 for ( ®i; = 10® ) { sout | i; }
 for ( ®i; 1 ~= 10 ~ 2® ) { sout | i; }
 for ( ®i; 10 -~= 1 ~ 2® ) { sout | i; }
 for ( ®i; 0.5 ~ 5.5® ) { sout | i; }
 for ( ®i; 5.5 -~ 0.5® ) { sout | i; }
 for ( ®ui; 2u ~= 10u ~ 2u® ) { sout | ui; }
 for ( ®ui; 10u -~= 2u ~ 2u® ) { sout | ui; }
+\begin{cfa}
+while @($\,$)@ { sout | "empty"; break; }
+do { sout | "empty"; break; } while @($\,$)@;
+for @($\,$)@ { sout | "empty"; break; }
+for ( @0@ ) { sout | "A"; } sout | "zero";
+for ( @1@ ) { sout | "A"; }
+for ( @10@ ) { sout | "A"; }
+for ( @= 10@ ) { sout | "A"; }
+for ( @1 ~= 10 ~ 2@ ) { sout | "B"; }
+for ( @10 -~= 1 ~ 2@ ) { sout | "C"; }
+for ( @0.5 ~ 5.5@ ) { sout | "D"; }
+for ( @5.5 -~ 0.5@ ) { sout | "E"; }
+for ( @i; 10@ ) { sout | i; }
+for ( @i; = 10@ ) { sout | i; }
+for ( @i; 1 ~= 10 ~ 2@ ) { sout | i; }
+for ( @i; 10 -~= 1 ~ 2@ ) { sout | i; }
+for ( @i; 0.5 ~ 5.5@ ) { sout | i; }
+for ( @i; 5.5 -~ 0.5@ ) { sout | i; }
+for ( @ui; 2u ~= 10u ~ 2u@ ) { sout | ui; }
+for ( @ui; 10u -~= 2u ~ 2u@ ) { sout | ui; }
 enum { N = 10 };
 for ( ®N® ) { sout | "N"; }
 for ( ®i; N® ) { sout | i; }
 for ( ®i; N -~ 0® ) { sout | i; }
+for ( @N@ ) { sout | "N"; }
+for ( @i; N@ ) { sout | i; }
+for ( @i; N -~ 0@ ) { sout | i; }
 const int start = 3, comp = 10, inc = 2;
 for ( ®i; start ~ comp ~ inc + 1® ) { sout | i; }
 for ( i; 1 ~ ®@® ) { if ( i > 10 ) break; sout | i; }
 for ( i; 10 -~ ®@® ) { if ( i < 0 ) break; sout | i; }
 for ( i; 2 ~ ®@® ~ 2 ) { if ( i > 10 ) break; sout | i; }
 for ( i; 2.1 ~ ®@® ~ ®@® ) { if ( i > 10.5 ) break; sout | i; i += 1.7; }
 for ( i; 10 -~ ®@® ~ 2 ) { if ( i < 0 ) break; sout | i; }
 for ( i; 12.1 ~ ®@® ~ ®@® ) { if ( i < 2.5 ) break; sout | i; i -= 1.7; }
 for ( i; 5 ®:® j; -5 ~ @ ) { sout | i | j; }
 for ( i; 5 ®:® j; -5 -~ @ ) { sout | i | j; }
 for ( i; 5 ®:® j; -5 ~ @ ~ 2 ) { sout | i | j; }
 for ( i; 5 ®:® j; -5 -~ @ ~ 2 ) { sout | i | j; }
 for ( i; 5 ®:® j; -5 ~ @ ) { sout | i | j; }
 for ( i; 5 ®:® j; -5 -~ @ ) { sout | i | j; }
 for ( i; 5 ®:® j; -5 ~ @ ~ 2 ) { sout | i | j; }
 for ( i; 5 ®:® j; -5 -~ @ ~ 2 ) { sout | i | j; }
 for ( i; 5 ®:® j; -5 -~ @ ~ 2 ®:® k; 1.5 ~ @ ) { sout | i | j | k; }
 for ( i; 5 ®:® j; -5 -~ @ ~ 2 ®:® k; 1.5 ~ @ ) { sout | i | j | k; }
 for ( i; 5 ®:® k; 1.5 ~ @ ®:® j; -5 -~ @ ~ 2 ) { sout | i | j | k; }
+for ( @i; start ~ comp ~ inc + 1@ ) { sout | i; }
+for ( i; 1 ~ $\R{@}$ ) { if ( i > 10 ) break; sout | i; }
+for ( i; 10 -~ $\R{@}$ ) { if ( i < 0 ) break; sout | i; }
+for ( i; 2 ~ $\R{@}$ ~ 2 ) { if ( i > 10 ) break; sout | i; }
+for ( i; 2.1 ~ $\R{@}$ ~ $\R{@}$ ) { if ( i > 10.5 ) break; sout | i; i += 1.7; }
+for ( i; 10 -~ $\R{@}$ ~ 2 ) { if ( i < 0 ) break; sout | i; }
+for ( i; 12.1 ~ $\R{@}$ ~ $\R{@}$ ) { if ( i < 2.5 ) break; sout | i; i -= 1.7; }
+for ( i; 5 @:@ j; -5 ~ $@$ ) { sout | i | j; }
+for ( i; 5 @:@ j; -5 -~ $@$ ) { sout | i | j; }
+for ( i; 5 @:@ j; -5 ~ $@$ ~ 2 ) { sout | i | j; }
+for ( i; 5 @:@ j; -5 -~ $@$ ~ 2 ) { sout | i | j; }
+for ( i; 5 @:@ j; -5 ~ $@$ ) { sout | i | j; }
+for ( i; 5 @:@ j; -5 -~ $@$ ) { sout | i | j; }
+for ( i; 5 @:@ j; -5 ~ $@$ ~ 2 ) { sout | i | j; }
+for ( i; 5 @:@ j; -5 -~ $@$ ~ 2 ) { sout | i | j; }
+for ( i; 5 @:@ j; -5 -~ $@$ ~ 2 @:@ k; 1.5 ~ $@$ ) { sout | i | j | k; }
+for ( i; 5 @:@ j; -5 -~ $@$ ~ 2 @:@ k; 1.5 ~ $@$ ) { sout | i | j | k; }
+for ( i; 5 @:@ k; 1.5 ~ $@$ @:@ j; -5 -~ $@$ ~ 2 ) { sout | i | j | k; }
 \end{cfa}
+&
 …
 \subsection{Loop Control}
+The ©for©/©while©/©do-while© loop-control allows empty or simplified ranges (see Figure~\ref{f:LoopControlExamples}).
+\begin{itemize}
+Looping a fixed number of times, possibly with a loop index, occurs frequently.
+\CFA condenses simple looping to facilitate coding speed and safety.
+The ©for©/©while©/©do-while© loop-control is augmented as follows \see{examples in \VRef[Figure]{f:LoopControlExamples}}:
+\begin{itemize}[itemsep=0pt]
+\item
+©0© is the implicit start value;
+\item
+©1© is the implicit increment value.
+\item
+The up-to range uses operator ©+=© for increment;
+\item
+The down-to range uses operator ©-=© for decrement.
 \item
 The loop index is polymorphic in the type of the comparison value N (when the start value is implicit) or the start value M.
+\begin{cfa}
+for ( i; @5@ )                                  $\C[2.5in]{// typeof(5) i; 5 is comparison value}$
+for ( i; @1.5@~5.5~0.5 )                $\C{// typeof(1.5) i; 1.5 is start value}$
+\end{cfa}
 \item
 An empty conditional implies a comparison value of ©1© (true).
+\item
+A comparison N is implicit up-to exclusive range [0,N©®)®©.
+\item
+A comparison ©=© N is implicit up-to inclusive range [0,N©®]®©.
+\item
+The up-to range M ©~©\index{~@©~©} N means exclusive range [M,N©®)®©.
+\item
+The up-to range M ©~=©\index{~=@©~=©} N means inclusive range [M,N©®]®©.
+\item
+The down-to range M ©-~©\index{-~@©-~©} N means exclusive range [N,M©®)®©.
+\item
+The down-to range M ©-~=©\index{-~=@©-~=©} N means inclusive range [N,M©®]®©.
+\item
+©0© is the implicit start value;
+\item
+©1© is the implicit increment value.
+\item
+The up-to range uses operator ©+=© for increment;
+\item
+The down-to range uses operator ©-=© for decrement.
+\begin{cfa}
+while ( $\R{/*empty*/}$ )               $\C{// while ( true )}$
+for ( $\R{/*empty*/}$ )                 $\C{// for ( ; true; )}$
+do ... while ( $\R{/*empty*/}$ ) $\C{// do ... while ( true )}$
+\end{cfa}
+\item
+A comparison N is implicit up-to exclusive range [0,N\R{)}.
+\begin{cfa}
+for ( @5@ )                                             $\C{// for ( typeof(5) i; i < 5; i += 1 )}$
+\end{cfa}
+\item
+A comparison ©=© N is implicit up-to inclusive range [0,N\R{]}.
+\begin{cfa}
+for ( @=@5 )                                    $\C{// for ( typeof(5) i; i <= 5; i += 1 )}$
+\end{cfa}
+\item
+The up-to range M ©~©\index{~@©~©} N means exclusive range [M,N\R{)}.
+\begin{cfa}
+for ( 1@~@5 )                                   $\C{// for ( typeof(1) i = 1; i < 5; i += 1 )}$
+\end{cfa}
+\item
+The up-to range M ©~=©\index{~=@©~=©} N means inclusive range [M,N\R{]}.
+\begin{cfa}
+for ( 1@~=@5 )                                  $\C{// for ( typeof(1) i = 1; i <= 5; i += 1 )}$
+\end{cfa}
+\item
+The down-to range M ©-~©\index{-~@©-~©} N means exclusive range [N,M\R{)}.
+\begin{cfa}
+for ( 1@-~@5 )                                  $\C{// for ( typeof(1) i = 5; i > 1; i -= 1 )}$
+\end{cfa}
+\item
+The down-to range M ©-~=©\index{-~=@©-~=©} N means inclusive range [N,M\R{]}.
+\begin{cfa}
+for ( 1@-~=@5 )                                 $\C{// for ( typeof(1) i = 5; i >= 1; i -= 1 )}$
+\end{cfa}
 \item
 ©@© means put nothing in this field.
+\begin{cfa}
+for ( 1~$\R{@}$~2 )                             $\C{// for ( typeof(1) i = 1; /*empty*/; i += 2 )}$
+\end{cfa}
 \item
 ©:© means start another index.
+\begin{cfa}
+for ( i; 5 @:@ j; 2~12~3 )              $\C{// for ( typeof(5) i = 0, j = 2; i < 5 \&\& j < 12; i += 1, j += 3 )}\CRT$
+\end{cfa}
 \end{itemize}
 …
 \subsection{\texorpdfstring{Labelled \LstKeywordStyle{continue} / \LstKeywordStyle{break} Statement}{Labelled continue / break Statement}}
 While C provides ©continue© and ©break© statements for altering control flow, both are restricted to one level of nesting for a particular control structure.
 Unfortunately, this restriction forces programmers to use \Indexc{goto} to achieve the equivalent control-flow for more than one level of nesting.
+C ©continue© and ©break© statements, for altering control flow, are restricted to one level of nesting for a particular control structure.
+This restriction forces programmers to use \Indexc{goto} to achieve the equivalent control-flow for more than one level of nesting.
 To prevent having to switch to the ©goto©, \CFA extends the \Indexc{continue}\index{continue@©continue©!labelled}\index{labelled!continue@©continue©} and \Indexc{break}\index{break@©break©!labelled}\index{labelled!break@©break©} with a target label to support static multi-level exit\index{multi-level exit}\index{static multi-level exit}~\cite{Buhr85}, as in Java.
 For both ©continue© and ©break©, the target label must be directly associated with a ©for©, ©while© or ©do© statement;
 for ©break©, the target label can also be associated with a ©switch©, ©if© or compound (©{}©) statement.
 \VRef[Figure]{f:MultiLevelExit} shows ©continue© and ©break© indicating the specific control structure, and the corresponding C program using only ©goto© and labels.
+\VRef[Figure]{f:MultiLevelExit} shows a comparison between labelled ©continue© and ©break© and the corresponding C equivalent using ©goto© and labels.
 The innermost loop has 8 exit points, which cause continuation or termination of one or more of the 7 \Index{nested control-structure}s.
 …
 \begin{lrbox}{\myboxA}
 \begin{cfa}[tabsize=3]
 ®Compound:® {
         ®Try:® try {
                 ®For:® for ( ... ) {
                         ®While:® while ( ... ) {
                                 ®Do:® do {
                                         ®If:® if ( ... ) {
                                                 ®Switch:® switch ( ... ) {
+@Compound:@ {
+        @Try:@ try {
+                @For:@ for ( ... ) {
+                        @While:@ while ( ... ) {
+                                @Do:@ do {
+                                        @If:@ if ( ... ) {
+                                                @Switch:@ switch ( ... ) {
                                                         case 3:
                                                                 ®break Compound®;
                                                                 ®break Try®;
                                                                 ®break For®;      /* or */  ®continue For®;
                                                                 ®break While®;  /* or */  ®continue While®;
                                                                 ®break Do®;      /* or */  ®continue Do®;
                                                                 ®break If®;
                                                                 ®break Switch®;
+                                                                @break Compound@;
+                                                                @break Try@;
+                                                                @break For@;      /* or */  @continue For@;
+                                                                @break While@;  /* or */  @continue While@;
+                                                                @break Do@;      /* or */  @continue Do@;
+                                                                @break If@;
+                                                                @break Switch@;
                                                         } // switch
                                                 } else {
                                                         ... ®break If®; ...     // terminate if
+                                                        ... @break If@; ...     // terminate if
                                                 } // if
                                 } while ( ... ); // do
                         } // while
                 } // for
         } ®finally® { // always executed
+        } @finally@ { // always executed
         } // try
 } // compound
 …
+{
                 ®ForC:® for ( ... ) {
                         ®WhileC:® while ( ... ) {
                                 ®DoC:® do {
+                @ForC:@ for ( ... ) {
+                        @WhileC:@ while ( ... ) {
+                                @DoC:@ do {
                                         if ( ... ) {
                                                 switch ( ... ) {
                                                         case 3:
                                                                 ®goto Compound®;
                                                                 ®goto Try®;
                                                                 ®goto ForB®;      /* or */  ®goto ForC®;
                                                                 ®goto WhileB®;  /* or */  ®goto WhileC®;
                                                                 ®goto DoB®;      /* or */  ®goto DoC®;
                                                                 ®goto If®;
                                                                 ®goto Switch®;
                                                         } ®Switch:® ;
+                                                                @goto Compound@;
+                                                                @goto Try@;
+                                                                @goto ForB@;      /* or */  @goto ForC@;
+                                                                @goto WhileB@;  /* or */  @goto WhileC@;
+                                                                @goto DoB@;      /* or */  @goto DoC@;
+                                                                @goto If@;
+                                                                @goto Switch@;
+                                                        } @Switch:@ ;
                                                 } else {
                                                         ... ®goto If®; ...      // terminate if
                                                 } ®If:®;
                                 } while ( ... ); ®DoB:® ;
                         } ®WhileB:® ;
                 } ®ForB:® ;
 } ®Compound:® ;
+                                                        ... @goto If@; ...      // terminate if
+                                                } @If:@;
+                                } while ( ... ); @DoB:@ ;
+                        } @WhileB:@ ;
+                } @ForB:@ ;
+} @Compound:@ ;
 \end{cfa}
 \end{lrbox}
 \subfloat[\CFA]{\label{f:CFibonacci}\usebox\myboxA}
 \hspace{2pt}
+\hspace{3pt}
 \vrule
 \hspace{2pt}
+\hspace{3pt}
 \subfloat[C]{\label{f:CFAFibonacciGen}\usebox\myboxB}
 \caption{Multi-level Exit}
 …
 This restriction prevents missing declarations and/or initializations at the start of a control structure resulting in undefined behaviour.
 \end{itemize}
 The advantage of the labelled ©continue©/©break© is allowing static multi-level exits without having to use the ©goto© statement, and tying control flow to the target control structure rather than an arbitrary point in a program.
+The advantage of the labelled ©continue©/©break© is allowing static multi-level exits without having to use the ©goto© statement, and tying control flow to the target control structure rather than an arbitrary point in a program via a label.
 Furthermore, the location of the label at the \emph{beginning} of the target control structure informs the reader (\Index{eye candy}) that complex control-flow is occurring in the body of the control structure.
 With ©goto©, the label is at the end of the control structure, which fails to convey this important clue early enough to the reader.
 …
 %\section{\texorpdfstring{\protect\lstinline@with@ Statement}{with Statement}}
 \section{\texorpdfstring{\LstKeywordStyle{with} Statement}{with Statement}}
+%\subsection{\texorpdfstring{\protect\lstinline@with@ Statement}{with Statement}}
+\subsection{\texorpdfstring{\LstKeywordStyle{with} Statement}{with Statement}}
 \label{s:WithStatement}
 Grouping heterogeneous data into \newterm{aggregate}s (structure/union) is a common programming practice, and an aggregate can be further organized into more complex structures, such as arrays and containers:
 \begin{cfa}
 struct S { §\C{// aggregate}§
         char c; §\C{// fields}§
         int i;
         double d;
+Grouping heterogeneous data into an \newterm{aggregate} (structure/union) is a common programming practice, and aggregates may be nested:
+\begin{cfa}
+struct Person {                                                         $\C{// aggregate}$
+        struct Name { char first[20], last[20]; } name; $\C{// nesting}$
+        struct Address { ... } address;                 $\C{// nesting}$
+        int sex;
 };
+S s, as[10];
+\end{cfa}
+However, functions manipulating aggregates must repeat the aggregate name to access its containing fields:
+\begin{cfa}
+void f( S s ) {
+        ®s.®c; ®s.®i; ®s.®d; §\C{// access containing fields}§
+}
+\end{cfa}
+which extends to multiple levels of qualification for nested aggregates.
+A similar situation occurs in object-oriented programming, \eg \CC:
+\end{cfa}
+Functions manipulating aggregates must repeat the aggregate name to access its containing fields.
+\begin{cfa}
+Person p;
+@p.@name; @p.@address; @p.@sex; $\C{// access containing fields}$
+\end{cfa}
+which extends to multiple levels of qualification for nested aggregates and multiple aggregates.
+\begin{cfa}
+struct Ticket { ... } t;
+@p.name@.first; @p.address@.street;             $\C{// access nested fields}$
+@t.@departure; @t.@cost;                                $\C{// access multiple aggregates}$
+\end{cfa}
+Repeated aggregate qualification is tedious and makes code difficult to read.
+Therefore, reducing aggregate qualification is a useful language design goal.
+C allows unnamed nested aggregates that open their scope into the containing aggregate.
+This feature is used to group fields for attributes and/or with ©union© aggregates.
+\begin{cfa}
+struct S {
+        struct { int g,  h; } __attribute__(( aligned(64) ));
+        int tag;
+        union {
+                struct { char c1,  c2; } __attribute__(( aligned(128) ));
+                struct { int i1,  i2; };
+                struct { double d1,  d2; };
+        };
+};
+s.g; s.h; s.tag; s.c1; s.c2; s.i1; s.i2; s.d1; s.d2;
+\end{cfa}
+Object-oriented languages reduce qualification for class variables within member functions, \eg \CC:
 \begin{C++}
 struct S {
+        char c; §\C{// fields}§
+        int i;
+        double d;
+        void f() { §\C{// implicit ``this'' aggregate}§
+                ®this->®c; ®this->®i; ®this->®d; §\C{// access containing fields}§
+        char @c@;   int @i@;   double @d@;
+        void f( /* S * this */ ) {                              $\C{// implicit ``this'' parameter}$
+                @c@;   @i@;   @d@;                                      $\C{// this->c; this->i; this->d;}$
+        }
+}
 \end{C++}
+Object-oriented nesting of member functions in a \lstinline[language=C++]@class/struct@ allows eliding \lstinline[language=C++]@this->@ because of lexical scoping.
+However, for other aggregate parameters, qualification is necessary:
+\begin{cfa}
+struct T { double m, n; };
+int S::f( T & t ) { §\C{// multiple aggregate parameters}§
+        c; i; d; §\C{\color{red}// this--{\textgreater}.c, this--{\textgreater}.i, this--{\textgreater}.d}§
+        ®t.®m; ®t.®n; §\C{// must qualify}§
+}
+\end{cfa}
+To simplify the programmer experience, \CFA provides a ©with© statement (see Pascal~\cite[\S~4.F]{Pascal}) to elide aggregate qualification to fields by opening a scope containing the field identifiers.
+Hence, the qualified fields become variables with the side-effect that it is easier to optimize field references in a block.
+\begin{cfa}
+void f( S & this ) ®with ( this )® { §\C{// with statement}§
+        c; i; d; §\C{\color{red}// this.c, this.i, this.d}§
+In general, qualification is elided for the variables and functions in the lexical scopes visible from a member function.
+However, qualification is necessary for name shadowing and explicit aggregate parameters.
+\begin{cfa}
+struct T {
+        char @m@;   int @i@;   double @n@;              $\C{// derived class variables}$
+};
+struct S : public T {
+        char @c@;   int @i@;   double @d@;              $\C{// class variables}$
+        void g( double @d@, T & t ) {
+                d;   @t@.m;   @t@.i;   @t@.n;           $\C{// function parameter}$
+                c;   i;   @this->@d;   @S::@d;          $\C{// class S variables}$
+                m;   @T::@i;   n;                                       $\C{// class T variables}$
+        }
+};
+\end{cfa}
+Note the three different forms of qualification syntax in \CC, ©.©, ©->©, ©::©, which is confusing.
+Since \CFA is not object-oriented, it has no implicit parameter with its implicit qualification.
+Instead \CFA introduces a general mechanism using the ©with© statement \see{Pascal~\cite[\S~4.F]{Pascal}} to explicitly elide aggregate qualification by opening a scope containing the field identifiers.
+Hence, the qualified fields become variables with the side-effect that field references in a block are simpler to write, easier to read, and easier to optimize.
+\begin{cfa}
+void f( S & this ) @with ( this )@ {            $\C{// with statement}$
+        @c@;   @i@;   @d@;                                              $\C{// this.c, this.i, this.d}$
+}
 \end{cfa}
 with the generality of opening multiple aggregate-parameters:
 \begin{cfa}
+void f( S & s, T & t ) ®with ( s, t )® { §\C{// multiple aggregate parameters}§
+        c; i; d; §\C{\color{red}// s.c, s.i, s.d}§
+        m; n; §\C{\color{red}// t.m, t.n}§
+}
+\end{cfa}
+In detail, the ©with© statement has the form:
+\begin{cfa}
+§\emph{with-statement}§:
+        'with' '(' §\emph{expression-list}§ ')' §\emph{compound-statement}§
+\end{cfa}
+and may appear as the body of a function or nested within a function body.
+Each expression in the expression-list provides a type and object.
+The type must be an aggregate type.
+void g( S & s, T & t ) @with ( s, t )@ {        $\C{// multiple aggregate parameters}$
+        c;   @s.@i;   d;                                                $\C{// s.c, s.i, s.d}$
+        m;   @t.@i;   n;                                                $\C{// t.m, t.i, t.n}$
+}
+\end{cfa}
+where qualification is only necessary to disambiguate the shadowed variable ©i©.
+In detail, the ©with© statement may appear as the body of a function or nested within a function body.
+The ©with© clause takes a list of expressions, where each expression provides an aggregate type and object.
 (Enumerations are already opened.)
+The object is the implicit qualifier for the open structure-fields.
+To open a pointer type, the pointer must be dereferenced to obtain a reference to the aggregate type.
+\begin{cfa}
+S * sp;
+with ( *sp ) { ... }
+\end{cfa}
+The expression object is the implicit qualifier for the open structure-fields.
+\CFA's ability to overload variables \see{\VRef{s:VariableOverload}} and use the left-side of assignment in type resolution means most fields with the same name but different types are automatically disambiguated, eliminating qualification.
 All expressions in the expression list are open in parallel within the compound statement.
 This semantic is different from Pascal, which nests the openings from left to right.
 The difference between parallel and nesting occurs for fields with the same name and type:
 \begin{cfa}
+struct S { int ®i®; int j; double m; } s, w;
+struct T { int ®i®; int k; int m; } t, w;
+with ( s, t ) {
+        j + k; §\C{// unambiguous, s.j + t.k}§
+        m = 5.0; §\C{// unambiguous, s.m = 5.0}§
+        m = 1; §\C{// unambiguous, t.m = 1}§
+        int a = m; §\C{// unambiguous, a = t.m}§
+        double b = m; §\C{// unambiguous, b = s.m}§
+        int c = s.i + t.i; §\C{// unambiguous, qualification}§
+        (double)m; §\C{// unambiguous, cast}§
+}
+\end{cfa}
+For parallel semantics, both ©s.i© and ©t.i© are visible, so ©i© is ambiguous without qualification;
+for nested semantics, ©t.i© hides ©s.i©, so ©i© implies ©t.i©.
+\CFA's ability to overload variables means fields with the same name but different types are automatically disambiguated, eliminating most qualification when opening multiple aggregates.
+Qualification or a cast is used to disambiguate.
+There is an interesting problem between parameters and the function-body ©with©, \eg:
+\begin{cfa}
+void ?{}( S & s, int i ) with ( s ) { §\C{// constructor}§
+        ®s.i = i;®  j = 3;  m = 5.5; §\C{// initialize fields}§
+struct Q { int @i@; int k; int @m@; } q, w;
+struct R { int @i@; int j; double @m@; } r, w;
+with ( r, q ) {
+        j + k;                                                                  $\C{// unambiguous, r.j + q.k}$
+        m = 5.0;                                                                $\C{// unambiguous, r.m = 5.0}$
+        m = 1;                                                                  $\C{// unambiguous, q.m = 1}$
+        int a = m;                                                              $\C{// unambiguous, a = q.m}$
+        double b = m;                                                   $\C{// unambiguous, b = r.m}$
+        int c = r.i + q.i;                                              $\C{// disambiguate with qualification}$
+        (double)m;                                                              $\C{// disambiguate with cast}$
+}
+\end{cfa}
+For parallel semantics, both ©r.i© and ©q.i© are visible, so ©i© is ambiguous without qualification;
+for nested semantics, ©q.i© hides ©r.i©, so ©i© implies ©q.i©.
+Pascal nested-semantics is possible by nesting ©with© statements.
+\begin{cfa}
+with ( r ) {
+        i;                                                                              $\C{// unambiguous, r.i}$
+        with ( q ) {
+                i;                                                                      $\C{// unambiguous, q.i}$
+        }
+}
+\end{cfa}
+A cast or qualification can be used to disambiguate variables within a ©with© \emph{statement}.
+A cast can be used to disambiguate among overloaded variables in a ©with© \emph{expression}:
+\begin{cfa}
+with ( w ) { ... }                                                      $\C{// ambiguous, same name and no context}$
+with ( (Q)w ) { ... }                                           $\C{// unambiguous, cast}$
+\end{cfa}
+Because there is no left-side in the ©with© expression to implicitly disambiguate between the ©w© variables, it is necessary to explicitly disambiguate by casting ©w© to type ©Q© or ©R©.
+Finally, there is an interesting problem between parameters and the function-body ©with©, \eg:
+\begin{cfa}
+void ?{}( S & s, int i ) with ( s ) { $\C{// constructor}$
+        @s.i = i;@  j = 3;  m = 5.5; $\C{// initialize fields}$
+}
 \end{cfa}
 …
 and implicitly opened \emph{after} a function-body open, to give them higher priority:
 \begin{cfa}
+void ?{}( S & s, int ®i® ) with ( s ) ®with( §\emph{\color{red}params}§ )® {
+        s.i = ®i®; j = 3; m = 5.5;
+}
+\end{cfa}
+Finally, a cast may be used to disambiguate among overload variables in a ©with© expression:
+\begin{cfa}
+with ( w ) { ... } §\C{// ambiguous, same name and no context}§
+with ( (S)w ) { ... } §\C{// unambiguous, cast}§
+\end{cfa}
+and ©with© expressions may be complex expressions with type reference (see Section~\ref{s:References}) to aggregate:
+% \begin{cfa}
+% struct S { int i, j; } sv;
+% with ( sv ) { §\C{// implicit reference}§
+%       S & sr = sv;
+%       with ( sr ) { §\C{// explicit reference}§
+%               S * sp = &sv;
+%               with ( *sp ) { §\C{// computed reference}§
+%                       i = 3; j = 4; §\C{\color{red}// sp--{\textgreater}i, sp--{\textgreater}j}§
+%               }
+%               i = 2; j = 3; §\C{\color{red}// sr.i, sr.j}§
+%       }
+%       i = 1; j = 2; §\C{\color{red}// sv.i, sv.j}§
+% }
+% \end{cfa}
+In \Index{object-oriented} programming, there is an implicit first parameter, often named \textbf{©self©} or \textbf{©this©}, which is elided.
+\begin{C++}
+class C {
+        int i, j;
+        int mem() { §\C{\color{red}// implicit "this" parameter}§
+                i = 1; §\C{\color{red}// this->i}§
+                j = 2; §\C{\color{red}// this->j}§
+        }
+}
+\end{C++}
+Since \CFA is non-object-oriented, the equivalent object-oriented program looks like:
+\begin{cfa}
+struct S { int i, j; };
+int mem( S & ®this® ) { §\C{// explicit "this" parameter}§
+        ®this.®i = 1; §\C{// "this" is not elided}§
+        ®this.®j = 2;
+}
+\end{cfa}
+but it is cumbersome having to write ``©this.©'' many times in a member.
+\CFA provides a ©with© clause/statement (see Pascal~\cite[\S~4.F]{Pascal}) to elide the "©this.©" by opening a scope containing field identifiers, changing the qualified fields into variables and giving an opportunity for optimizing qualified references.
+\begin{cfa}
+int mem( S & this ) ®with( this )® { §\C{// with clause}§
+        i = 1; §\C{\color{red}// this.i}§
+        j = 2; §\C{\color{red}// this.j}§
+}
+\end{cfa}
+which extends to multiple routine parameters:
+\begin{cfa}
+struct T { double m, n; };
+int mem2( S & this1, T & this2 ) ®with( this1, this2 )® {
+        i = 1; j = 2;
+        m = 1.0; n = 2.0;
+}
+\end{cfa}
+The statement form is used within a block:
+\begin{cfa}
+int foo() {
+        struct S1 { ... } s1;
+        struct S2 { ... } s2;
+        ®with( s1 )® { §\C{// with statement}§
+                // access fields of s1 without qualification
+                ®with s2® { §\C{// nesting}§
+                        // access fields of s1 and s2 without qualification
+                }
+        }
+        ®with s1, s2® {
+                // access unambiguous fields of s1 and s2 without qualification
+        }
+}
+\end{cfa}
+When opening multiple structures, fields with the same name and type are ambiguous and must be fully qualified.
+For fields with the same name but different type, context/cast can be used to disambiguate.
+\begin{cfa}
+struct S { int i; int j; double m; } a, c;
+struct T { int i; int k; int m } b, c;
+with( a, b )
+{
+}
+\end{cfa}
+\begin{comment}
+The components in the "with" clause
+  with a, b, c { ... }
+serve 2 purposes: each component provides a type and object. The type must be a
+structure type. Enumerations are already opened, and I think a union is opened
+to some extent, too. (Or is that just unnamed unions?) The object is the target
+that the naked structure-fields apply to. The components are open in "parallel"
+at the scope of the "with" clause/statement, so opening "a" does not affect
+opening "b", etc. This semantic is different from Pascal, which nests the
+openings.
+Having said the above, it seems reasonable to allow a "with" component to be an
+expression. The type is the static expression-type and the object is the result
+of the expression. Again, the type must be an aggregate. Expressions require
+parenthesis around the components.
+  with( a, b, c ) { ... }
+Does this now make sense?
+Having written more CFA code, it is becoming clear to me that I *really* want
+the "with" to be implemented because I hate having to type all those object
+names for fields. It's a great way to drive people away from the language.
+\end{comment}
+void ?{}( S & s, int @i@ ) with ( s ) @with( $\emph{\R{params}}$ )@ { // syntax not allowed, illustration only
+        s.i = @i@; j = 3; m = 5.5;
+}
+\end{cfa}
+This implicit semantic matches with programmer expectation.
 …
 Non-local transfer can cause stack unwinding, \ie non-local routine termination, depending on the kind of raise.
 \begin{cfa}
 exception_t E {}; §\C{// exception type}§
+exception_t E {}; $\C{// exception type}$
 void f(...) {
         ... throw E{}; ... §\C{// termination}§
         ... throwResume E{}; ... §\C{// resumption}§
+        ... throw E{}; ... $\C{// termination}$
+        ... throwResume E{}; ... $\C{// resumption}$
+}
 try {
         f(...);
 } catch( E e ; §boolean-predicate§ ) {          §\C{// termination handler}§
+} catch( E e ; $boolean-predicate$ ) {          $\C{// termination handler}$
         // recover and continue
 } catchResume( E e ; §boolean-predicate§ ) { §\C{// resumption handler}§
+} catchResume( E e ; $boolean-predicate$ ) { $\C{// resumption handler}$
         // repair and return
 } finally {
 …
 \end{cfa}
 The kind of raise and handler match: ©throw© with ©catch© and ©throwResume© with ©catchResume©.
 Then the exception type must match along with any additonal predicate must be true.
+Then the exception type must match, and any additional predicate must be true.
 The ©catch© and ©catchResume© handlers may appear in any order.
 However, the ©finally© clause must appear at the end of the ©try© statement.
 …
 For example, a routine returning a \Index{pointer} to an array of integers is defined and used in the following way:
 \begin{cfa}
 int ®(*®f®())[®5®]® {...}; §\C{// definition}§
  ... ®(*®f®())[®3®]® += 1; §\C{// usage}§
+int @(*@f@())[@5@]@ {...}; $\C{// definition}$
+ ... @(*@f@())[@3@]@ += 1; $\C{// usage}$
 \end{cfa}
 Essentially, the return type is wrapped around the routine name in successive layers (like an \Index{onion}).
 …
 \begin{tabular}{@{}l@{\hspace{3em}}l@{}}
 \multicolumn{1}{c@{\hspace{3em}}}{\textbf{\CFA}}        & \multicolumn{1}{c}{\textbf{C}}        \\
 \begin{cfa}
 ß[5] *ß ®int® x1;
 ß* [5]ß ®int® x2;
 ß[* [5] int]ß f®( int p )®;
+\begin{cfa}[moredelim={**[is][\color{blue}]{\#}{\#}}]
+#[5] *# @int@ x1;
+#* [5]# @int@ x2;
+#[* [5] int]# f@( int p )@;
 \end{cfa}
+&
 \begin{cfa}
 ®int® ß*ß x1 ß[5]ß;
 ®int® ß(*ßx2ß)[5]ß;
 ßint (*ßf®( int p )®ß)[5]ß;
+\begin{cfa}[moredelim={**[is][\color{blue}]{\#}{\#}}]
+@int@ #*# x1 #[5]#;
+@int@ #(*#x2#)[5]#;
+#int (*#f@( int p )@#)[5]#;
 \end{cfa}
 \end{tabular}
 …
 \multicolumn{1}{c@{\hspace{3em}}}{\textbf{\CFA}}        & \multicolumn{1}{c}{\textbf{C}}        \\
 \begin{cfa}
 ®*® int x, y;
+@*@ int x, y;
 \end{cfa}
+&
 \begin{cfa}
 int ®*®x, ®*®y;
+int @*@x, @*@y;
 \end{cfa}
 \end{tabular}
 …
 \multicolumn{1}{c@{\hspace{3em}}}{\textbf{\CFA}}        & \multicolumn{1}{c}{\textbf{C}}        \\
 \begin{cfa}
 ®*® int x;
+@*@ int x;
 int y;
 \end{cfa}
+&
 \begin{cfa}
 int ®*®x, y;
+int @*@x, y;
 \end{cfa}
 …
 \section{Pointer / Reference}
+\label{s:PointerReference}
 C provides a \newterm{pointer type};
 …
+&
 \begin{cfa}
 int * ®const® x = (int *)100
+int * @const@ x = (int *)100;
 *x = 3;                 // implicit dereference
 int * ®const® y = (int *)104;
+int * @const@ y = (int *)104;
 *y = *x;                        // implicit dereference
 \end{cfa}
 …
 \begin{tabular}{@{}l@{\hspace{2em}}l@{}}
 \begin{cfa}
 int x, y, ®*® p1, ®*® p2, ®**® p3;
 p1 = ®&®x;     // p1 points to x
+int x, y, @*@ p1, @*@ p2, @**@ p3;
+p1 = @&@x;     // p1 points to x
 p2 = p1;     // p2 points to x
 p1 = ®&®y;     // p1 points to y
+p1 = @&@y;     // p1 points to y
 p3 = &p2;  // p3 points to p2
 \end{cfa}
 …
 For example, \Index*{Algol68}~\cite{Algol68} infers pointer dereferencing to select the best meaning for each pointer usage
 \begin{cfa}
 p2 = p1 + x; §\C{// compiler infers *p2 = *p1 + x;}§
+p2 = p1 + x; $\C{// compiler infers *p2 = *p1 + x;}$
 \end{cfa}
 Algol68 infers the following dereferencing ©*p2 = *p1 + x©, because adding the arbitrary integer value in ©x© to the address of ©p1© and storing the resulting address into ©p2© is an unlikely operation.
 …
 In C, objects of pointer type always manipulate the pointer object's address:
 \begin{cfa}
 p1 = p2; §\C{// p1 = p2\ \ rather than\ \ *p1 = *p2}§
 p2 = p1 + x; §\C{// p2 = p1 + x\ \ rather than\ \ *p2 = *p1 + x}§
+p1 = p2; $\C{// p1 = p2\ \ rather than\ \ *p1 = *p2}$
+p2 = p1 + x; $\C{// p2 = p1 + x\ \ rather than\ \ *p2 = *p1 + x}$
 \end{cfa}
 even though the assignment to ©p2© is likely incorrect, and the programmer probably meant:
 \begin{cfa}
 p1 = p2; §\C{// pointer address assignment}§
+®*®p2 = ®*®p1 + x; §\C{// pointed-to value assignment / operation}§
+p1 = p2; $\C{// pointer address assignment}$
+@*@p2 = @*@p1 + x; $\C{// pointed-to value assignment / operation}$
 \end{cfa}
 The C semantics work well for situations where manipulation of addresses is the primary meaning and data is rarely accessed, such as storage management (©malloc©/©free©).
 …
 To support this common case, a reference type is introduced in \CFA, denoted by ©&©, which is the opposite dereference semantics to a pointer type, making the value at the pointed-to location the implicit semantics for dereferencing (similar but not the same as \CC \Index{reference type}s).
 \begin{cfa}
 int x, y, ®&® r1, ®&® r2, ®&&® r3;
+®&®r1 = &x; §\C{// r1 points to x}§
+®&®r2 = &r1; §\C{// r2 points to x}§
+®&®r1 = &y; §\C{// r1 points to y}§
+®&&®r3 = ®&®&r2; §\C{// r3 points to r2}§
 r2 = ((r1 + r2) * (r3 - r1)) / (r3 - 15); §\C{// implicit dereferencing}§
+int x, y, @&@ r1, @&@ r2, @&&@ r3;
+@&@r1 = &x; $\C{// r1 points to x}$
+@&@r2 = &r1; $\C{// r2 points to x}$
+@&@r1 = &y; $\C{// r1 points to y}$
+@&&@r3 = @&@&r2; $\C{// r3 points to r2}$
+r2 = ((r1 + r2) * (r3 - r1)) / (r3 - 15); $\C{// implicit dereferencing}$
 \end{cfa}
 Except for auto-dereferencing by the compiler, this reference example is the same as the previous pointer example.
 …
 One way to conceptualize a reference is via a rewrite rule, where the compiler inserts a dereference operator before the reference variable for each reference qualifier in a declaration, so the previous example becomes:
 \begin{cfa}
 ®*®r2 = ((®*®r1 + ®*®r2) ®*® (®**®r3 - ®*®r1)) / (®**®r3 - 15);
+@*@r2 = ((@*@r1 + @*@r2) @*@ (@**@r3 - @*@r1)) / (@**@r3 - 15);
 \end{cfa}
 When a reference operation appears beside a dereference operation, \eg ©&*©, they cancel out.
 …
 For a \CFA reference type, the cancellation on the left-hand side of assignment leaves the reference as an address (\Index{lvalue}):
 \begin{cfa}
 (&®*®)r1 = &x; §\C{// (\&*) cancel giving address in r1 not variable pointed-to by r1}§
+(&@*@)r1 = &x; $\C{// (\&*) cancel giving address in r1 not variable pointed-to by r1}$
 \end{cfa}
 Similarly, the address of a reference can be obtained for assignment or computation (\Index{rvalue}):
 \begin{cfa}
 (&(&®*®)®*®)r3 = &(&®*®)r2; §\C{// (\&*) cancel giving address in r2, (\&(\&*)*) cancel giving address in r3}§
+(&(&@*@)@*@)r3 = &(&@*@)r2; $\C{// (\&*) cancel giving address in r2, (\&(\&*)*) cancel giving address in r3}$
 \end{cfa}
 Cancellation\index{cancellation!pointer/reference}\index{pointer!cancellation} works to arbitrary depth.
 …
 int x, *p1 = &x, **p2 = &p1, ***p3 = &p2,
                  &r1 = x,    &&r2 = r1,   &&&r3 = r2;
 ***p3 = 3; §\C{// change x}§
 r3 = 3; §\C{// change x, ***r3}§
 **p3 = ...; §\C{// change p1}§
 &r3 = ...; §\C{// change r1, (\&*)**r3, 1 cancellation}§
 *p3 = ...; §\C{// change p2}§
 &&r3 = ...; §\C{// change r2, (\&(\&*)*)*r3, 2 cancellations}§
 &&&r3 = p3; §\C{// change r3 to p3, (\&(\&(\&*)*)*)r3, 3 cancellations}§
+***p3 = 3; $\C{// change x}$
+r3 = 3; $\C{// change x, ***r3}$
+**p3 = ...; $\C{// change p1}$
+&r3 = ...; $\C{// change r1, (\&*)**r3, 1 cancellation}$
+*p3 = ...; $\C{// change p2}$
+&&r3 = ...; $\C{// change r2, (\&(\&*)*)*r3, 2 cancellations}$
+&&&r3 = p3; $\C{// change r3 to p3, (\&(\&(\&*)*)*)r3, 3 cancellations}$
 \end{cfa}
 Furthermore, both types are equally performant, as the same amount of dereferencing occurs for both types.
 …
 As for a pointer type, a reference type may have qualifiers:
 \begin{cfa}
 const int cx = 5; §\C{// cannot change cx;}§
 const int & cr = cx; §\C{// cannot change what cr points to}§
+®&®cr = &cx; §\C{// can change cr}§
 cr = 7; §\C{// error, cannot change cx}§
 int & const rc = x; §\C{// must be initialized}§
+®&®rc = &x; §\C{// error, cannot change rc}§
 const int & const crc = cx; §\C{// must be initialized}§
 crc = 7; §\C{// error, cannot change cx}§
+®&®crc = &cx; §\C{// error, cannot change crc}§
+const int cx = 5; $\C{// cannot change cx;}$
+const int & cr = cx; $\C{// cannot change what cr points to}$
+@&@cr = &cx; $\C{// can change cr}$
+cr = 7; $\C{// error, cannot change cx}$
+int & const rc = x; $\C{// must be initialized}$
+@&@rc = &x; $\C{// error, cannot change rc}$
+const int & const crc = cx; $\C{// must be initialized}$
+crc = 7; $\C{// error, cannot change cx}$
+@&@crc = &cx; $\C{// error, cannot change crc}$
 \end{cfa}
 Hence, for type ©& const©, there is no pointer assignment, so ©&rc = &x© is disallowed, and \emph{the address value cannot be the null pointer unless an arbitrary pointer is coerced\index{coercion} into the reference}:
 \begin{cfa}
 int & const cr = *0; §\C{// where 0 is the int * zero}§
+int & const cr = *0; $\C{// where 0 is the int * zero}$
 \end{cfa}
 Note, constant reference-types do not prevent \Index{addressing errors} because of explicit storage-management:
 …
 cr = 5;
 free( &cr );
 cr = 7; §\C{// unsound pointer dereference}§
+cr = 7; $\C{// unsound pointer dereference}$
 \end{cfa}
 The position of the ©const© qualifier \emph{after} the pointer/reference qualifier causes confusion for C programmers.
 The ©const© qualifier cannot be moved before the pointer/reference qualifier for C style-declarations;
 \CFA-style declarations (see \VRef{s:AlternativeDeclarations}) attempt to address this issue:
+\CFA-style declarations \see{\VRef{s:AlternativeDeclarations}} attempt to address this issue:
 \begin{cquote}
 \begin{tabular}{@{}l@{\hspace{3em}}l@{}}
 \multicolumn{1}{c@{\hspace{3em}}}{\textbf{\CFA}}        & \multicolumn{1}{c}{\textbf{C}}        \\
 \begin{cfa}
 ®const® * ®const® * const int ccp;
 ®const® & ®const® & const int ccr;
+@const@ * @const@ * const int ccp;
+@const@ & @const@ & const int ccr;
 \end{cfa}
+&
 \begin{cfa}
 const int * ®const® * ®const® ccp;
+const int * @const@ * @const@ ccp;
 \end{cfa}
 …
 Finally, like pointers, references are usable and composable with other type operators and generators.
 \begin{cfa}
 int w, x, y, z, & ar[3] = { x, y, z }; §\C{// initialize array of references}§
 &ar[1] = &w; §\C{// change reference array element}§
 typeof( ar[1] ) p; §\C{// (gcc) is int, \ie the type of referenced object}§
 typeof( &ar[1] ) q; §\C{// (gcc) is int \&, \ie the type of reference}§
 sizeof( ar[1] ) == sizeof( int ); §\C{// is true, \ie the size of referenced object}§
 sizeof( &ar[1] ) == sizeof( int *) §\C{// is true, \ie the size of a reference}§
+int w, x, y, z, & ar[3] = { x, y, z }; $\C{// initialize array of references}$
+&ar[1] = &w; $\C{// change reference array element}$
+typeof( ar[1] ) p; $\C{// (gcc) is int, \ie the type of referenced object}$
+typeof( &ar[1] ) q; $\C{// (gcc) is int \&, \ie the type of reference}$
+sizeof( ar[1] ) == sizeof( int ); $\C{// is true, \ie the size of referenced object}$
+sizeof( &ar[1] ) == sizeof( int *) $\C{// is true, \ie the size of a reference}$
 \end{cfa}
 In contrast to \CFA reference types, \Index*[C++]{\CC{}}'s reference types are all ©const© references, preventing changes to the reference address, so only value assignment is possible, which eliminates half of the \Index{address duality}.
 Also, \CC does not allow \Index{array}s\index{array!reference} of reference\footnote{
 The reason for disallowing arrays of reference is unknown, but possibly comes from references being ethereal (like a textual macro), and hence, replaceable by the referant object.}
+The reason for disallowing arrays of reference is unknown, but possibly comes from references being ethereal (like a textual macro), and hence, replaceable by the referent object.}
 \Index*{Java}'s reference types to objects (all Java objects are on the heap) are like C pointers, which always manipulate the address, and there is no (bit-wise) object assignment, so objects are explicitly cloned by shallow or deep copying, which eliminates half of the address duality.
 …
 Therefore, for pointer/reference initialization, the initializing value must be an address not a value.
 \begin{cfa}
 int * p = &x; §\C{// assign address of x}§
+®int * p = x;® §\C{// assign value of x}§
 int & r = x; §\C{// must have address of x}§
+int * p = &x; $\C{// assign address of x}$
+@int * p = x;@ $\C{// assign value of x}$
+int & r = x; $\C{// must have address of x}$
 \end{cfa}
 Like the previous example with C pointer-arithmetic, it is unlikely assigning the value of ©x© into a pointer is meaningful (again, a warning is usually given).
 …
 Similarly, when a reference type is used for a parameter/return type, the call-site argument does not require a reference operator for the same reason.
 \begin{cfa}
 int & f( int & r ); §\C{// reference parameter and return}§
 z = f( x ) + f( y ); §\C{// reference operator added, temporaries needed for call results}§
+int & f( int & r ); $\C{// reference parameter and return}$
+z = f( x ) + f( y ); $\C{// reference operator added, temporaries needed for call results}$
 \end{cfa}
 Within routine ©f©, it is possible to change the argument by changing the corresponding parameter, and parameter ©r© can be locally reassigned within ©f©.
 …
 When a pointer/reference parameter has a ©const© value (immutable), it is possible to pass literals and expressions.
 \begin{cfa}
 void f( ®const® int & cr );
 void g( ®const® int * cp );
 f( 3 );                   g( ®&®3 );
 f( x + y );             g( ®&®(x + y) );
+void f( @const@ int & cr );
+void g( @const@ int * cp );
+f( 3 );                   g( @&@3 );
+f( x + y );             g( @&@(x + y) );
 \end{cfa}
 Here, the compiler passes the address to the literal 3 or the temporary for the expression ©x + y©, knowing the argument cannot be changed through the parameter.
 …
 void f( int & r );
 void g( int * p );
 f( 3 );                   g( ®&®3 ); §\C{// compiler implicit generates temporaries}§
 f( x + y );             g( ®&®(x + y) ); §\C{// compiler implicit generates temporaries}§
+f( 3 );                   g( @&@3 ); $\C{// compiler implicitly generates temporaries}$
+f( x + y );             g( @&@(x + y) ); $\C{// compiler implicitly generates temporaries}$
 \end{cfa}
 Essentially, there is an implicit \Index{rvalue} to \Index{lvalue} conversion in this case.\footnote{
 …
 \begin{cfa}
 void f( int i );
 void (* fp)( int ); §\C{// routine pointer}§
 fp = f; §\C{// reference initialization}§
 fp = &f; §\C{// pointer initialization}§
 fp = *f; §\C{// reference initialization}§
 fp(3); §\C{// reference invocation}§
 (*fp)(3); §\C{// pointer invocation}§
+void (* fp)( int ); $\C{// routine pointer}$
+fp = f; $\C{// reference initialization}$
+fp = &f; $\C{// pointer initialization}$
+fp = *f; $\C{// reference initialization}$
+fp(3); $\C{// reference invocation}$
+(*fp)(3); $\C{// pointer invocation}$
 \end{cfa}
 While C's treatment of routine objects has similarity to inferring a reference type in initialization contexts, the examples are assignment not initialization, and all forms of assignment are possible (©f©, ©&f©, ©*f©) without regard for type.
 Instead, a routine object should be referenced by a ©const© reference:
 \begin{cfa}
+®const® void (®&® fr)( int ) = f; §\C{// routine reference}§
 fr = ... §\C{// error, cannot change code}§
 &fr = ...; §\C{// changing routine reference}§
 fr( 3 ); §\C{// reference call to f}§
 (*fr)(3); §\C{// error, incorrect type}§
+@const@ void (@&@ fr)( int ) = f; $\C{// routine reference}$
+fr = ... $\C{// error, cannot change code}$
+&fr = ...; $\C{// changing routine reference}$
+fr( 3 ); $\C{// reference call to f}$
+(*fr)(3); $\C{// error, incorrect type}$
 \end{cfa}
 because the value of the routine object is a routine literal, \ie the routine code is normally immutable during execution.\footnote{
 …
 \begin{itemize}
 \item
 if ©R© is an \Index{rvalue} of type ©T &©$_1\cdots$ ©&©$_r$, where $r \ge 1$ references (©&© symbols), than ©&R© has type ©T ®*®&©$_{\color{red}2}\cdots$ ©&©$_{\color{red}r}$, \ie ©T© pointer with $r-1$ references (©&© symbols).
 \item
 if ©L© is an \Index{lvalue} of type ©T &©$_1\cdots$ ©&©$_l$, where $l \ge 0$ references (©&© symbols), than ©&L© has type ©T ®*®&©$_{\color{red}1}\cdots$ ©&©$_{\color{red}l}$, \ie ©T© pointer with $l$ references (©&© symbols).
+if ©R© is an \Index{rvalue} of type ©T &©$_1\cdots$ ©&©$_r$, where $r \ge 1$ references (©&© symbols), then ©&R© has type ©T ©\R{©*©}©&©\R{$_2$}$\cdots$ ©&©\R{$_r$}, \ie ©T© pointer with $r-1$ references (©&© symbols).
+\item
+if ©L© is an \Index{lvalue} of type ©T &©$_1\cdots$ ©&©$_l$, where $l \ge 0$ references (©&© symbols), then ©&L© has type ©T ©\R{©*©}©&©\R{$_1$}$\cdots$ ©&©\R{$_l$}, \ie ©T© pointer with $l$ references (©&© symbols).
 \end{itemize}
 The following example shows the first rule applied to different \Index{rvalue} contexts:
 …
 int x, * px, ** ppx, *** pppx, **** ppppx;
 int & rx = x, && rrx = rx, &&& rrrx = rrx ;
 x = rrrx; §\C[2.0in]{// rrrx is an lvalue with type int \&\&\& (equivalent to x)}§
 px = &rrrx; §\C{// starting from rrrx, \&rrrx is an rvalue with type int *\&\&\& (\&x)}§
 ppx = &&rrrx; §\C{// starting from \&rrrx, \&\&rrrx is an rvalue with type int **\&\& (\&rx)}§
 pppx = &&&rrrx; §\C{// starting from \&\&rrrx, \&\&\&rrrx is an rvalue with type int ***\& (\&rrx)}§
 ppppx = &&&&rrrx; §\C{// starting from \&\&\&rrrx, \&\&\&\&rrrx is an rvalue with type int **** (\&rrrx)}§
+x = rrrx; $\C[2.0in]{// rrrx is an lvalue with type int \&\&\& (equivalent to x)}$
+px = &rrrx; $\C{// starting from rrrx, \&rrrx is an rvalue with type int *\&\&\& (\&x)}$
+ppx = &&rrrx; $\C{// starting from \&rrrx, \&\&rrrx is an rvalue with type int **\&\& (\&rx)}$
+pppx = &&&rrrx; $\C{// starting from \&\&rrrx, \&\&\&rrrx is an rvalue with type int ***\& (\&rrx)}$
+ppppx = &&&&rrrx; $\C{// starting from \&\&\&rrrx, \&\&\&\&rrrx is an rvalue with type int **** (\&rrrx)}$
 \end{cfa}
 The following example shows the second rule applied to different \Index{lvalue} contexts:
 …
 int x, * px, ** ppx, *** pppx;
 int & rx = x, && rrx = rx, &&& rrrx = rrx ;
 rrrx = 2; §\C{// rrrx is an lvalue with type int \&\&\& (equivalent to x)}§
 &rrrx = px; §\C{// starting from rrrx, \&rrrx is an rvalue with type int *\&\&\& (rx)}§
 &&rrrx = ppx; §\C{// starting from \&rrrx, \&\&rrrx is an rvalue with type int **\&\& (rrx)}§
 &&&rrrx = pppx; §\C{// starting from \&\&rrrx, \&\&\&rrrx is an rvalue with type int ***\& (rrrx)}\CRT§
+rrrx = 2; $\C{// rrrx is an lvalue with type int \&\&\& (equivalent to x)}$
+&rrrx = px; $\C{// starting from rrrx, \&rrrx is an rvalue with type int *\&\&\& (rx)}$
+&&rrrx = ppx; $\C{// starting from \&rrrx, \&\&rrrx is an rvalue with type int **\&\& (rrx)}$
+&&&rrrx = pppx; $\C{// starting from \&\&rrrx, \&\&\&rrrx is an rvalue with type int ***\& (rrrx)}\CRT$
 \end{cfa}
 …
 \begin{cfa}
 int x;
 x + 1; §\C[2.0in]{// lvalue variable (int) converts to rvalue for expression}§
+x + 1; $\C[2.0in]{// lvalue variable (int) converts to rvalue for expression}$
 \end{cfa}
 An rvalue has no type qualifiers (©cv©), so the lvalue qualifiers are dropped.
 …
 \begin{cfa}
 int x, &r = x, f( int p );
 x = ®r® + f( ®r® ); §\C{// lvalue reference converts to rvalue}§
+x = @r@ + f( @r@ ); $\C{// lvalue reference converts to rvalue}$
 \end{cfa}
 An rvalue has no type qualifiers (©cv©), so the reference qualifiers are dropped.
 …
 lvalue to reference conversion: \lstinline[deletekeywords=lvalue]@lvalue-type cv1 T@ converts to ©cv2 T &©, which allows implicitly converting variables to references.
 \begin{cfa}
 int x, &r = ®x®, f( int & p ); §\C{// lvalue variable (int) convert to reference (int \&)}§
 f( ®x® ); §\C{// lvalue variable (int) convert to reference (int \&)}§
+int x, &r = @x@, f( int & p ); $\C{// lvalue variable (int) convert to reference (int \&)}$
+f( @x@ ); $\C{// lvalue variable (int) convert to reference (int \&)}$
 \end{cfa}
 Conversion can restrict a type, where ©cv1© $\le$ ©cv2©, \eg passing an ©int© to a ©const volatile int &©, which has low cost.
 …
 \begin{cfa}
 int x, & f( int & p );
 f( ®x + 3® );   §\C[1.5in]{// rvalue parameter (int) implicitly converts to lvalue temporary reference (int \&)}§
+®&f®(...) = &x; §\C{// rvalue result (int \&) implicitly converts to lvalue temporary reference (int \&)}\CRT§
+f( @x + 3@ );   $\C[1.5in]{// rvalue parameter (int) implicitly converts to lvalue temporary reference (int \&)}$
+@&f@(...) = &x; $\C{// rvalue result (int \&) implicitly converts to lvalue temporary reference (int \&)}\CRT$
 \end{cfa}
 In both cases, modifications to the temporary are inaccessible (\Index{warning}).
 …
 The point of the new syntax is to allow returning multiple values from a routine~\cite{Galletly96,CLU}, \eg:
 \begin{cfa}
 ®[ int o1, int o2, char o3 ]® f( int i1, char i2, char i3 ) {
         §\emph{routine body}§
+@[ int o1, int o2, char o3 ]@ f( int i1, char i2, char i3 ) {
+        $\emph{routine body}$
+}
 \end{cfa}
 …
 Declaration qualifiers can only appear at the start of a routine definition, \eg:
 \begin{cfa}
 ®extern® [ int x ] g( int y ) {§\,§}
+@extern@ [ int x ] g( int y ) {$\,$}
 \end{cfa}
 Lastly, if there are no output parameters or input parameters, the brackets and/or parentheses must still be specified;
 in both cases the type is assumed to be void as opposed to old style C defaults of int return type and unknown parameter types, respectively, as in:
 \begin{cfa}
 [§\,§] g(); §\C{// no input or output parameters}§
 [ void ] g( void ); §\C{// no input or output parameters}§
+[$\,$] g(); $\C{// no input or output parameters}$
+[ void ] g( void ); $\C{// no input or output parameters}$
 \end{cfa}
 …
 \begin{cfa}
 typedef int foo;
 int f( int (* foo) ); §\C{// foo is redefined as a parameter name}§
+int f( int (* foo) ); $\C{// foo is redefined as a parameter name}$
 \end{cfa}
 The string ``©int (* foo)©'' declares a C-style named-parameter of type pointer to an integer (the parentheses are superfluous), while the same string declares a \CFA style unnamed parameter of type routine returning integer with unnamed parameter of type pointer to foo.
 …
 C-style declarations can be used to declare parameters for \CFA style routine definitions, \eg:
 \begin{cfa}
 [ int ] f( * int, int * ); §\C{// returns an integer, accepts 2 pointers to integers}§
 [ * int, int * ] f( int ); §\C{// returns 2 pointers to integers, accepts an integer}§
+[ int ] f( * int, int * ); $\C{// returns an integer, accepts 2 pointers to integers}$
+[ * int, int * ] f( int ); $\C{// returns 2 pointers to integers, accepts an integer}$
 \end{cfa}
 The reason for allowing both declaration styles in the new context is for backwards compatibility with existing preprocessor macros that generate C-style declaration-syntax, as in:
 \begin{cfa}
 #define ptoa( n, d ) int (*n)[ d ]
 int f( ptoa( p, 5 ) ) ... §\C{// expands to int f( int (*p)[ 5 ] )}§
 [ int ] f( ptoa( p, 5 ) ) ... §\C{// expands to [ int ] f( int (*p)[ 5 ] )}§
+int f( ptoa( p, 5 ) ) ... $\C{// expands to int f( int (*p)[ 5 ] )}$
+[ int ] f( ptoa( p, 5 ) ) ... $\C{// expands to [ int ] f( int (*p)[ 5 ] )}$
 \end{cfa}
 Again, programmers are highly encouraged to use one declaration form or the other, rather than mixing the forms.
 …
 \begin{minipage}{\linewidth}
 \begin{cfa}
 ®[ int x, int y ]® f() {
+@[ int x, int y ]@ f() {
         int z;
         ... x = 0; ... y = z; ...
         ®return;® §\C{// implicitly return x, y}§
+        @return;@ $\C{// implicitly return x, y}$
+}
 \end{cfa}
 …
 [ int x, int y ] f() {
         ...
 } §\C{// implicitly return x, y}§
+} $\C{// implicitly return x, y}$
 \end{cfa}
 In this case, the current values of ©x© and ©y© are returned to the calling routine just as if a ©return© had been encountered.
 …
 [ int x, int y ] f( int x, int y ) {
         ...
 } §\C{// implicitly return x, y}§
+} $\C{// implicitly return x, y}$
 \end{cfa}
 This notation allows the compiler to eliminate temporary variables in nested routine calls.
 \begin{cfa}
 [ int x, int y ] f( int, x, int y ); §\C{// prototype declaration}§
+[ int x, int y ] f( int x, int y ); $\C{// prototype declaration}$
 int a, b;
 [a, b] = f( f( f( a, b ) ) );
 …
 as well, parameter names are optional, \eg:
 \begin{cfa}
 [ int x ] f (); §\C{// returning int with no parameters}§
 [ * int ] g (int y); §\C{// returning pointer to int with int parameter}§
 [ ] h ( int, char ); §\C{// returning no result with int and char parameters}§
 [ * int, int ] j ( int ); §\C{// returning pointer to int and int, with int parameter}§
+[ int x ] f (); $\C{// returning int with no parameters}$
+[ * int ] g (int y); $\C{// returning pointer to int with int parameter}$
+[ ] h ( int, char ); $\C{// returning no result with int and char parameters}$
+[ * int, int ] j ( int ); $\C{// returning pointer to int and int, with int parameter}$
 \end{cfa}
 This syntax allows a prototype declaration to be created by cutting and pasting source text from the routine definition header (or vice versa).
 Like C, it is possible to declare multiple routine-prototypes in a single declaration, where the return type is distributed across \emph{all} routine names in the declaration list (see~\VRef{s:AlternativeDeclarations}), \eg:
+Like C, it is possible to declare multiple routine-prototypes in a single declaration, where the return type is distributed across \emph{all} routine names in the declaration list \see{\VRef{s:AlternativeDeclarations}}, \eg:
 \begin{cfa}
 C :             const double bar1(), bar2( int ), bar3( double );
 §\CFA§: [const double] foo(), foo( int ), foo( double ) { return 3.0; }
+$\CFA$: [const double] foo(), foo( int ), foo( double ) { return 3.0; }
 \end{cfa}
 \CFA allows the last routine in the list to define its body.
 …
 The syntax for pointers to \CFA routines specifies the pointer name on the right, \eg:
 \begin{cfa}
 * [ int x ] () fp; §\C{// pointer to routine returning int with no parameters}§
 * [ * int ] (int y) gp; §\C{// pointer to routine returning pointer to int with int parameter}§
 * [ ] (int,char) hp; §\C{// pointer to routine returning no result with int and char parameters}§
 * [ * int,int ] ( int ) jp; §\C{// pointer to routine returning pointer to int and int, with int parameter}§
+* [ int x ] () fp; $\C[2.25in]{// pointer to routine returning int with no parameters}$
+* [ * int ] (int y) gp; $\C{// pointer to routine returning pointer to int with int parameter}$
+* [ ] (int,char) hp; $\C{// pointer to routine returning no result with int and char parameters}$
+* [ * int,int ] ( int ) jp; $\C{// pointer to routine returning pointer to int and int, with int parameter}\CRT$
 \end{cfa}
 While parameter names are optional, \emph{a routine name cannot be specified};
 for example, the following is incorrect:
 \begin{cfa}
 * [ int x ] f () fp; §\C{// routine name "f" is not allowed}§
+* [ int x ] f () fp; $\C{// routine name "f" is not allowed}$
 \end{cfa}
 …
 whereas a named (keyword) call may be:
 \begin{cfa}
 p( z : 3, x : 4, y : 7 );  §\C{// rewrite $\Rightarrow$ p( 4, 7, 3 )}§
+p( z : 3, x : 4, y : 7 );  $\C{// rewrite \(\Rightarrow\) p( 4, 7, 3 )}$
 \end{cfa}
 Here the order of the arguments is unimportant, and the names of the parameters are used to associate argument values with the corresponding parameters.
 …
 For example, the following routine prototypes and definition are all valid.
 \begin{cfa}
 void p( int, int, int ); §\C{// equivalent prototypes}§
+void p( int, int, int ); $\C{// equivalent prototypes}$
 void p( int x, int y, int z );
 void p( int y, int x, int z );
 void p( int z, int y, int x );
 void p( int q, int r, int s ) {} §\C{// match with this definition}§
+void p( int q, int r, int s ) {} $\C{// match with this definition}$
 \end{cfa}
 Forcing matching parameter names in routine prototypes with corresponding routine definitions is possible, but goes against a strong tradition in C programming.
 …
 int f( int x, double y );
 f( j : 3, i : 4 ); §\C{// 1st f}§
 f( x : 7, y : 8.1 ); §\C{// 2nd f}§
 f( 4, 5 );  §\C{// ambiguous call}§
+f( j : 3, i : 4 ); $\C{// 1st f}$
+f( x : 7, y : 8.1 ); $\C{// 2nd f}$
+f( 4, 5 );  $\C{// ambiguous call}$
 \end{cfa}
 However, named arguments compound routine resolution in conjunction with conversions:
 \begin{cfa}
 f( i : 3, 5.7 ); §\C{// ambiguous call ?}§
+f( i : 3, 5.7 ); $\C{// ambiguous call ?}$
 \end{cfa}
 Depending on the cost associated with named arguments, this call could be resolvable or ambiguous.
 …
 the allowable positional calls are:
 \begin{cfa}
 p(); §\C{// rewrite $\Rightarrow$ p( 1, 2, 3 )}§
 p( 4 ); §\C{// rewrite $\Rightarrow$ p( 4, 2, 3 )}§
 p( 4, 4 ); §\C{// rewrite $\Rightarrow$ p( 4, 4, 3 )}§
 p( 4, 4, 4 ); §\C{// rewrite $\Rightarrow$ p( 4, 4, 4 )}§
+p(); $\C{// rewrite \(\Rightarrow\) p( 1, 2, 3 )}$
+p( 4 ); $\C{// rewrite \(\Rightarrow\) p( 4, 2, 3 )}$
+p( 4, 4 ); $\C{// rewrite \(\Rightarrow\) p( 4, 4, 3 )}$
+p( 4, 4, 4 ); $\C{// rewrite \(\Rightarrow\) p( 4, 4, 4 )}$
 // empty arguments
 p(  , 4, 4 ); §\C{// rewrite $\Rightarrow$ p( 1, 4, 4 )}§
 p( 4,  , 4 ); §\C{// rewrite $\Rightarrow$ p( 4, 2, 4 )}§
 p( 4, 4,   ); §\C{// rewrite $\Rightarrow$ p( 4, 4, 3 )}§
 p( 4,  ,   ); §\C{// rewrite $\Rightarrow$ p( 4, 2, 3 )}§
 p(  , 4,   ); §\C{// rewrite $\Rightarrow$ p( 1, 4, 3 )}§
 p(  ,  , 4 ); §\C{// rewrite $\Rightarrow$ p( 1, 2, 4 )}§
 p(  ,  ,   ); §\C{// rewrite $\Rightarrow$ p( 1, 2, 3 )}§
+p(  , 4, 4 ); $\C{// rewrite \(\Rightarrow\) p( 1, 4, 4 )}$
+p( 4,  , 4 ); $\C{// rewrite \(\Rightarrow\) p( 4, 2, 4 )}$
+p( 4, 4,   ); $\C{// rewrite \(\Rightarrow\) p( 4, 4, 3 )}$
+p( 4,  ,   ); $\C{// rewrite \(\Rightarrow\) p( 4, 2, 3 )}$
+p(  , 4,   ); $\C{// rewrite \(\Rightarrow\) p( 1, 4, 3 )}$
+p(  ,  , 4 ); $\C{// rewrite \(\Rightarrow\) p( 1, 2, 4 )}$
+p(  ,  ,   ); $\C{// rewrite \(\Rightarrow\) p( 1, 2, 3 )}$
 \end{cfa}
 Here the missing arguments are inserted from the default values in the parameter list.
 …
 Default values may only appear in a prototype versus definition context:
 \begin{cfa}
 void p( int x, int y = 2, int z = 3 ); §\C{// prototype: allowed}§
 void p( int, int = 2, int = 3 ); §\C{// prototype: allowed}§
 void p( int x, int y = 2, int z = 3 ) {} §\C{// definition: not allowed}§
+void p( int x, int y = 2, int z = 3 ); $\C{// prototype: allowed}$
+void p( int, int = 2, int = 3 ); $\C{// prototype: allowed}$
+void p( int x, int y = 2, int z = 3 ) {} $\C{// definition: not allowed}$
 \end{cfa}
 The reason for this restriction is to allow separate compilation.
 …
 \begin{cfa}
 p( int x, int y, int z, ... );
 p( 1, 4, 5, 6, z : 3, y : 2 ); §\C{// assume p( /* positional */, ... , /* named */ );}§
 p( 1, z : 3, y : 2, 4, 5, 6 ); §\C{// assume p( /* positional */, /* named */, ... );}§
+p( 1, 4, 5, 6, z : 3, y : 2 ); $\C{// assume p( /* positional */, ... , /* named */ );}$
+p( 1, z : 3, y : 2, 4, 5, 6 ); $\C{// assume p( /* positional */, /* named */, ... );}$
 \end{cfa}
 In the first call, it is necessary for the programmer to conceptually rewrite the call, changing named arguments into positional, before knowing where the ellipse arguments begin.
 …
 \begin{cfa}
 void p( int x, int y = 2, int z = 3, ... );
 p( 1, 4, 5, 6, z : 3 ); §\C{// assume p( /* positional */, ... , /* named */ );}§
 p( 1, z : 3, 4, 5, 6 ); §\C{// assume p( /* positional */, /* named */, ... );}§
+p( 1, 4, 5, 6, z : 3 ); $\C{// assume p( /* positional */, ... , /* named */ );}$
+p( 1, z : 3, 4, 5, 6 ); $\C{// assume p( /* positional */, /* named */, ... );}$
 \end{cfa}
 The first call is an error because arguments 4 and 5 are actually positional not ellipse arguments;
 …
 In the second call, the default value for y is implicitly inserted after argument 1 and the named arguments separate the positional and ellipse arguments, making it trivial to read the call.
 For these reasons, \CFA requires named arguments before ellipse arguments.
 Finally, while ellipse arguments are needed for a small set of existing C routines, like printf, the extended \CFA type system largely eliminates the need for ellipse arguments (see Section 24), making much of this discussion moot.
 Default arguments and overloading (see Section 24) are complementary.
+Finally, while ellipse arguments are needed for a small set of existing C routines, like ©printf©, the extended \CFA type system largely eliminates the need for ellipse arguments \see{\VRef{s:Overloading}}, making much of this discussion moot.
+Default arguments and overloading \see{\VRef{s:Overloading}} are complementary.
 While in theory default arguments can be simulated with overloading, as in:
 \begin{cquote}
 …
 Furthermore, overloading cannot handle accessing default arguments in the middle of a positional list, via a missing argument, such as:
 \begin{cfa}
 p( 1, /* default */, 5 ); §\C{// rewrite $\Rightarrow$ p( 1, 2, 5 )}§
+p( 1, /* default */, 5 ); $\C{// rewrite \(\Rightarrow\) p( 1, 2, 5 )}$
 \end{cfa}
 …
 \begin{cfa}
 struct {
         int f1; §\C{// named field}§
         int f2 : 4; §\C{// named field with bit field size}§
         int : 3; §\C{// unnamed field for basic type with bit field size}§
         int ; §\C{// disallowed, unnamed field}§
         int *; §\C{// disallowed, unnamed field}§
         int (*)( int ); §\C{// disallowed, unnamed field}§
+        int f1; $\C{// named field}$
+        int f2 : 4; $\C{// named field with bit field size}$
+        int : 3; $\C{// unnamed field for basic type with bit field size}$
+        int ; $\C{// disallowed, unnamed field}$
+        int *; $\C{// disallowed, unnamed field}$
+        int (*)( int ); $\C{// disallowed, unnamed field}$
 };
 \end{cfa}
 …
 \begin{cfa}
 struct {
         int , , ; §\C{// 3 unnamed fields}§
+        int , , ; $\C{// 3 unnamed fields}$
+}
 \end{cfa}
 …
 \subsection{Type Nesting}
 \CFA allows \Index{type nesting}, and type qualification of the nested types (see \VRef[Figure]{f:TypeNestingQualification}), where as C hoists\index{type hoisting} (refactors) nested types into the enclosing scope and has no type qualification.
+\CFA allows \Index{type nesting}, and type qualification of the nested types \see{\VRef[Figure]{f:TypeNestingQualification}}, whereas C hoists\index{type hoisting} (refactors) nested types into the enclosing scope and has no type qualification.
 \begin{figure}
 \centering
 …
 int fred() {
         s.t.c = ®S.®R;  // type qualification
         struct ®S.®T t = { ®S.®R, 1, 2 };
         enum ®S.®C c;
         union ®S.T.®U u;
+        s.t.c = @S.@R;  // type qualification
+        struct @S.@T t = { @S.@R, 1, 2 };
+        enum @S.@C c;
+        union @S.T.@U u;
+}
 \end{cfa}
 …
 const unsigned int size = 5;
 int ia[size];
 ... §\C{// assign values to array ia}§
 qsort( ia, size ); §\C{// sort ascending order using builtin ?<?}§
+... $\C{// assign values to array ia}$
+qsort( ia, size ); $\C{// sort ascending order using builtin ?<?}$
+{
         ®int ?<?( int x, int y ) { return x > y; }® §\C{// nested routine}§
         qsort( ia, size ); §\C{// sort descending order by local redefinition}§
+        @int ?<?( int x, int y ) { return x > y; }@ $\C{// nested routine}$
+        qsort( ia, size ); $\C{// sort descending order by local redefinition}$
+}
 \end{cfa}
 …
 The following program is undefined in \CFA (and \Indexc{gcc})
 \begin{cfa}
 [* [int]( int )] foo() { §\C{// int (* foo())( int )}§
         int ®i® = 7;
+[* [int]( int )] foo() { $\C{// int (* foo())( int )}$
+        int @i@ = 7;
         int bar( int p ) {
                 ®i® += 1; §\C{// dependent on local variable}§
                 sout | ®i®;
+                @i@ += 1; $\C{// dependent on local variable}$
+                sout | @i@;
+        }
         return bar; §\C{// undefined because of local dependence}§
+        return bar; $\C{// undefined because of local dependence}$
+}
 int main() {
         * [int]( int ) fp = foo(); §\C{// int (* fp)( int )}§
+        * [int]( int ) fp = foo(); $\C{// int (* fp)( int )}$
         sout | fp( 3 );
+}
 …
 In C and \CFA, lists of elements appear in several contexts, such as the parameter list of a routine call.
 \begin{cfa}
 f( ®2, x, 3 + i® ); §\C{// element list}§
+f( @2, x, 3 + i@ ); $\C{// element list}$
 \end{cfa}
 A list of elements is called a \newterm{tuple}, and is different from a \Index{comma expression}.
 …
 In C and most programming languages, functions return at most one value;
 however, many operations have multiple outcomes, some exceptional (see~\VRef{s:ExceptionHandling}).
+however, many operations have multiple outcomes, some exceptional \see{\VRef{s:ExceptionHandling}}.
 To emulate functions with multiple return values, \emph{\Index{aggregation}} and/or \emph{\Index{aliasing}} is used.
 …
 For example, consider C's \Indexc{div} function, which returns the quotient and remainder for a division of an integer value.
 \begin{cfa}
 typedef struct { int quot, rem; } div_t;        §\C[7cm]{// from include stdlib.h}§
+typedef struct { int quot, rem; } div_t;        $\C[7cm]{// from include stdlib.h}$
 div_t div( int num, int den );
 div_t qr = div( 13, 5 ); §\C{// return quotient/remainder aggregate}§
 printf( "%d %d\n", qr.quot, qr.rem ); §\C{// print quotient/remainder}§
+div_t qr = div( 13, 5 ); $\C{// return quotient/remainder aggregate}$
+printf( "%d %d\n", qr.quot, qr.rem ); $\C{// print quotient/remainder}$
 \end{cfa}
 This approach requires a name for the return type and fields, where \Index{naming} is a common programming-language issue.
 …
 For example, consider C's \Indexc{modf} function, which returns the integral and fractional part of a floating value.
 \begin{cfa}
 double modf( double x, double * i ); §\C{// from include math.h}§
 double intp, frac = modf( 13.5, &intp ); §\C{// return integral and fractional components}§
 printf( "%g %g\n", intp, frac ); §\C{// print integral/fractional components}§
+double modf( double x, double * i ); $\C{// from include math.h}$
+double intp, frac = modf( 13.5, &intp ); $\C{// return integral and fractional components}$
+printf( "%g %g\n", intp, frac ); $\C{// print integral/fractional components}$
 \end{cfa}
 This approach requires allocating storage for the return values, which complicates the call site with a sequence of variable declarations leading to the call.
 …
 When a function call is passed as an argument to another call, the best match of actual arguments to formal parameters is evaluated given all possible expression interpretations in the current scope.
 \begin{cfa}
 void g( int, int ); §\C{// 1}§
 void g( double, double ); §\C{// 2}§
 g( div( 13, 5 ) ); §\C{// select 1}§
 g( modf( 13.5 ) ); §\C{// select 2}§
+void g( int, int ); $\C{// 1}$
+void g( double, double ); $\C{// 2}$
+g( div( 13, 5 ) ); $\C{// select 1}$
+g( modf( 13.5 ) ); $\C{// select 2}$
 \end{cfa}
 In this case, there are two overloaded ©g© routines.
 …
 The previous examples can be rewritten passing the multiple returned-values directly to the ©printf© function call.
 \begin{cfa}
 [ int, int ] div( int x, int y ); §\C{// from include stdlib}§
 printf( "%d %d\n", div( 13, 5 ) ); §\C{// print quotient/remainder}§
 [ double, double ] modf( double x ); §\C{// from include math}§
 printf( "%g %g\n", modf( 13.5 ) ); §\C{// print integral/fractional components}§
+[ int, int ] div( int x, int y ); $\C{// from include stdlib}$
+printf( "%d %d\n", div( 13, 5 ) ); $\C{// print quotient/remainder}$
+[ double, double ] modf( double x ); $\C{// from include math}$
+printf( "%g %g\n", modf( 13.5 ) ); $\C{// print integral/fractional components}$
 \end{cfa}
 This approach provides the benefits of compile-time checking for appropriate return statements as in aggregation, but without the required verbosity of declaring a new named type.
 …
 \begin{cfa}
 int quot, rem;
 [ quot, rem ] = div( 13, 5 ); §\C{// assign multiple variables}§
 printf( "%d %d\n", quot, rem ); §\C{// print quotient/remainder}\CRT§
+[ quot, rem ] = div( 13, 5 ); $\C{// assign multiple variables}$
+printf( "%d %d\n", quot, rem ); $\C{// print quotient/remainder}\CRT$
 \end{cfa}
 Here, the multiple return-values are matched in much the same way as passing multiple return-values to multiple parameters in a call.
 …
 In \CFA, it is possible to overcome this restriction by declaring a \newterm{tuple variable}.
 \begin{cfa}
 [int, int] ®qr® = div( 13, 5 ); §\C{// initialize tuple variable}§
 printf( "%d %d\n", ®qr® ); §\C{// print quotient/remainder}§
+[int, int] @qr@ = div( 13, 5 ); $\C{// initialize tuple variable}$
+printf( "%d %d\n", @qr@ ); $\C{// print quotient/remainder}$
 \end{cfa}
 It is now possible to match the multiple return-values to a single variable, in much the same way as \Index{aggregation}.
 …
 One way to access the individual components of a tuple variable is with assignment.
 \begin{cfa}
 [ quot, rem ] = qr; §\C{// assign multiple variables}§
+[ quot, rem ] = qr; $\C{// assign multiple variables}$
 \end{cfa}
 …
 [int, double] * p;
 int y = x.0; §\C{// access int component of x}§
 y = f().1; §\C{// access int component of f}§
 p->0 = 5; §\C{// access int component of tuple pointed-to by p}§
 g( x.1, x.0 ); §\C{// rearrange x to pass to g}§
 double z = [ x, f() ].0.1; §\C{// access second component of first component of tuple expression}§
+int y = x.0; $\C{// access int component of x}$
+y = f().1; $\C{// access int component of f}$
+p->0 = 5; $\C{// access int component of tuple pointed-to by p}$
+g( x.1, x.0 ); $\C{// rearrange x to pass to g}$
+double z = [ x, f() ].0.1; $\C{// access second component of first component of tuple expression}$
 \end{cfa}
 Tuple-index expressions can occur on any tuple-typed expression, including tuple-returning functions, square-bracketed tuple expressions, and other tuple-index expressions, provided the retrieved component is also a tuple.
 …
 \subsection{Flattening and Structuring}
+\label{s:FlatteningStructuring}
 As evident in previous examples, tuples in \CFA do not have a rigid structure.
 …
 double y;
 [int, double] z;
 [y, x] = 3.14; §\C{// mass assignment}§
 [x, y] = z;                                                         §\C{// multiple assignment}§
 z = 10;                                                         §\C{// mass assignment}§
 z = [x, y]; §\C{// multiple assignment}§
+[y, x] = 3.14; $\C{// mass assignment}$
+[x, y] = z;                                                         $\C{// multiple assignment}$
+z = 10;                                                         $\C{// mass assignment}$
+z = [x, y]; $\C{// multiple assignment}$
 \end{cfa}
 Let $L_i$ for $i$ in $[0, n)$ represent each component of the flattened left side, $R_i$ represent each component of the flattened right side of a multiple assignment, and $R$ represent the right side of a mass assignment.
 …
 \begin{cfa}
 [ int, int ] x, y, z;
 [ x, y ] = z;                                              §\C{// multiple assignment, invalid 4 != 2}§
+[ x, y ] = z;                                              $\C{// multiple assignment, invalid 4 != 2}$
 \end{cfa}
 Multiple assignment assigns $R_i$ to $L_i$ for each $i$.
 …
         double c, d;
         [ void ] f( [ int, int ] );
         f( [ c, a ] = [ b, d ] = 1.5 ); §\C{// assignments in parameter list}§
+        f( [ c, a ] = [ b, d ] = 1.5 ); $\C{// assignments in parameter list}$
 \end{cfa}
 The tuple expression begins with a mass assignment of ©1.5© into ©[b, d]©, which assigns ©1.5© into ©b©, which is truncated to ©1©, and ©1.5© into ©d©, producing the tuple ©[1, 1.5]© as a result.
 …
 \begin{cfa}
 struct S;
 void ?{}(S *); §\C{// (1)}§
 void ?{}(S *, int); §\C{// (2)}§
 void ?{}(S * double); §\C{// (3)}§
 void ?{}(S *, S); §\C{// (4)}§
 [S, S] x = [3, 6.28]; §\C{// uses (2), (3), specialized constructors}§
 [S, S] y; §\C{// uses (1), (1), default constructor}§
 [S, S] z = x.0; §\C{// uses (4), (4), copy constructor}§
+void ?{}(S *); $\C{// (1)}$
+void ?{}(S *, int); $\C{// (2)}$
+void ?{}(S *, double); $\C{// (3)}$
+void ?{}(S *, S); $\C{// (4)}$
+[S, S] x = [3, 6.28]; $\C{// uses (2), (3), specialized constructors}$
+[S, S] y; $\C{// uses (1), (1), default constructor}$
+[S, S] z = x.0; $\C{// uses (4), (4), copy constructor}$
 \end{cfa}
 In this example, ©x© is initialized by the multiple constructor calls ©?{}(&x.0, 3)© and ©?{}(&x.1, 6.28)©, while ©y© is initialized by two default constructor calls ©?{}(&y.0)© and ©?{}(&y.1)©.
 …
 A member-access tuple may be used anywhere a tuple can be used, \eg:
 \begin{cfa}
 s.[ y, z, x ] = [ 3, 3.2, 'x' ]; §\C{// equivalent to s.x = 'x', s.y = 3, s.z = 3.2}§
 f( s.[ y, z ] ); §\C{// equivalent to f( s.y, s.z )}§
+s.[ y, z, x ] = [ 3, 3.2, 'x' ]; $\C{// equivalent to s.x = 'x', s.y = 3, s.z = 3.2}$
+f( s.[ y, z ] ); $\C{// equivalent to f( s.y, s.z )}$
 \end{cfa}
 Note, the fields appearing in a record-field tuple may be specified in any order;
 …
 void f( double, long );
 f( x.[ 0, 3 ] ); §\C{// f( x.0, x.3 )}§
 x.[ 0, 1 ] = x.[ 1, 0 ]; §\C{// [ x.0, x.1 ] = [ x.1, x.0 ]}§
+f( x.[ 0, 3 ] ); $\C{// f( x.0, x.3 )}$
+x.[ 0, 1 ] = x.[ 1, 0 ]; $\C{// [ x.0, x.1 ] = [ x.1, x.0 ]}$
 [ long, int, long ] y = x.[ 2, 0, 2 ];
 \end{cfa}
 …
 \begin{cfa}
 [ int, float, double ] f();
 [ double, float ] x = f().[ 2, 1 ]; §\C{// f() called once}§
+[ double, float ] x = f().[ 2, 1 ]; $\C{// f() called once}$
 \end{cfa}
 …
 That is, a cast can be used to select the type of an expression when it is ambiguous, as in the call to an overloaded function.
 \begin{cfa}
 int f(); §\C{// (1)}§
 double f(); §\C{// (2)}§
 f(); §\C{// ambiguous - (1),(2) both equally viable}§
 (int)f(); §\C{// choose (2)}§
+int f(); $\C{// (1)}$
+double f(); $\C{// (2)}$
+f(); $\C{// ambiguous - (1),(2) both equally viable}$
+(int)f(); $\C{// choose (1)}$
 \end{cfa}
 Since casting is a fundamental operation in \CFA, casts need to be given a meaningful interpretation in the context of tuples.
 …
 void g();
 (void)f(); §\C{// valid, ignore results}§
 (int)g(); §\C{// invalid, void cannot be converted to int}§
+(void)f(); $\C{// valid, ignore results}$
+(int)g(); $\C{// invalid, void cannot be converted to int}$
 struct A { int x; };
 (struct A)f(); §\C{// invalid, int cannot be converted to A}§
+(struct A)f(); $\C{// invalid, int cannot be converted to A}$
 \end{cfa}
 In C, line 4 is a valid cast, which calls ©f© and discards its result.
 …
         [int, [int, int], int] g();
         ([int, double])f(); §\C{// (1) valid}§
         ([int, int, int])g(); §\C{// (2) valid}§
         ([void, [int, int]])g(); §\C{// (3) valid}§
         ([int, int, int, int])g(); §\C{// (4) invalid}§
         ([int, [int, int, int]])g(); §\C{// (5) invalid}§
+        ([int, double])f(); $\C{// (1) valid}$
+        ([int, int, int])g(); $\C{// (2) valid}$
+        ([void, [int, int]])g(); $\C{// (3) valid}$
+        ([int, int, int, int])g(); $\C{// (4) invalid}$
+        ([int, [int, int, int]])g(); $\C{// (5) invalid}$
 \end{cfa}
 …
 void f([int, int], int, int);
 f([0, 0], 0, 0); §\C{// no cost}§
 f(0, 0, 0, 0); §\C{// cost for structuring}§
 f([0, 0,], [0, 0]); §\C{// cost for flattening}§
 f([0, 0, 0], 0); §\C{// cost for flattening and structuring}§
+f([0, 0], 0, 0); $\C{// no cost}$
+f(0, 0, 0, 0); $\C{// cost for structuring}$
+f([0, 0,], [0, 0]); $\C{// cost for flattening}$
+f([0, 0, 0], 0); $\C{// cost for flattening and structuring}$
 \end{cfa}
 …
 The general syntax of a lexical list is:
 \begin{cfa}
 [ §\emph{exprlist}§ ]
+[ $\emph{exprlist}$ ]
 \end{cfa}
 where ©$\emph{exprlist}$© is a list of one or more expressions separated by commas.
 …
 Tuples are permitted to contain sub-tuples (\ie nesting), such as ©[ [ 14, 21 ], 9 ]©, which is a 2-element tuple whose first element is itself a tuple.
 Note, a tuple is not a record (structure);
 a record denotes a single value with substructure, whereas a tuple is multiple values with no substructure (see flattening coercion in Section 12.1).
+a record denotes a single value with substructure, whereas a tuple is multiple values with no substructure \see{flattening coercion in \VRef{s:FlatteningStructuring}}.
 In essence, tuples are largely a compile time phenomenon, having little or no runtime presence.
 …
 The general syntax of a tuple type is:
 \begin{cfa}
 [ §\emph{typelist}§ ]
+[ $\emph{typelist}$ ]
 \end{cfa}
 where ©$\emph{typelist}$© is a list of one or more legal \CFA or C type specifications separated by commas, which may include other tuple type specifications.
 …
 [ unsigned int, char ]
 [ double, double, double ]
 [ * int, int * ] §\C{// mix of CFA and ANSI}§
+[ * int, int * ] $\C{// mix of CFA and ANSI}$
 [ * [ 5 ] int, * * char, * [ [ int, int ] ] (int, int) ]
 \end{cfa}
 …
 Examples of declarations using tuple types are:
 \begin{cfa}
 [ int, int ] x; §\C{// 2 element tuple, each element of type int}§
 * [ char, char ] y; §\C{// pointer to a 2 element tuple}§
+[ int, int ] x; $\C{// 2 element tuple, each element of type int}$
+* [ char, char ] y; $\C{// pointer to a 2 element tuple}$
 [ [ int, int ] ] z ([ int, int ]);
 \end{cfa}
 …
 [ int, int ] w1;
 [ int, int, int ] w2;
 [ void ] f (int, int, int); §\C{// three input parameters of type int}§
 [ void ] g ([ int, int, int ]); §\C{3 element tuple as input}§
+[ void ] f (int, int, int); $\C{// three input parameters of type int}$
+[ void ] g ([ int, int, int ]); $\C{// 3 element tuple as input}$
 f( [ 1, 2, 3 ] );
 f( w1, 3 );
 …
 [ int, int, int, int ] w = [ 1, 2, 3, 4 ];
 int x = 5;
 [ x, w ] = [ w, x ]; §\C{// all four tuple coercions}§
+[ x, w ] = [ w, x ]; $\C{// all four tuple coercions}$
 \end{cfa}
 Starting on the right-hand tuple in the last assignment statement, w is opened, producing a tuple of four values;
 …
 This tuple is then flattened, yielding ©[ 1, 2, 3, 4, 5 ]©, which is structured into ©[ 1, [ 2, 3, 4, 5 ] ]© to match the tuple type of the left-hand side.
 The tuple ©[ 2, 3, 4, 5 ]© is then closed to create a tuple value.
 Finally, ©x© is assigned ©1© and ©w© is assigned the tuple value using multiple assignment (see Section 14).
+Finally, ©x© is assigned ©1© and ©w© is assigned the tuple value using \Index{multiple assignment} \see{\VRef{s:TupleAssignment}}.
 \begin{rationale}
 A possible additional language extension is to use the structuring coercion for tuples to initialize a complex record with a tuple.
 …
 Mass assignment has the following form:
 \begin{cfa}
 [ §\emph{lvalue}§, ... , §\emph{lvalue}§ ] = §\emph{expr}§;
+[ $\emph{lvalue}$, ... , $\emph{lvalue}$ ] = $\emph{expr}$;
 \end{cfa}
 \index{lvalue}
 …
 Multiple assignment has the following form:
 \begin{cfa}
 [ §\emph{lvalue}§, ... , §\emph{lvalue}§ ] = [ §\emph{expr}§, ... , §\emph{expr}§ ];
+[ $\emph{lvalue}$, ... , $\emph{lvalue}$ ] = [ $\emph{expr}$, ... , $\emph{expr}$ ];
 \end{cfa}
 \index{lvalue}
 …
 both these examples produce indeterminate results:
 \begin{cfa}
 f( x++, x++ ); §\C{// C routine call with side effects in arguments}§
 [ v1, v2 ] = [ x++, x++ ]; §\C{// side effects in righthand side of multiple assignment}§
+f( x++, x++ ); $\C{// C routine call with side effects in arguments}$
+[ v1, v2 ] = [ x++, x++ ]; $\C{// side effects in right-hand side of multiple assignment}$
 \end{cfa}
 …
 Cascade assignment has the following form:
 \begin{cfa}
 §\emph{tuple}§ = §\emph{tuple}§ = ... = §\emph{tuple}§;
+$\emph{tuple}$ = $\emph{tuple}$ = ... = $\emph{tuple}$;
 \end{cfa}
 and it has the same parallel semantics as for mass and multiple assignment.
 …
 \begin{cfa}
 int x = 1, y = 2, z = 3;
 sout | x ®|® y ®|® z;
+sout | x @|@ y @|@ z;
 \end{cfa}
+&
 \begin{cfa}
 cout << x ®<< " "® << y ®<< " "® << z << endl;
+cout << x @<< " "@ << y @<< " "@ << z << endl;
 \end{cfa}
+&
 …
 \\
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
 ® ®2® ®3
+@ @2@ @3
 \end{cfa}
+&
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
 ® ®2® ®3
+@ @2@ @3
 \end{cfa}
+&
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
 ® ®2® ®3
+@ @2@ @3
 \end{cfa}
 \end{tabular}
 …
 \begin{cfa}
 [int, [ int, int ] ] t1 = [ 1, [ 2, 3 ] ], t2 = [ 4, [ 5, 6 ] ];
 sout | t1 | t2; §\C{// print tuples}§
+sout | t1 | t2; $\C{// print tuples}$
 \end{cfa}
 \begin{cfa}[showspaces=true,aboveskip=0pt]
 ®, ®2®, ®3 4®, ®5®, ®6
+@, @2@, @3 4@, @5@, @6
 \end{cfa}
 Finally, \CFA uses the logical-or operator for I/O as it is the lowest-priority \emph{overloadable} operator, other than assignment.
 …
+&
 \begin{cfa}
 sout | x * 3 | y + 1 | z << 2 | x == y | ®(®x | y®)® | ®(®x || y®)® | ®(®x > z ? 1 : 2®)®;
+sout | x * 3 | y + 1 | z << 2 | x == y | @(@x | y@)@ | @(@x || y@)@ | @(@x > z ? 1 : 2@)@;
 \end{cfa}
 \\
 …
+&
 \begin{cfa}
 cout << x * 3 << y + 1 << ®(®z << 2®)® << ®(®x == y®)® << ®(®x | y®)® << ®(®x || y®)® << ®(®x > z ? 1 : 2®)® << endl;
+cout << x * 3 << y + 1 << @(@z << 2@)@ << @(@x == y@)@ << @(@x | y@)@ << @(@x || y@)@ << @(@x > z ? 1 : 2@)@ << endl;
 \end{cfa}
 \\
 …
 \\
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
+®1® ®2.5® ®A®
+@1@ @2.5@ @A@
 …
+&
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
+®1® ®2.5® ®A®
+@1@ @2.5@ @A@
 …
+&
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
+®1®
+®2.5®
+®A®
+@1@
+@2.5@
+@A@
 \end{cfa}
 \end{tabular}
 …
 \item
+{\lstset{language=CFA,deletedelim=**[is][]{¢}{¢}}
+A separator does not appear before a C string starting with the (extended) \Index*{ASCII}\index{ASCII!extended} characters: \lstinline[basicstyle=\tt]@,.;!?)]}%¢»@, where \lstinline[basicstyle=\tt]@»@ is a closing citation mark.
+\begin{cfa}[belowskip=0pt]
+A separator does not appear before a C string starting with the (extended) \Index*{ASCII}\index{ASCII!extended} characters: \LstStringStyle{,.;!?)]\}\%\textcent\guillemotright}, where \LstStringStyle{\guillemotright} is a closing citation mark.
+\begin{cfa}
 sout | 1 | ", x" | 2 | ". x" | 3 | "; x" | 4 | "! x" | 5 | "? x" | 6 | "% x"
                 | 7 | "¢ x" | 8 | "» x" | 9 | ") x" | 10 | "] x" | 11 | "} x";
 \end{cfa}
 \begin{cfa}[basicstyle=\tt,showspaces=true,aboveskip=0pt,belowskip=0pt]
 ®,® x 2®.® x 3®;® x 4®!® x 5®?® x 6®%® x 7§\color{red}\textcent§ x 8®»® x 9®)® x 10®]® x 11®}® x
 \end{cfa}}%
 \item
 A separator does not appear after a C string ending with the (extended) \Index*{ASCII}\index{ASCII!extended} characters: \lstinline[mathescape=off,basicstyle=\tt]@([{=$£¥¡¿«@, where \lstinline[basicstyle=\tt]@¡¿@ are inverted opening exclamation and question marks, and \lstinline[basicstyle=\tt]@«@ is an opening citation mark.
+           | 7 | "$\LstStringStyle{\textcent}$ x" | 8 | "$\LstStringStyle{\guillemotright}$ x" | 9 | ") x" | 10 | "] x" | 11 | "} x";
+\end{cfa}
+\begin{cfa}[showspaces=true]
+@,@ x 2@.@ x 3@;@ x 4@!@ x 5@?@ x 6@%@ x 7$\R{\LstStringStyle{\textcent}}$ x 8$\R{\LstStringStyle{\guillemotright}}$ x 9@)@ x 10@]@ x 11@}@ x
+\end{cfa}
+\item
+A separator does not appear after a C string ending with the (extended) \Index*{ASCII}\index{ASCII!extended} characters: \LstStringStyle{([\{=\$\textsterling\textyen\textexclamdown\textquestiondown\guillemotleft}, where \LstStringStyle{\textexclamdown\textquestiondown} are inverted opening exclamation and question marks, and \LstStringStyle{\guillemotleft} is an opening citation mark.
 %$
 \begin{cfa}[mathescape=off]
 sout | "x (" | 1 | "x [" | 2 | "x {" | 3 | "x =" | 4 | "x $" | 5 | "x £" | 6 | "x ¥"
                 | 7 | "x ¡" | 8 | "x ¿" | 9 | "x «" | 10;
+\begin{cfa}
+sout | "x (" | 1 | "x [" | 2 | "x {" | 3 | "x =" | 4 | "x $" | 5 | "x $\LstStringStyle{\textsterling}$" | 6 | "x $\LstStringStyle{\textyen}$"
+           | 7 | "x $\LstStringStyle{\textexclamdown}$" | 8 | "x $\LstStringStyle{\textquestiondown}$" | 9 | "x $\LstStringStyle{\guillemotleft}$" | 10;
 \end{cfa}
 %$
 \begin{cfa}[mathescape=off,basicstyle=\tt,showspaces=true,aboveskip=0pt,belowskip=0pt]
 x ®(®1 x ®[®2 x ®{®3 x ®=®4 x ®$®5 x ®£®6 x ®¥®7 x ®¡®8 x ®¿®9 x ®«®10
+\begin{cfa}[showspaces=true]
+x @(@1 x @[@2 x @{@3 x @=@4 x $\LstStringStyle{\textdollar}$5 x $\R{\LstStringStyle{\textsterling}}$6 x $\R{\LstStringStyle{\textyen}}$7 x $\R{\LstStringStyle{\textexclamdown}}$8 x $\R{\LstStringStyle{\textquestiondown}}$9 x $\R{\LstStringStyle{\guillemotleft}}$10
 \end{cfa}
 %$
 \item
 A separator does not appear before/after a C string starting/ending with the \Index*{ASCII} quote or whitespace characters: \lstinline[basicstyle=\tt,showspaces=true]@`'": \t\v\f\r\n@
 \begin{cfa}[belowskip=0pt]
+A separator does not appear before/after a C string starting/ending with the \Index*{ASCII} quote or whitespace characters: \lstinline[basicstyle=\tt,showspaces=true]{`'": \t\v\f\r\n}
+\begin{cfa}
 sout | "x`" | 1 | "`x'" | 2 | "'x\"" | 3 | "\"x:" | 4 | ":x " | 5 | " x\t" | 6 | "\tx";
 \end{cfa}
 \begin{cfa}[basicstyle=\tt,showspaces=true,showtabs=true,aboveskip=0pt,belowskip=0pt]
 x®`®1®`®x§\color{red}\texttt{'}§2§\color{red}\texttt{'}§x§\color{red}\texttt{"}§3§\color{red}\texttt{"}§x®:®4®:®x® ®5® ®x®      ®6®     ®x
+\begin{cfa}[showspaces=true,showtabs=true]
+x@`@1@`@x$\R{\texttt{'}}$2$\R{\texttt{'}}$x$\R{\texttt{"}}$3$\R{\texttt{"}}$x@:@4@:@x@ @5@ @x@  @6@     @x
 \end{cfa}
 \item
 If a space is desired before or after one of the special string start/end characters, simply insert a space.
 \begin{cfa}[belowskip=0pt]
 sout | "x (§\color{red}\texttt{\textvisiblespace}§" | 1 | "§\color{red}\texttt{\textvisiblespace}§) x" | 2 | "§\color{red}\texttt{\textvisiblespace}§, x" | 3 | "§\color{red}\texttt{\textvisiblespace}§:x:§\color{red}\texttt{\textvisiblespace}§" | 4;
 \end{cfa}
 \begin{cfa}[basicstyle=\tt,showspaces=true,showtabs=true,aboveskip=0pt,belowskip=0pt]
 x (® ®1® ®) x 2® ®, x 3® ®:x:® ®4
+\begin{cfa}
+sout | "x ($\R{\texttt{\textvisiblespace}}$" | 1 | "$\R{\texttt{\textvisiblespace}}$) x" | 2 | "$\R{\texttt{\textvisiblespace}}$, x" | 3 | "$\R{\texttt{\textvisiblespace}}$:x:$\R{\texttt{\textvisiblespace}}$" | 4;
+\end{cfa}
+\begin{cfa}[showspaces=true,showtabs=true]
+x (@ @1@ @) x 2@ @, x 3@ @:x:@ @4
 \end{cfa}
 \end{enumerate}
 …
 \Indexc{sepSet}\index{manipulator!sepSet@©sepSet©} and \Indexc{sep}\index{manipulator!sep@©sep©}/\Indexc{sepGet}\index{manipulator!sepGet@©sepGet©} set and get the separator string.
 The separator string can be at most 16 characters including the ©'\0'© string terminator (15 printable characters).
 \begin{cfa}[mathescape=off,belowskip=0pt]
 sepSet( sout, ", $" ); §\C{// set separator from " " to ", \$"}§
 sout | 1 | 2 | 3 | " \"" | ®sep® | "\"";
+\begin{cfa}[escapechar=off,belowskip=0pt]
+sepSet( sout, ", $" ); $\C{// set separator from " " to ", \$"}$
+sout | 1 | 2 | 3 | " \"" | @sep@ | "\"";
 \end{cfa}
 %$
 \begin{cfa}[mathescape=off,showspaces=true,aboveskip=0pt]
 ®, $®2®, $®3 ®", $"®
+@, $@2@, $@3 @", $"@
 \end{cfa}
 %$
 \begin{cfa}[belowskip=0pt]
 sepSet( sout, " " ); §\C{// reset separator to " "}§
 sout | 1 | 2 | 3 | " \"" | ®sepGet( sout )® | "\"";
+sepSet( sout, " " ); $\C{// reset separator to " "}$
+sout | 1 | 2 | 3 | " \"" | @sepGet( sout )@ | "\"";
 \end{cfa}
 \begin{cfa}[showspaces=true,aboveskip=0pt]
 ® ®2® ®3 ®" "®
+@ @2@ @3 @" "@
 \end{cfa}
 ©sepGet© can be used to store a separator and then restore it:
 \begin{cfa}[belowskip=0pt]
 char store[®sepSize®]; §\C{// sepSize is the maximum separator size}§
 strcpy( store, sepGet( sout ) ); §\C{// copy current separator}§
 sepSet( sout, "_" ); §\C{// change separator to underscore}§
+char store[@sepSize@]; $\C{// sepSize is the maximum separator size}$
+strcpy( store, sepGet( sout ) ); $\C{// copy current separator}$
+sepSet( sout, "_" ); $\C{// change separator to underscore}$
 sout | 1 | 2 | 3;
 \end{cfa}
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
 ®_®2®_®3
+@_@2@_@3
 \end{cfa}
 \begin{cfa}[belowskip=0pt]
 sepSet( sout, store ); §\C{// change separator back to original}§
+sepSet( sout, store ); $\C{// change separator back to original}$
 sout | 1 | 2 | 3;
 \end{cfa}
 \begin{cfa}[showspaces=true,aboveskip=0pt]
 ® ®2® ®3
+@ @2@ @3
 \end{cfa}
 …
 The tuple separator-string can be at most 16 characters including the ©'\0'© string terminator (15 printable characters).
 \begin{cfa}[belowskip=0pt]
 sepSetTuple( sout, " " ); §\C{// set tuple separator from ", " to " "}§
 sout | t1 | t2 | " \"" | ®sepTuple® | "\"";
+sepSetTuple( sout, " " ); $\C{// set tuple separator from ", " to " "}$
+sout | t1 | t2 | " \"" | @sepTuple@ | "\"";
 \end{cfa}
 \begin{cfa}[showspaces=true,aboveskip=0pt]
 2 3 4 5 6 ®" "®
+2 3 4 5 6 @" "@
 \end{cfa}
 \begin{cfa}[belowskip=0pt]
 sepSetTuple( sout, ", " ); §\C{// reset tuple separator to ", "}§
 sout | t1 | t2 | " \"" | ®sepGetTuple( sout )® | "\"";
+sepSetTuple( sout, ", " ); $\C{// reset tuple separator to ", "}$
+sout | t1 | t2 | " \"" | @sepGetTuple( sout )@ | "\"";
 \end{cfa}
 \begin{cfa}[showspaces=true,aboveskip=0pt]
 , 2, 3 4, 5, 6 ®", "®
+, 2, 3 4, 5, 6 @", "@
 \end{cfa}
 As for ©sepGet©, ©sepGetTuple© can be used to store a tuple separator and then restore it.
 …
 \Indexc{sepDisable}\index{manipulator!sepDisable@©sepDisable©} and \Indexc{sepEnable}\index{manipulator!sepEnable@©sepEnable©} toggle printing the separator.
 \begin{cfa}[belowskip=0pt]
 sout | sepDisable | 1 | 2 | 3; §\C{// turn off implicit separator}§
+sout | sepDisable | 1 | 2 | 3; $\C{// turn off implicit separator}$
 \end{cfa}
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
 …
 \end{cfa}
 \begin{cfa}[belowskip=0pt]
 sout | sepEnable | 1 | 2 | 3; §\C{// turn on implicit separator}§
+sout | sepEnable | 1 | 2 | 3; $\C{// turn on implicit separator}$
 \end{cfa}
 \begin{cfa}[mathescape=off,showspaces=true,aboveskip=0pt,belowskip=0pt]
 …
 \Indexc{sepOn}\index{manipulator!sepOn@©sepOn©} and \Indexc{sepOff}\index{manipulator!sepOff@©sepOff©} toggle printing the separator with respect to the next printed item, and then return to the global separator setting.
 \begin{cfa}[belowskip=0pt]
 sout | 1 | sepOff | 2 | 3; §\C{// turn off implicit separator for the next item}§
+sout | 1 | sepOff | 2 | 3; $\C{// turn off implicit separator for the next item}$
 \end{cfa}
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
 …
 \end{cfa}
 \begin{cfa}[belowskip=0pt]
 sout | sepDisable | 1 | sepOn | 2 | 3; §\C{// turn on implicit separator for the next item}§
+sout | sepDisable | 1 | sepOn | 2 | 3; $\C{// turn on implicit separator for the next item}$
 \end{cfa}
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
 …
 The tuple separator also responds to being turned on and off.
 \begin{cfa}[belowskip=0pt]
 sout | t1 | sepOff | t2; §\C{// turn off implicit separator for the next item}§
+sout | t1 | sepOff | t2; $\C{// turn off implicit separator for the next item}$
 \end{cfa}
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
 …
 use ©sep© to accomplish this functionality.
 \begin{cfa}[belowskip=0pt]
 sout | sepOn | 1 | 2 | 3 | sepOn; §\C{// sepOn does nothing at start/end of line}§
+sout | sepOn | 1 | 2 | 3 | sepOn; $\C{// sepOn does nothing at start/end of line}$
 \end{cfa}
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
 …
 \end{cfa}
 \begin{cfa}[belowskip=0pt]
 sout | sep | 1 | 2 | 3 | sep ; §\C{// use sep to print separator at start/end of line}§
+sout | sep | 1 | 2 | 3 | sep ; $\C{// use sep to print separator at start/end of line}$
 \end{cfa}
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
+® ®1 2 3® ®
+@ @1 2 3@ @
 \end{cfa}
 \end{enumerate}
 …
 \begin{enumerate}[parsep=0pt]
 \item
 \Indexc{nl}\index{manipulator!nl@©nl©} scans characters until the next newline character, i.e., ignore the remaining characters in the line.
+\Indexc{nl}\index{manipulator!nl@©nl©} scans characters until the next newline character, \ie ignore the remaining characters in the line.
 \item
 \Indexc{nlOn}\index{manipulator!nlOn@©nlOn©} reads the newline character, when reading single characters.
 …
 For example, in:
 \begin{cfa}
 sin | i | ®nl® | j;
 ®2®
+sin | i | @nl@ | j;
+@2@
 \end{cfa}
 …
 \Indexc{nl}\index{manipulator!nl@©nl©} inserts a newline.
 \begin{cfa}
 sout | nl; §\C{// only print newline}§
 sout | 2; §\C{// implicit newline}§
 sout | 3 | nl | 4 | nl; §\C{// terminating nl merged with implicit newline}§
 sout | 5 | nl | nl; §\C{// again terminating nl merged with implicit newline}§
 sout | 6; §\C{// implicit newline}§
+sout | nl; $\C{// only print newline}$
+sout | 2; $\C{// implicit newline}$
+sout | 3 | nl | 4 | nl; $\C{// terminating nl merged with implicit newline}$
+sout | 5 | nl | nl; $\C{// again terminating nl merged with implicit newline}$
+sout | 6; $\C{// implicit newline}$
 …
 b0 0b11011 0b11011 0b11011 0b11011
 sout | bin( -27HH ) | bin( -27H ) | bin( -27 ) | bin( -27L );
 b11100101 0b1111111111100101 0b11111111111111111111111111100101 0b®(58 1s)®100101
+b11100101 0b1111111111100101 0b11111111111111111111111111100101 0b@(58 1s)@100101
 \end{cfa}
 …
 \begin{cfa}[belowskip=0pt]
 sout | upcase( bin( 27 ) ) | upcase( hex( 27 ) ) | upcase( 27.5e-10 ) | upcase( hex( 27.5 ) );
 ®B®11011 0®X®1®B® 2.75®E®-09 0®X®1.®B®8®P®+4
+@B@11011 0@X@1@B@ 2.75@E@-09 0@X@1.@B@8@P@+4
 \end{cfa}
 …
 \begin{cfa}[belowskip=0pt]
 sout | 0. | nodp( 0. ) | 27.0 | nodp( 27.0 ) | nodp( 27.5 );
 .0 ®0® 27.0 ®27® 27.5
+.0 @0@ 27.0 @27@ 27.5
 \end{cfa}
 …
 \begin{cfa}[belowskip=0pt]
 sout | sign( 27 ) | sign( -27 ) | sign( 27. ) | sign( -27. ) | sign( 27.5 ) | sign( -27.5 );
 ®+®27 -27 ®+®27.0 -27.0 ®+®27.5 -27.5
+@+@27 -27 @+@27.0 -27.0 @+@27.5 -27.5
 \end{cfa}
 …
 \end{cfa}
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
 ®  ®34 ® ®34 34
 ®  ®4.000000 ® ®4.000000 4.000000
 ®  ®ab ® ®ab ab
+@  @34 @ @34 34
+@  @4.000000 @ @4.000000 4.000000
+@  @ab @ @ab ab
 \end{cfa}
 If the value is larger, it is printed without truncation, ignoring the ©minimum©.
 …
 \end{cfa}
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
 ®7® 345®67® 34®567®
 ®.® 345®6.® 34®56.®
 abcd®e® abc®de® ab®cde®
+@7@ 345@67@ 34@567@
+@.@ 345@6.@ 34@56.@
+abcd@e@ abc@de@ ab@cde@
 \end{cfa}
 …
 \end{cfa}
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
  ®0®34     ®00®34 ®00000000®34
+ @0@34     @00@34 @00000000@34
 \end{cfa}
 If the value is larger, it is printed without truncation, ignoring the ©precision©.
 …
 \end{cfa}
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
 ®    ® ®00000000®34
+@    @ @00000000@34
 \end{cfa}
 For floating-point types, ©precision© is the minimum number of digits after the decimal point.
 …
 \end{cfa}
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
 .®500®     27.®5®      28. 27.®50000000®
 \end{cfa}
 For the C-string type, ©precision© is the maximum number of printed characters, so the string is truncared if it exceeds the maximum.
+.@500@     27.@5@      28. 27.@50000000@
+\end{cfa}
+For the C-string type, ©precision© is the maximum number of printed characters, so the string is truncated if it exceeds the maximum.
 \begin{cfa}[belowskip=0pt]
 sout | wd( 6,8, "abcd" ) | wd( 6,8, "abcdefghijk" ) | wd( 6,3, "abcd" );
 …
 \end{cfa}
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
 .567 234.5®7®  234.®6®    23®5®
+.567 234.5@7@  234.@6@    23@5@
 \end{cfa}
 If a value's magnitude is greater than ©significant©, the value is printed in scientific notation with the specified number of significant digits.
 …
 \end{cfa}
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
 . 2.3457®e+05® 2.346®e+05® 2.35®e+05®
+. 2.3457@e+05@ 2.346@e+05@ 2.35@e+05@
 \end{cfa}
 If ©significant© is greater than ©minimum©, it defines the number of printed characters.
 …
 \end{cfa}
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
 ®  ® 27.000000  27.500000  027  27.500®    ®
+@  @ 27.000000  27.500000  027  27.500@    @
 \end{cfa}
 …
 \begin{cfa}[belowskip=0pt]
 sout | pad0( wd( 4, 27 ) ) | pad0( wd( 4,3, 27 ) ) | pad0( wd( 8,3, 27.5 ) );
 ®00®27  ®0®27 ®00®27.500
+@00@27  @0@27 @00@27.500
 \end{cfa}
 \end{enumerate}
 …
 \end{cfa}
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
+®abc   ®
+®abc  ®
+®xx®
+@abc   @
+@abc  @
+@xx@
 \end{cfa}
 …
 \end{cfa}
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
+®abcd1233.456E+2®
+@abcd1233.456E+2@
 \end{cfa}
 Note, input ©wdi© cannot be overloaded with output ©wd© because both have the same parameters but return different types.
 …
 \end{cfa}
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
 ®  -75.35e-4® 25
+@  -75.35e-4@ 25
 \end{cfa}
 …
 \end{cfa}
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
 ®bca®xyz
+@bca@xyz
 \end{cfa}
 …
 \end{cfa}
 \begin{cfa}[showspaces=true,aboveskip=0pt,belowskip=0pt]
 ®xyz®bca
+@xyz@bca
 \end{cfa}
 \end{enumerate}
 …
 A type definition is different from a typedef in C because a typedef just creates an alias for a type, while Do's type definition creates a distinct type.
 This means that users can define distinct function overloads for the new type (see Overloading for more information).
+This means that users can define distinct function overloads for the new type \see{\VRef{s:Overloading} for more information}.
 For example:
 …
 \CFA supports C initialization of structures, but it also adds constructors for more advanced initialization.
 Additionally, \CFA adds destructors that are called when a variable is deallocated (variable goes out of scope or object is deleted).
 These functions take a reference to the structure as a parameter (see References for more information).
+These functions take a reference to the structure as a parameter \see{\VRef{s:PointerReference} for more information}.
 \begin{figure}
 …
 \section{Overloading}
+\label{s:Overloading}
 Overloading refers to the capability of a programmer to define and use multiple objects in a program with the same name.
 …
 \subsection{Overloaded Constant}
+\subsection{Constant}
 The constants 0 and 1 have special meaning.
 …
+\subsection{Variable Overloading}
+\subsection{Variable}
+\label{s:VariableOverload}
 The overload rules of \CFA allow a programmer to define multiple variables with the same name, but different types.
 …
 \subsection{Operator Overloading}
+\subsection{Operator}
 \CFA also allows operators to be overloaded, to simplify the use of user-defined types.
 …
 For example, given
 \begin{cfa}
 auto j = ®...®
+auto j = @...@
 \end{cfa}
 and the need to write a routine to compute using ©j©
 \begin{cfa}
 void rtn( ®...® parm );
+void rtn( @...@ parm );
 rtn( j );
 \end{cfa}
 …
 coroutine Fibonacci {
         int fn; §\C{// used for communication}§
+        int fn; $\C{// used for communication}$
 };
 void ?{}( Fibonacci * this ) {
 …
+}
 void main( Fibonacci * this ) {
         int fn1, fn2; §\C{// retained between resumes}§
         this->fn = 0; §\C{// case 0}§
+        int fn1, fn2; $\C{// retained between resumes}$
+        this->fn = 0; $\C{// case 0}$
         fn1 = this->fn;
         suspend(); §\C{// return to last resume}§
         this->fn = 1; §\C{// case 1}§
+        suspend(); $\C{// return to last resume}$
+        this->fn = 1; $\C{// case 1}$
         fn2 = fn1;
         fn1 = this->fn;
         suspend(); §\C{// return to last resume}§
         for ( ;; ) { §\C{// general case}§
+        suspend(); $\C{// return to last resume}$
+        for ( ;; ) { $\C{// general case}$
                 this->fn = fn1 + fn2;
                 fn2 = fn1;
                 fn1 = this->fn;
                 suspend(); §\C{// return to last resume}§
+                suspend(); $\C{// return to last resume}$
         } // for
+}
 int next( Fibonacci * this ) {
         resume( this ); §\C{// transfer to last suspend}§
+        resume( this ); $\C{// transfer to last suspend}$
         return this->fn;
+}
 …
 When building a \CFA module which needs to be callable from C code, users can use the tools to generate a header file suitable for including in these C files with all of the needed declarations.
 In order to interoperate with existing C code, \CFA files can still include header files, the contents of which will be enclosed in a C linkage section to indicate C calling conventions (see Interoperability for more information).
+In order to interoperate with existing C code, \CFA files can still include header files, the contents of which will be enclosed in a C linkage section to indicate C calling conventions \see{\VRef{s:Interoperability} for more information}.
 …
 \end{cfa}
+&
 \begin{lstlisting}[language=C++]
+\begin{C++}
 class Line {
         float lnth;
 …
 Line line1;
 Line line2( 3.4 );
 \end{lstlisting}
+\end{C++}
+&
 \begin{lstlisting}[language=Golang]
 …
 In \CFA, there are ambiguous cases with dereference and operator identifiers, \eg ©int *?*?()©, where the string ©*?*?© can be interpreted as:
 \begin{cfa}
 *?§\color{red}\textvisiblespace§*? §\C{// dereference operator, dereference operator}§
 *§\color{red}\textvisiblespace§?*? §\C{// dereference, multiplication operator}§
+*?$\R{\textvisiblespace}$*? $\C{// dereference operator, dereference operator}$
+*$\R{\textvisiblespace}$?*? $\C{// dereference, multiplication operator}$
 \end{cfa}
 By default, the first interpretation is selected, which does not yield a meaningful parse.
 …
 The ambiguity occurs when the dereference operator has no parameters:
 \begin{cfa}
 *?()§\color{red}\textvisiblespace...§ ;
 *?()§\color{red}\textvisiblespace...§(...) ;
+*?()$\R{\textvisiblespace...}$ ;
+*?()$\R{\textvisiblespace...}$(...) ;
 \end{cfa}
 requiring arbitrary whitespace look-ahead for the routine-call parameter-list to disambiguate.
 …
 The remaining cases are with the increment/decrement operators and conditional expression, \eg:
 \begin{cfa}
 i++?§\color{red}\textvisiblespace...§(...);
 i?++§\color{red}\textvisiblespace...§(...);
+i++?$\R{\textvisiblespace...}$(...);
+i?++$\R{\textvisiblespace...}$(...);
 \end{cfa}
 requiring arbitrary whitespace look-ahead for the operator parameter-list, even though that interpretation is an incorrect expression (juxtaposed identifiers).
 Therefore, it is necessary to disambiguate these cases with a space:
 \begin{cfa}
 i++§\color{red}\textvisiblespace§? i : 0;
 i?§\color{red}\textvisiblespace§++i : 0;
+i++$\R{\textvisiblespace}$? i : 0;
+i?$\R{\textvisiblespace}$++i : 0;
 \end{cfa}
 …
 \begin{description}
 \item[Change:] add new keywords \\
 New keywords are added to \CFA (see~\VRef{s:CFAKeywords}).
+New keywords are added to \CFA \see{\VRef{s:CFAKeywords}}.
 \item[Rationale:] keywords added to implement new semantics of \CFA.
 \item[Effect on original feature:] change to semantics of well-defined feature. \\
 Any \Celeven programs using these keywords as identifiers are invalid \CFA programs.
 \item[Difficulty of converting:] keyword clashes are accommodated by syntactic transformations using the \CFA backquote escape-mechanism (see~\VRef{s:BackquoteIdentifiers}).
+\item[Difficulty of converting:] keyword clashes are accommodated by syntactic transformations using the \CFA backquote escape-mechanism \see{\VRef{s:BackquoteIdentifiers}}.
 \item[How widely used:] clashes among new \CFA keywords and existing identifiers are rare.
 \end{description}
 …
 \eg:
 \begin{cfa}
 x; §\C{// int x}§
 *y; §\C{// int *y}§
 f( p1, p2 ); §\C{// int f( int p1, int p2 );}§
 g( p1, p2 ) int p1, p2; §\C{// int g( int p1, int p2 );}§
+x; $\C{// int x}$
+*y; $\C{// int *y}$
+f( p1, p2 ); $\C{// int f( int p1, int p2 );}$
+g( p1, p2 ) int p1, p2; $\C{// int g( int p1, int p2 );}$
 \end{cfa}
 \CFA continues to support K\&R routine definitions:
 \begin{cfa}
 f( a, b, c ) §\C{// default int return}§
         int a, b; char c §\C{// K\&R parameter declarations}§
+f( a, b, c ) $\C{// default int return}$
+        int a, b; char c $\C{// K\&R parameter declarations}$
+{
         ...
 …
 int rtn( int i );
 int rtn( char c );
 rtn( 'x' ); §\C{// programmer expects 2nd rtn to be called}§
+rtn( 'x' ); $\C{// programmer expects 2nd rtn to be called}$
 \end{cfa}
 \item[Rationale:] it is more intuitive for the call to ©rtn© to match the second definition of ©rtn© rather than the first.
 …
 \item[Change:] make string literals ©const©:
 \begin{cfa}
 char * p = "abc"; §\C{// valid in C, deprecated in \CFA}§
 char * q = expr ? "abc" : "de"; §\C{// valid in C, invalid in \CFA}§
+char * p = "abc"; $\C{// valid in C, deprecated in \CFA}$
+char * q = expr ? "abc" : "de"; $\C{// valid in C, invalid in \CFA}$
 \end{cfa}
 The type of a string literal is changed from ©[] char© to ©const [] char©.
 …
 \begin{cfa}
 char * p = "abc";
 p[0] = 'w'; §\C{// segment fault or change constant literal}§
+p[0] = 'w'; $\C{// segment fault or change constant literal}$
 \end{cfa}
 The same problem occurs when passing a string literal to a routine that changes its argument.
 …
 \item[Change:] remove \newterm{tentative definitions}, which only occurs at file scope:
 \begin{cfa}
 int i; §\C{// forward definition}§
 int *j = ®&i®; §\C{// forward reference, valid in C, invalid in \CFA}§
 int i = 0; §\C{// definition}§
+int i; $\C{// forward definition}$
+int *j = @&i@; $\C{// forward reference, valid in C, invalid in \CFA}$
+int i = 0; $\C{// definition}$
 \end{cfa}
 is valid in C, and invalid in \CFA because duplicate overloaded object definitions at the same scope level are disallowed.
 …
 \begin{cfa}
 struct X { int i; struct X *next; };
 static struct X a; §\C{// forward definition}§
 static struct X b = { 0, ®&a® };§\C{// forward reference, valid in C, invalid in \CFA}§
 static struct X a = { 1, &b }; §\C{// definition}§
+static struct X a; $\C{// forward definition}$
+static struct X b = { 0, @&a@ };$\C{// forward reference, valid in C, invalid in \CFA}$
+static struct X a = { 1, &b }; $\C{// definition}$
 \end{cfa}
 \item[Rationale:] avoids having different initialization rules for builtin types and user-defined types.
 …
 \item[Change:] have ©struct© introduce a scope for nested types:
 \begin{cfa}
 enum ®Colour® { R, G, B, Y, C, M };
+enum @Colour@ { R, G, B, Y, C, M };
 struct Person {
         enum ®Colour® { R, G, B };      §\C[7cm]{// nested type}§
         struct Face { §\C{// nested type}§
                 ®Colour® Eyes, Hair; §\C{// type defined outside (1 level)}§
+        enum @Colour@ { R, G, B };      $\C[7cm]{// nested type}$
+        struct Face { $\C{// nested type}$
+                @Colour@ Eyes, Hair; $\C{// type defined outside (1 level)}$
         };
         ®.Colour® shirt; §\C{// type defined outside (top level)}§
         ®Colour® pants; §\C{// type defined same level}§
         Face looks[10]; §\C{// type defined same level}§
+        @.Colour@ shirt; $\C{// type defined outside (top level)}$
+        @Colour@ pants; $\C{// type defined same level}$
+        Face looks[10]; $\C{// type defined same level}$
 };
+®Colour® c = R; §\C{// type/enum defined same level}§
 Person®.Colour® pc = Person®.®R;§\C{// type/enum defined inside}§
 Person®.®Face pretty; §\C{// type defined inside}\CRT§
+@Colour@ c = R; $\C{// type/enum defined same level}$
+Person@.Colour@ pc = Person@.@R;$\C{// type/enum defined inside}$
+Person@.@Face pretty; $\C{// type defined inside}\CRT$
 \end{cfa}
 In C, the names of nested types belong to the same scope as the name of the outermost enclosing structure, \ie the nested types are hoisted to the scope of the outermost type, which is confusing and not useful.
 …
 \item[Difficulty of converting:] Semantic transformation. To make the struct type name visible in the scope of the enclosing struct, the struct tag could be declared in the scope of the enclosing struct, before the enclosing struct is defined. Example:
 \begin{cfa}
 struct Y; §\C{// struct Y and struct X are at the same scope}§
+struct Y; $\C{// struct Y and struct X are at the same scope}$
 struct X {
         struct Y { /* ... */ } y;
 …
 \begin{cfa}
 void foo() {
         int * b = malloc( sizeof(int) ); §\C{// implicitly convert void * to int *}§
         char * c = b; §\C{// implicitly convert int * to void *, and then void * to char *}§
+        int * b = malloc( sizeof(int) ); $\C{// implicitly convert void * to int *}$
+        char * c = b; $\C{// implicitly convert int * to void *, and then void * to char *}$
+}
 \end{cfa}
 \item[Rationale:] increase type safety
 \item[Effect on original feature:] deletion of semantically well-defined feature.
 \item[Difficulty of converting:] requires adding a cast (see \VRef{s:StorageManagement} for better alternatives):
+\item[Difficulty of converting:] requires adding a cast \see{\VRef{s:StorageManagement} for better alternatives}:
 \begin{cfa}
         int * b = (int *)malloc( sizeof(int) );
 …
 \end{cquote}
 For the prescribed header-files, \CFA uses header interposition to wrap these includes in an ©extern "C"©;
 hence, names in these include files are not mangled\index{mangling!name} (see~\VRef{s:Interoperability}).
+hence, names in these include files are not mangled\index{mangling!name} \see{\VRef{s:Interoperability}}.
 All other C header files must be explicitly wrapped in ©extern "C"© to prevent name mangling.
 This approach is different from \Index*[C++]{\CC{}} where the name-mangling issue is handled internally in C header-files through checks for preprocessor variable ©__cplusplus©, which add appropriate ©extern "C"© qualifiers.
 …
 Type-safe allocation is provided for all C allocation routines and new \CFA allocation routines, \eg in
 \begin{cfa}
 int * ip = (int *)malloc( sizeof(int) );                §\C{// C}§
 int * ip = malloc();                                                    §\C{// \CFA type-safe version of C malloc}§
 int * ip = alloc();                                                             §\C{// \CFA type-safe uniform alloc}§
+int * ip = (int *)malloc( sizeof(int) );                $\C{// C}$
+int * ip = malloc();                                                    $\C{// \CFA type-safe version of C malloc}$
+int * ip = alloc();                                                             $\C{// \CFA type-safe uniform alloc}$
 \end{cfa}
 the latter two allocations determine the allocation size from the type pointed to by ©ip© (©int©) and cast the pointer to the allocated storage to ©int *©.
 …
 \begin{cfa}
 struct S { int i; } __attribute__(( aligned( 128 ) )); // cache-line alignment
 S * sp = malloc();                                                              §\C{// honour type alignment}§
+S * sp = malloc();                                                              $\C{// honour type alignment}$
 \end{cfa}
 the storage allocation is implicitly aligned to 128 rather than the default 16.
 …
 \CFA memory management extends allocation to support constructors for initialization of allocated storage, \eg in
 \begin{cfa}
 struct S { int i; };                                                    §\C{// cache-line aglinment}§
+struct S { int i; };                                                    $\C{// cache-line alignment}$
 void ?{}( S & s, int i ) { s.i = i; }
 // assume ?|? operator for printing an S
 S & sp = *®new®( 3 );                                                   §\C{// call constructor after allocation}§
+S & sp = *@new@( 3 );                                                   $\C{// call constructor after allocation}$
 sout | sp.i;
 ®delete®( &sp );
 S * spa = ®anew®( 10, 5 );                                              §\C{// allocate array and initialize each array element}§
+@delete@( &sp );
+S * spa = @anew@( 10, 5 );                                              $\C{// allocate array and initialize each array element}$
 for ( i; 10 ) sout | spa[i] | nonl;
 sout | nl;
 ®adelete®( 10, spa );
+@adelete@( 10, spa );
 \end{cfa}
 Allocation routines ©new©/©anew© allocate a variable/array and initialize storage using the allocated type's constructor.
 …
 extern "C" {
         // C unsafe allocation
         void * malloc( size_t size );§\indexc{malloc}§
         void * calloc( size_t dim, size_t size );§\indexc{calloc}§
         void * realloc( void * ptr, size_t size );§\indexc{realloc}§
         void * memalign( size_t align, size_t size );§\indexc{memalign}§
         void * aligned_alloc( size_t align, size_t size );§\indexc{aligned_alloc}§
         int posix_memalign( void ** ptr, size_t align, size_t size );§\indexc{posix_memalign}§
         void * cmemalign( size_t alignment, size_t noOfElems, size_t elemSize );§\indexc{cmemalign}§ // CFA
+        void * malloc( size_t size );$\indexc{malloc}$
+        void * calloc( size_t dim, size_t size );$\indexc{calloc}$
+        void * realloc( void * ptr, size_t size );$\indexc{realloc}$
+        void * memalign( size_t align, size_t size );$\indexc{memalign}$
+        void * aligned_alloc( size_t align, size_t size );$\indexc{aligned_alloc}$
+        int posix_memalign( void ** ptr, size_t align, size_t size );$\indexc{posix_memalign}$
+        void * cmemalign( size_t alignment, size_t noOfElems, size_t elemSize );$\indexc{cmemalign}$ // CFA
         // C unsafe initialization/copy
         void * memset( void * dest, int c, size_t size );§\indexc{memset}§
         void * memcpy( void * dest, const void * src, size_t size );§\indexc{memcpy}§
+        void * memset( void * dest, int c, size_t size );$\indexc{memset}$
+        void * memcpy( void * dest, const void * src, size_t size );$\indexc{memcpy}$
+}
 …
 forall( dtype T | sized(T) ) {
         // §\CFA§ safe equivalents, i.e., implicit size specification
+        // $\CFA$ safe equivalents, i.e., implicit size specification
         T * malloc( void );
         T * calloc( size_t dim );
 …
         int posix_memalign( T ** ptr, size_t align );
         // §\CFA§ safe general allocation, fill, resize, alignment, array
         T * alloc( void );§\indexc{alloc}§                                      §\C[3.5in]{// variable, T size}§
         T * alloc( size_t dim );                                                        §\C{// array[dim], T size elements}§
         T * alloc( T ptr[], size_t dim );                                       §\C{// realloc array[dim], T size elements}§
         T * alloc_set( char fill );§\indexc{alloc_set}§         §\C{// variable, T size, fill bytes with value}§
         T * alloc_set( T fill );                                                        §\C{// variable, T size, fill with value}§
         T * alloc_set( size_t dim, char fill );                         §\C{// array[dim], T size elements, fill bytes with value}§
         T * alloc_set( size_t dim, T fill );                            §\C{// array[dim], T size elements, fill elements with value}§
         T * alloc_set( size_t dim, const T fill[] );            §\C{// array[dim], T size elements, fill elements with array}§
         T * alloc_set( T ptr[], size_t dim, char fill );        §\C{// realloc array[dim], T size elements, fill bytes with value}§
         T * alloc_align( size_t align );                                        §\C{// aligned variable, T size}§
         T * alloc_align( size_t align, size_t dim );            §\C{// aligned array[dim], T size elements}§
         T * alloc_align( T ptr[], size_t align );                       §\C{// realloc new aligned array}§
         T * alloc_align( T ptr[], size_t align, size_t dim ); §\C{// realloc new aligned array[dim]}§
         T * alloc_align_set( size_t align, char fill );         §\C{// aligned variable, T size, fill bytes with value}§
         T * alloc_align_set( size_t align, T fill );            §\C{// aligned variable, T size, fill with value}§
         T * alloc_align_set( size_t align, size_t dim, char fill ); §\C{// aligned array[dim], T size elements, fill bytes with value}§
         T * alloc_align_set( size_t align, size_t dim, T fill ); §\C{// aligned array[dim], T size elements, fill elements with value}§
         T * alloc_align_set( size_t align, size_t dim, const T fill[] ); §\C{// aligned array[dim], T size elements, fill elements with array}§
         T * alloc_align_set( T ptr[], size_t align, size_t dim, char fill ); §\C{// realloc new aligned array[dim], fill new bytes with value}§
         // §\CFA§ safe initialization/copy, i.e., implicit size specification
         T * memset( T * dest, char fill );§\indexc{memset}§
         T * memcpy( T * dest, const T * src );§\indexc{memcpy}§
         // §\CFA§ safe initialization/copy, i.e., implicit size specification, array types
+        // $\CFA$ safe general allocation, fill, resize, alignment, array
+        T * alloc( void );$\indexc{alloc}$                                      $\C[3.5in]{// variable, T size}$
+        T * alloc( size_t dim );                                                        $\C{// array[dim], T size elements}$
+        T * alloc( T ptr[], size_t dim );                                       $\C{// realloc array[dim], T size elements}$
+        T * alloc_set( char fill );$\indexc{alloc_set}$         $\C{// variable, T size, fill bytes with value}$
+        T * alloc_set( T fill );                                                        $\C{// variable, T size, fill with value}$
+        T * alloc_set( size_t dim, char fill );                         $\C{// array[dim], T size elements, fill bytes with value}$
+        T * alloc_set( size_t dim, T fill );                            $\C{// array[dim], T size elements, fill elements with value}$
+        T * alloc_set( size_t dim, const T fill[] );            $\C{// array[dim], T size elements, fill elements with array}$
+        T * alloc_set( T ptr[], size_t dim, char fill );        $\C{// realloc array[dim], T size elements, fill bytes with value}$
+        T * alloc_align( size_t align );                                        $\C{// aligned variable, T size}$
+        T * alloc_align( size_t align, size_t dim );            $\C{// aligned array[dim], T size elements}$
+        T * alloc_align( T ptr[], size_t align );                       $\C{// realloc new aligned array}$
+        T * alloc_align( T ptr[], size_t align, size_t dim ); $\C{// realloc new aligned array[dim]}$
+        T * alloc_align_set( size_t align, char fill );         $\C{// aligned variable, T size, fill bytes with value}$
+        T * alloc_align_set( size_t align, T fill );            $\C{// aligned variable, T size, fill with value}$
+        T * alloc_align_set( size_t align, size_t dim, char fill ); $\C{// aligned array[dim], T size elements, fill bytes with value}$
+        T * alloc_align_set( size_t align, size_t dim, T fill ); $\C{// aligned array[dim], T size elements, fill elements with value}$
+        T * alloc_align_set( size_t align, size_t dim, const T fill[] ); $\C{// aligned array[dim], T size elements, fill elements with array}$
+        T * alloc_align_set( T ptr[], size_t align, size_t dim, char fill ); $\C{// realloc new aligned array[dim], fill new bytes with value}$
+        // $\CFA$ safe initialization/copy, i.e., implicit size specification
+        T * memset( T * dest, char fill );$\indexc{memset}$
+        T * memcpy( T * dest, const T * src );$\indexc{memcpy}$
+        // $\CFA$ safe initialization/copy, i.e., implicit size specification, array types
         T * amemset( T dest[], char fill, size_t dim );
         T * amemcpy( T dest[], const T src[], size_t dim );
+}
 // §\CFA§ allocation/deallocation and constructor/destructor, non-array types
 forall( dtype T | sized(T), ttype Params | { void ?{}( T &, Params ); } ) T * new( Params p );§\indexc{new}§
 forall( dtype T | sized(T) | { void ^?{}( T & ); } ) void delete( T * ptr );§\indexc{delete}§
+// $\CFA$ allocation/deallocation and constructor/destructor, non-array types
+forall( dtype T | sized(T), ttype Params | { void ?{}( T &, Params ); } ) T * new( Params p );$\indexc{new}$
+forall( dtype T | sized(T) | { void ^?{}( T & ); } ) void delete( T * ptr );$\indexc{delete}$
 forall( dtype T, ttype Params | sized(T) | { void ^?{}( T & ); void delete( Params ); } )
   void delete( T * ptr, Params rest );
 // §\CFA§ allocation/deallocation and constructor/destructor, array types
 forall( dtype T | sized(T), ttype Params | { void ?{}( T &, Params ); } ) T * anew( size_t dim, Params p );§\indexc{anew}§
 forall( dtype T | sized(T) | { void ^?{}( T & ); } ) void adelete( size_t dim, T arr[] );§\indexc{adelete}§
+// $\CFA$ allocation/deallocation and constructor/destructor, array types
+forall( dtype T | sized(T), ttype Params | { void ?{}( T &, Params ); } ) T * anew( size_t dim, Params p );$\indexc{anew}$
+forall( dtype T | sized(T) | { void ^?{}( T & ); } ) void adelete( size_t dim, T arr[] );$\indexc{adelete}$
 forall( dtype T | sized(T) | { void ^?{}( T & ); }, ttype Params | { void adelete( Params ); } )
   void adelete( size_t dim, T arr[], Params rest );
 …
 \leavevmode
 \begin{cfa}[aboveskip=0pt,belowskip=0pt]
 int ato( const char * ptr );§\indexc{ato}§
+int ato( const char * ptr );$\indexc{ato}$
 unsigned int ato( const char * ptr );
 long int ato( const char * ptr );
 …
 \leavevmode
 \begin{cfa}[aboveskip=0pt,belowskip=0pt]
 forall( otype T | { int ?<?( T, T ); } ) §\C{// location}§
 T * bsearch( T key, const T * arr, size_t dim );§\indexc{bsearch}§
 forall( otype T | { int ?<?( T, T ); } ) §\C{// position}§
+forall( otype T | { int ?<?( T, T ); } ) $\C{// location}$
+T * bsearch( T key, const T * arr, size_t dim );$\indexc{bsearch}$
+forall( otype T | { int ?<?( T, T ); } ) $\C{// position}$
 unsigned int bsearch( T key, const T * arr, size_t dim );
 forall( otype T | { int ?<?( T, T ); } )
 void qsort( const T * arr, size_t dim );§\indexc{qsort}§
+void qsort( const T * arr, size_t dim );$\indexc{qsort}$
 forall( otype E | { int ?<?( E, E ); } ) {
         E * bsearch( E key, const E * vals, size_t dim );§\indexc{bsearch}§ §\C{// location}§
         size_t bsearch( E key, const E * vals, size_t dim );§\C{// position}§
         E * bsearchl( E key, const E * vals, size_t dim );§\indexc{bsearchl}§
+        E * bsearch( E key, const E * vals, size_t dim );$\indexc{bsearch}$ $\C{// location}$
+        size_t bsearch( E key, const E * vals, size_t dim );$\C{// position}$
+        E * bsearchl( E key, const E * vals, size_t dim );$\indexc{bsearchl}$
         size_t bsearchl( E key, const E * vals, size_t dim );
         E * bsearchu( E key, const E * vals, size_t dim );§\indexc{bsearchu}§
+        E * bsearchu( E key, const E * vals, size_t dim );$\indexc{bsearchu}$
         size_t bsearchu( E key, const E * vals, size_t dim );
+}
 …
 forall( otype E | { int ?<?( E, E ); } ) {
         void qsort( E * vals, size_t dim );§\indexc{qsort}§
+        void qsort( E * vals, size_t dim );$\indexc{qsort}$
+}
 \end{cfa}
 …
 \leavevmode
 \begin{cfa}[aboveskip=0pt,belowskip=0pt]
 unsigned char abs( signed char );§\indexc{abs}§
+unsigned char abs( signed char );$\indexc{abs}$
 int abs( int );
 unsigned long int abs( long int );
 …
 \leavevmode
 \begin{cfa}[aboveskip=0pt,belowskip=0pt]
 void srandom( unsigned int seed );§\indexc{srandom}§
 char random( void );§\indexc{random}§
 char random( char u ); §\C{// [0,u)}§
 char random( char l, char u ); §\C{// [l,u)}§
+void srandom( unsigned int seed );$\indexc{srandom}$
+char random( void );$\indexc{random}$
+char random( char u ); $\C{// [0,u)}$
+char random( char l, char u ); $\C{// [l,u)}$
 int random( void );
 int random( int u ); §\C{// [0,u)}§
 int random( int l, int u ); §\C{// [l,u)}§
+int random( int u ); $\C{// [0,u)}$
+int random( int l, int u ); $\C{// [l,u)}$
 unsigned int random( void );
 unsigned int random( unsigned int u ); §\C{// [0,u)}§
 unsigned int random( unsigned int l, unsigned int u ); §\C{// [l,u)}§
+unsigned int random( unsigned int u ); $\C{// [0,u)}$
+unsigned int random( unsigned int l, unsigned int u ); $\C{// [l,u)}$
 long int random( void );
 long int random( long int u ); §\C{// [0,u)}§
 long int random( long int l, long int u ); §\C{// [l,u)}§
+long int random( long int u ); $\C{// [0,u)}$
+long int random( long int l, long int u ); $\C{// [l,u)}$
 unsigned long int random( void );
 unsigned long int random( unsigned long int u ); §\C{// [0,u)}§
 unsigned long int random( unsigned long int l, unsigned long int u ); §\C{// [l,u)}§
 float random( void );                                            §\C{// [0.0, 1.0)}§
 double random( void );                                           §\C{// [0.0, 1.0)}§
 float _Complex random( void );                           §\C{// [0.0, 1.0)+[0.0, 1.0)i}§
 double _Complex random( void );                          §\C{// [0.0, 1.0)+[0.0, 1.0)i}§
 long double _Complex random( void );             §\C{// [0.0, 1.0)+[0.0, 1.0)i}§
+unsigned long int random( unsigned long int u ); $\C{// [0,u)}$
+unsigned long int random( unsigned long int l, unsigned long int u ); $\C{// [l,u)}$
+float random( void );                                            $\C{// [0.0, 1.0)}$
+double random( void );                                           $\C{// [0.0, 1.0)}$
+float _Complex random( void );                           $\C{// [0.0, 1.0)+[0.0, 1.0)i}$
+double _Complex random( void );                          $\C{// [0.0, 1.0)+[0.0, 1.0)i}$
+long double _Complex random( void );             $\C{// [0.0, 1.0)+[0.0, 1.0)i}$
 \end{cfa}
 …
 \leavevmode
 \begin{cfa}[aboveskip=0pt,belowskip=0pt]
 forall( otype T | { int ?<?( T, T ); } ) T min( T t1, T t2 );§\indexc{min}§
 forall( otype T | { int ?>?( T, T ); } ) T max( T t1, T t2 );§\indexc{max}§
 forall( otype T | { T min( T, T ); T max( T, T ); } ) T clamp( T value, T min_val, T max_val );§\indexc{clamp}§
 forall( otype T ) void swap( T * t1, T * t2 );§\indexc{swap}§
+forall( otype T | { int ?<?( T, T ); } ) T min( T t1, T t2 );$\indexc{min}$
+forall( otype T | { int ?>?( T, T ); } ) T max( T t1, T t2 );$\indexc{max}$
+forall( otype T | { T min( T, T ); T max( T, T ); } ) T clamp( T value, T min_val, T max_val );$\indexc{clamp}$
+forall( otype T ) void swap( T * t1, T * t2 );$\indexc{swap}$
 \end{cfa}
 …
 \leavevmode
 \begin{cfa}[aboveskip=0pt,belowskip=0pt]
 float ?%?( float, float );§\indexc{fmod}§
+float ?%?( float, float );$\indexc{fmod}$
 float fmod( float, float );
 double ?%?( double, double );
 …
 long double fmod( long double, long double );
 float remainder( float, float );§\indexc{remainder}§
+float remainder( float, float );$\indexc{remainder}$
 double remainder( double, double );
 long double remainder( long double, long double );
 float remquo( float, float, int * );§\indexc{remquo}§
+float remquo( float, float, int * );$\indexc{remquo}$
 double remquo( double, double, int * );
 long double remquo( long double, long double, int * );
 …
 [ int, long double ] remquo( long double, long double );
 float div( float, float, int * );§\indexc{div}§ §\C{// alternative name for remquo}§
+float div( float, float, int * );$\indexc{div}$ $\C{// alternative name for remquo}$
 double div( double, double, int * );
 long double div( long double, long double, int * );
 …
 [ int, long double ] div( long double, long double );
 float fma( float, float, float );§\indexc{fma}§
+float fma( float, float, float );$\indexc{fma}$
 double fma( double, double, double );
 long double fma( long double, long double, long double );
 float fdim( float, float );§\indexc{fdim}§
+float fdim( float, float );$\indexc{fdim}$
 double fdim( double, double );
 long double fdim( long double, long double );
 float nan( const char * );§\indexc{nan}§
+float nan( const char * );$\indexc{nan}$
 double nan( const char * );
 long double nan( const char * );
 …
 \leavevmode
 \begin{cfa}[aboveskip=0pt,belowskip=0pt]
 float exp( float );§\indexc{exp}§
+float exp( float );$\indexc{exp}$
 double exp( double );
 long double exp( long double );
 …
 long double _Complex exp( long double _Complex );
 float exp2( float );§\indexc{exp2}§
+float exp2( float );$\indexc{exp2}$
 double exp2( double );
 long double exp2( long double );
 …
 // long double _Complex exp2( long double _Complex );
 float expm1( float );§\indexc{expm1}§
+float expm1( float );$\indexc{expm1}$
 double expm1( double );
 long double expm1( long double );
 float pow( float, float );§\indexc{pow}§
+float pow( float, float );$\indexc{pow}$
 double pow( double, double );
 long double pow( long double, long double );
 …
 \leavevmode
 \begin{cfa}[aboveskip=0pt,belowskip=0pt]
 float log( float );§\indexc{log}§
+float log( float );$\indexc{log}$
 double log( double );
 long double log( long double );
 …
 long double _Complex log( long double _Complex );
 float log2( float );§\indexc{log2}§
+float log2( float );$\indexc{log2}$
 double log2( double );
 long double log2( long double );
 …
 // long double _Complex log2( long double _Complex );
 float log10( float );§\indexc{log10}§
+float log10( float );$\indexc{log10}$
 double log10( double );
 long double log10( long double );
 …
 // long double _Complex log10( long double _Complex );
 float log1p( float );§\indexc{log1p}§
+float log1p( float );$\indexc{log1p}$
 double log1p( double );
 long double log1p( long double );
 int ilogb( float );§\indexc{ilogb}§
+int ilogb( float );$\indexc{ilogb}$
 int ilogb( double );
 int ilogb( long double );
 float logb( float );§\indexc{logb}§
+float logb( float );$\indexc{logb}$
 double logb( double );
 long double logb( long double );
 float sqrt( float );§\indexc{sqrt}§
+float sqrt( float );$\indexc{sqrt}$
 double sqrt( double );
 long double sqrt( long double );
 …
 long double _Complex sqrt( long double _Complex );
 float cbrt( float );§\indexc{cbrt}§
+float cbrt( float );$\indexc{cbrt}$
 double cbrt( double );
 long double cbrt( long double );
 float hypot( float, float );§\indexc{hypot}§
+float hypot( float, float );$\indexc{hypot}$
 double hypot( double, double );
 long double hypot( long double, long double );
 …
 \leavevmode
 \begin{cfa}[aboveskip=0pt,belowskip=0pt]
 float sin( float );§\indexc{sin}§
+float sin( float );$\indexc{sin}$
 double sin( double );
 long double sin( long double );
 …
 long double _Complex sin( long double _Complex );
 float cos( float );§\indexc{cos}§
+float cos( float );$\indexc{cos}$
 double cos( double );
 long double cos( long double );
 …
 long double _Complex cos( long double _Complex );
 float tan( float );§\indexc{tan}§
+float tan( float );$\indexc{tan}$
 double tan( double );
 long double tan( long double );
 …
 long double _Complex tan( long double _Complex );
 float asin( float );§\indexc{asin}§
+float asin( float );$\indexc{asin}$
 double asin( double );
 long double asin( long double );
 …
 long double _Complex asin( long double _Complex );
 float acos( float );§\indexc{acos}§
+float acos( float );$\indexc{acos}$
 double acos( double );
 long double acos( long double );
 …
 long double _Complex acos( long double _Complex );
 float atan( float );§\indexc{atan}§
+float atan( float );$\indexc{atan}$
 double atan( double );
 long double atan( long double );
 …
 long double _Complex atan( long double _Complex );
 float atan2( float, float );§\indexc{atan2}§
+float atan2( float, float );$\indexc{atan2}$
 double atan2( double, double );
 long double atan2( long double, long double );
 float atan( float, float ); §\C{// alternative name for atan2}§
 double atan( double, double );§\indexc{atan}§
+float atan( float, float ); $\C{// alternative name for atan2}$
+double atan( double, double );$\indexc{atan}$
 long double atan( long double, long double );
 \end{cfa}
 …
 \leavevmode
 \begin{cfa}[aboveskip=0pt,belowskip=0pt]
 float sinh( float );§\indexc{sinh}§
+float sinh( float );$\indexc{sinh}$
 double sinh( double );
 long double sinh( long double );
 …
 long double _Complex sinh( long double _Complex );
 float cosh( float );§\indexc{cosh}§
+float cosh( float );$\indexc{cosh}$
 double cosh( double );
 long double cosh( long double );
 …
 long double _Complex cosh( long double _Complex );
 float tanh( float );§\indexc{tanh}§
+float tanh( float );$\indexc{tanh}$
 double tanh( double );
 long double tanh( long double );
 …
 long double _Complex tanh( long double _Complex );
 float asinh( float );§\indexc{asinh}§
+float asinh( float );$\indexc{asinh}$
 double asinh( double );
 long double asinh( long double );
 …
 long double _Complex asinh( long double _Complex );
 float acosh( float );§\indexc{acosh}§
+float acosh( float );$\indexc{acosh}$
 double acosh( double );
 long double acosh( long double );
 …
 long double _Complex acosh( long double _Complex );
 float atanh( float );§\indexc{atanh}§
+float atanh( float );$\indexc{atanh}$
 double atanh( double );
 long double atanh( long double );
 …
 \leavevmode
 \begin{cfa}[aboveskip=0pt,belowskip=0pt]
 float erf( float );§\indexc{erf}§
+float erf( float );$\indexc{erf}$
 double erf( double );
 long double erf( long double );
 …
 long double _Complex erf( long double _Complex );
 float erfc( float );§\indexc{erfc}§
+float erfc( float );$\indexc{erfc}$
 double erfc( double );
 long double erfc( long double );
 …
 long double _Complex erfc( long double _Complex );
 float lgamma( float );§\indexc{lgamma}§
+float lgamma( float );$\indexc{lgamma}$
 double lgamma( double );
 long double lgamma( long double );
 …
 long double lgamma( long double, int * );
 float tgamma( float );§\indexc{tgamma}§
+float tgamma( float );$\indexc{tgamma}$
 double tgamma( double );
 long double tgamma( long double );
 …
 \leavevmode
 \begin{cfa}[aboveskip=0pt,belowskip=0pt]
 float floor( float );§\indexc{floor}§
+float floor( float );$\indexc{floor}$
 double floor( double );
 long double floor( long double );
 float ceil( float );§\indexc{ceil}§
+float ceil( float );$\indexc{ceil}$
 double ceil( double );
 long double ceil( long double );
 float trunc( float );§\indexc{trunc}§
+float trunc( float );$\indexc{trunc}$
 double trunc( double );
 long double trunc( long double );
 float rint( float );§\indexc{rint}§
+float rint( float );$\indexc{rint}$
 long double rint( long double );
 long int rint( float );
 …
 long long int rint( long double );
 long int lrint( float );§\indexc{lrint}§
+long int lrint( float );$\indexc{lrint}$
 long int lrint( double );
 long int lrint( long double );
 …
 long long int llrint( long double );
 float nearbyint( float );§\indexc{nearbyint}§
+float nearbyint( float );$\indexc{nearbyint}$
 double nearbyint( double );
 long double nearbyint( long double );
 float round( float );§\indexc{round}§
+float round( float );$\indexc{round}$
 long double round( long double );
 long int round( float );
 …
 long long int round( long double );
 long int lround( float );§\indexc{lround}§
+long int lround( float );$\indexc{lround}$
 long int lround( double );
 long int lround( long double );
 …
 \leavevmode
 \begin{cfa}[aboveskip=0pt,belowskip=0pt]
 float copysign( float, float );§\indexc{copysign}§
+float copysign( float, float );$\indexc{copysign}$
 double copysign( double, double );
 long double copysign( long double, long double );
 float frexp( float, int * );§\indexc{frexp}§
+float frexp( float, int * );$\indexc{frexp}$
 double frexp( double, int * );
 long double frexp( long double, int * );
 float ldexp( float, int );§\indexc{ldexp}§
+float ldexp( float, int );$\indexc{ldexp}$
 double ldexp( double, int );
 long double ldexp( long double, int );
 [ float, float ] modf( float );§\indexc{modf}§
+[ float, float ] modf( float );$\indexc{modf}$
 float modf( float, float * );
 [ double, double ] modf( double );
 …
 long double modf( long double, long double * );
 float nextafter( float, float );§\indexc{nextafter}§
+float nextafter( float, float );$\indexc{nextafter}$
 double nextafter( double, double );
 long double nextafter( long double, long double );
 float nexttoward( float, long double );§\indexc{nexttoward}§
+float nexttoward( float, long double );$\indexc{nexttoward}$
 double nexttoward( double, long double );
 long double nexttoward( long double, long double );
 float scalbn( float, int );§\indexc{scalbn}§
+float scalbn( float, int );$\indexc{scalbn}$
 double scalbn( double, int );
 long double scalbn( long double, int );
 float scalbln( float, long int );§\indexc{scalbln}§
+float scalbln( float, long int );$\indexc{scalbln}$
 double scalbln( double, long int );
 long double scalbln( long double, long int );
 …
 \begin{cfa}[aboveskip=0pt,belowskip=0pt]
 struct Duration {
         int64_t tv; §\C{// nanoseconds}§
+        int64_t tv; $\C{// nanoseconds}$
 };
 …
 \begin{cfa}[aboveskip=0pt,belowskip=0pt]
 struct Time {
         uint64_t tv; §\C{// nanoseconds since UNIX epoch}§
+        uint64_t tv; $\C{// nanoseconds since UNIX epoch}$
 };
 …
 \begin{cfa}[aboveskip=0pt,belowskip=0pt]
 struct Clock {
         Duration offset; §\C{// for virtual clock: contains offset from real-time}§
         int clocktype; §\C{// implementation only -1 (virtual), CLOCK\_REALTIME}§
+        Duration offset; $\C{// for virtual clock: contains offset from real-time}$
+        int clocktype; $\C{// implementation only -1 (virtual), CLOCK\_REALTIME}$
 };
 …
 void ?{}( Clock & clk, Duration adj );
 Duration getResNsec(); §\C{// with nanoseconds}§
 Duration getRes(); §\C{// without nanoseconds}§
 Time getTimeNsec(); §\C{// with nanoseconds}§
 Time getTime(); §\C{// without nanoseconds}§
+Duration getResNsec(); $\C{// with nanoseconds}$
+Duration getRes(); $\C{// without nanoseconds}$
+Time getTimeNsec(); $\C{// with nanoseconds}$
+Time getTime(); $\C{// without nanoseconds}$
 Time getTime( Clock & clk );
 Time ?()( Clock & clk );
 …
 \begin{cfa}
 void ?{}( Int * this ); §\C{// constructor/destructor}§
+void ?{}( Int * this ); $\C{// constructor/destructor}$
 void ?{}( Int * this, Int init );
 void ?{}( Int * this, zero_t );
 …
 void ^?{}( Int * this );
 Int ?=?( Int * lhs, Int rhs ); §\C{// assignment}§
+Int ?=?( Int * lhs, Int rhs ); $\C{// assignment}$
 Int ?=?( Int * lhs, long int rhs );
 Int ?=?( Int * lhs, unsigned long int rhs );
 …
 unsigned long int narrow( Int val );
 int ?==?( Int oper1, Int oper2 ); §\C{// comparison}§
+int ?==?( Int oper1, Int oper2 ); $\C{// comparison}$
 int ?==?( Int oper1, long int oper2 );
 int ?==?( long int oper2, Int oper1 );
 …
 int ?>=?( unsigned long int oper1, Int oper2 );
 Int +?( Int oper ); §\C{// arithmetic}§
+Int +?( Int oper ); $\C{// arithmetic}$
 Int -?( Int oper );
 Int ~?( Int oper );
 …
 Int ?>>=?( Int * lhs, mp_bitcnt_t shift );
 Int abs( Int oper ); §\C{// number functions}§
+Int abs( Int oper ); $\C{// number functions}$
 Int fact( unsigned long int N );
 Int gcd( Int oper1, Int oper2 );
 …
 Int sqrt( Int oper );
 forall( dtype istype | istream( istype ) ) istype * ?|?( istype * is, Int * mp );  §\C{// I/O}§
+forall( dtype istype | istream( istype ) ) istype * ?|?( istype * is, Int * mp );  $\C{// I/O}$
 forall( dtype ostype | ostream( ostype ) ) ostype * ?|?( ostype * os, Int mp );
 \end{cfa}
 …
 \hline
 \begin{cfa}
 #include <gmp>§\indexc{gmp}§
+#include <gmp>$\indexc{gmp}$
 int main( void ) {
         sout | "Factorial Numbers";
 …
+&
 \begin{cfa}
 #include <gmp.h>§\indexc{gmp.h}§
+#include <gmp.h>$\indexc{gmp.h}$
 int main( void ) {
         ®gmp_printf®( "Factorial Numbers\n" );
         ®mpz_t® fact;
         ®mpz_init_set_ui®( fact, 1 );
         ®gmp_printf®( "%d %Zd\n", 0, fact );
+        @gmp_printf@( "Factorial Numbers\n" );
+        @mpz_t@ fact;
+        @mpz_init_set_ui@( fact, 1 );
+        @gmp_printf@( "%d %Zd\n", 0, fact );
         for ( unsigned int i = 1; i <= 40; i += 1 ) {
                 ®mpz_mul_ui®( fact, fact, i );
                 ®gmp_printf®( "%d %Zd\n", i, fact );
+                @mpz_mul_ui@( fact, fact, i );
+                @gmp_printf@( "%d %Zd\n", i, fact );
+        }
+}
 …
 \begin{cfa}[belowskip=0pt]
 // implementation
 struct Rational {§\indexc{Rational}§
         long int numerator, denominator; §\C{// invariant: denominator > 0}§
+struct Rational {$\indexc{Rational}$
+        long int numerator, denominator; $\C{// invariant: denominator > 0}$
 }; // Rational
 Rational rational(); §\C{// constructors}§
+Rational rational(); $\C{// constructors}$
 Rational rational( long int n );
 Rational rational( long int n, long int d );
 …
 void ?{}( Rational * r, one_t );
 long int numerator( Rational r ); §\C{// numerator/denominator getter/setter}§
+long int numerator( Rational r ); $\C{// numerator/denominator getter/setter}$
 long int numerator( Rational r, long int n );
 long int denominator( Rational r );
 long int denominator( Rational r, long int d );
 int ?==?( Rational l, Rational r ); §\C{// comparison}§
+int ?==?( Rational l, Rational r ); $\C{// comparison}$
 int ?!=?( Rational l, Rational r );
 int ?<?( Rational l, Rational r );
 …
 int ?>=?( Rational l, Rational r );
 Rational -?( Rational r ); §\C{// arithmetic}§
+Rational -?( Rational r ); $\C{// arithmetic}$
 Rational ?+?( Rational l, Rational r );
 Rational ?-?( Rational l, Rational r );
 …
 Rational ?/?( Rational l, Rational r );
 double widen( Rational r ); §\C{// conversion}§
+double widen( Rational r ); $\C{// conversion}$
 Rational narrow( double f, long int md );

driver/cfa.cc

-              r342af53
+              r8e4aa05
 // Created On       : Tue Aug 20 13:44:49 2002
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Nov 17 14:27:28 2020
 // Update Count     : 440
+// Last Modified On : Sat Jan 16 07:30:19 2021
+// Update Count     : 442
 //
 …
                 args[nargs++] = "-no-integrated-cpp";
                 args[nargs++] = "-Wno-deprecated";
+                args[nargs++] = "-Wno-strict-aliasing";                 // casting from one type to another
                 #ifdef HAVE_CAST_FUNCTION_TYPE
                 args[nargs++] = "-Wno-cast-function-type";

libcfa/configure.ac

-              r342af53
+              r8e4aa05
 AH_TEMPLATE([CFA_HAVE_IOSQE_FIXED_FILE],[Defined if io_uring support is present when compiling libcfathread and supports the flag FIXED_FILE.])
 AH_TEMPLATE([CFA_HAVE_IOSQE_IO_DRAIN],[Defined if io_uring support is present when compiling libcfathread and supports the flag IO_DRAIN.])
-AH_TEMPLATE([CFA_HAVE_IOSQE_ASYNC],[Defined if io_uring support is present when compiling libcfathread and supports the flag ASYNC.])
 AH_TEMPLATE([CFA_HAVE_IOSQE_IO_LINK],[Defined if io_uring support is present when compiling libcfathread and supports the flag IO_LINK.])
 AH_TEMPLATE([CFA_HAVE_IOSQE_IO_HARDLINK],[Defined if io_uring support is present when compiling libcfathread and supports the flag IO_HARDLINK.])
+AH_TEMPLATE([CFA_HAVE_IOSQE_ASYNC],[Defined if io_uring support is present when compiling libcfathread and supports the flag ASYNC.])
+AH_TEMPLATE([CFA_HAVE_IOSQE_BUFFER_SELECT],[Defined if io_uring support is present when compiling libcfathread and supports the flag BUFFER_SELECT.])
 AH_TEMPLATE([CFA_HAVE_SPLICE_F_FD_IN_FIXED],[Defined if io_uring support is present when compiling libcfathread and supports the flag SPLICE_F_FD_IN_FIXED.])
 AH_TEMPLATE([CFA_HAVE_IORING_SETUP_ATTACH_WQ],[Defined if io_uring support is present when compiling libcfathread and supports the flag IORING_SETUP_ATTACH_WQ.])
 …
 define(ioring_ops, [IORING_OP_NOP,IORING_OP_READV,IORING_OP_WRITEV,IORING_OP_FSYNC,IORING_OP_READ_FIXED,IORING_OP_WRITE_FIXED,IORING_OP_POLL_ADD,IORING_OP_POLL_REMOVE,IORING_OP_SYNC_FILE_RANGE,IORING_OP_SENDMSG,IORING_OP_RECVMSG,IORING_OP_TIMEOUT,IORING_OP_TIMEOUT_REMOVE,IORING_OP_ACCEPT,IORING_OP_ASYNC_CANCEL,IORING_OP_LINK_TIMEOUT,IORING_OP_CONNECT,IORING_OP_FALLOCATE,IORING_OP_OPENAT,IORING_OP_CLOSE,IORING_OP_FILES_UPDATE,IORING_OP_STATX,IORING_OP_READ,IORING_OP_WRITE,IORING_OP_FADVISE,IORING_OP_MADVISE,IORING_OP_SEND,IORING_OP_RECV,IORING_OP_OPENAT2,IORING_OP_EPOLL_CTL,IORING_OP_SPLICE,IORING_OP_PROVIDE_BUFFERS,IORING_OP_REMOVE_BUFFER,IORING_OP_TEE])
 define(ioring_flags, [IOSQE_FIXED_FILE,IOSQE_IO_DRAIN,IOSQE_ASYNC,IOSQE_IO_LINK,IOSQE_IO_HARDLINK,SPLICE_F_FD_IN_FIXED,IORING_SETUP_ATTACH_WQ])
+define(ioring_flags, [IOSQE_FIXED_FILE,IOSQE_IO_DRAIN,IOSQE_IO_LINK,IOSQE_IO_HARDLINK,IOSQE_ASYNC,IOSQE_BUFFER_SELECT,SPLICE_F_FD_IN_FIXED,IORING_SETUP_ATTACH_WQ])
 define(ioring_from_decls, [

libcfa/prelude/builtins.c

-              r342af53
+              r8e4aa05
 // type that wraps a pointer and a destructor-like function - used in generating implicit destructor calls for struct members in user-defined functions
 // Note: needs to occur early, because it is used to generate destructor calls during code generation
 forall(dtype T)
+forall(T &)
 struct __Destructor {
         T * object;
 …
 // defined destructor in the case that non-generated code wants to use __Destructor
 forall(dtype T)
+forall(T &)
 static inline void ^?{}(__Destructor(T) & x) {
         if (x.object && x.dtor) {
 …
 // easy interface into __Destructor's destructor for easy codegen purposes
 extern "C" {
         forall(dtype T)
+        forall(T &)
         static inline void __destroy_Destructor(__Destructor(T) * dtor) {
                 ^(*dtor){};
 …
 void abort( const char fmt[], ... ) __attribute__ (( format(printf, 1, 2), __nothrow__, __leaf__, __noreturn__ ));
 forall(dtype T)
+forall(T &)
 static inline T & identity(T & i) {
         return i;
 …
 static inline void ^?{}($generator &) {}
 trait is_generator(dtype T) {
+trait is_generator(T &) {
       void main(T & this);
       $generator * get_generator(T & this);
 };
 forall(dtype T | is_generator(T))
+forall(T & | is_generator(T))
 static inline T & resume(T & gen) {
         main(gen);
 …
 static inline {
         forall( dtype DT | { DT & ?+=?( DT &, one_t ); } )
+        forall( DT & | { DT & ?+=?( DT &, one_t ); } )
         DT & ++?( DT & x ) { return x += 1; }
         forall( dtype DT | sized(DT) | { void ?{}( DT &, DT ); void ^?{}( DT & ); DT & ?+=?( DT &, one_t ); } )
+        forall( DT & | sized(DT) | { void ?{}( DT &, DT ); void ^?{}( DT & ); DT & ?+=?( DT &, one_t ); } )
         DT & ?++( DT & x ) { DT tmp = x; x += 1; return tmp; }
         forall( dtype DT | { DT & ?-=?( DT &, one_t ); } )
+        forall( DT & | { DT & ?-=?( DT &, one_t ); } )
         DT & --?( DT & x ) { return x -= 1; }
         forall( dtype DT | sized(DT) | { void ?{}( DT &, DT ); void ^?{}( DT & ); DT & ?-=?( DT &, one_t ); } )
+        forall( DT & | sized(DT) | { void ?{}( DT &, DT ); void ^?{}( DT & ); DT & ?-=?( DT &, one_t ); } )
         DT & ?--( DT & x ) { DT tmp = x; x -= 1; return tmp; }
         forall( dtype DT | { int ?!=?( const DT &, zero_t ); } )
+        forall( DT & | { int ?!=?( const DT &, zero_t ); } )
         int !?( const DT & x ) { return !( x != 0 ); }
 } // distribution
 // universal typed pointer constant
 static inline forall( dtype DT ) DT * intptr( uintptr_t addr ) { return (DT *)addr; }
+static inline forall( DT & ) DT * intptr( uintptr_t addr ) { return (DT *)addr; }
 static inline forall( ftype FT ) FT * intptr( uintptr_t addr ) { return (FT *)addr; }
 …
 #define __CFA_EXP_OVERFLOW__()
 static inline forall( otype OT | { void ?{}( OT & this, one_t ); OT ?*?( OT, OT ); } ) {
+static inline forall( OT | { void ?{}( OT & this, one_t ); OT ?*?( OT, OT ); } ) {
         OT ?\?( OT ep, unsigned int y ) { __CFA_EXP__(); }
         OT ?\?( OT ep, unsigned long int y ) { __CFA_EXP__(); }

libcfa/prelude/defines.hfa.in

-              r342af53
+              r8e4aa05
 /* Defined if io_uring support is present when compiling libcfathread and
+   supports the flag BUFFER_SELECT. */
+#undef CFA_HAVE_IOSQE_BUFFER_SELECT
+/* Defined if io_uring support is present when compiling libcfathread and
    supports the flag FIXED_FILE. */
 #undef CFA_HAVE_IOSQE_FIXED_FILE

libcfa/prelude/prelude-gen.cc

r342af53	r8e4aa05
159	159	int main() {
160	160	cout << "# 2 \"prelude.cfa\" // needed for error messages from this file" << endl;
161		cout << "trait sized(~~dtype T~~) {};" << endl;
	161	cout << "trait sized(T &) {};" << endl;
162	162
163	163	cout << "//////////////////////////" << endl;
…	…
264	264	for (auto cvq : qualifiersPair) {
265	265	for (auto is_vol : { " ", "volatile" }) {
266		cout << "forall(~~dtype DT~~) void ?{}(" << cvq.first << type << " * " << is_vol << " &, " << cvq.second << "DT *);" << endl;
	266	cout << "forall(DT &) void ?{}(" << cvq.first << type << " * " << is_vol << " &, " << cvq.second << "DT *);" << endl;
267	267	}
268	268	}
…	…
279	279	for (auto cvq : qualifiersSingle) {
280	280	for (auto is_vol : { " ", "volatile" }) {
281		cout << "forall(~~dtype DT~~) void ?{}(" << cvq << " DT" << " * " << is_vol << " &);" << endl;
	281	cout << "forall(DT &) void ?{}(" << cvq << " DT" << " * " << is_vol << " &);" << endl;
282	282	}
283	283	for (auto is_vol : { " ", "volatile" }) {
284		cout << "forall(~~dtype DT~~) void ^?{}(" << cvq << " DT" << " * " << is_vol << " &);" << endl;
	284	cout << "forall(DT &) void ^?{}(" << cvq << " DT" << " * " << is_vol << " &);" << endl;
285	285	}
286	286	}
…	…
290	290	for (auto is_vol : { " ", "volatile" }) {
291	291	for (auto cvq : qualifiersSingle) {
292		cout << "forall(~~dtype DT~~) void ?{}( " << cvq << type << " * " << is_vol << " &, zero_t);" << endl;
	292	cout << "forall(DT &) void ?{}( " << cvq << type << " * " << is_vol << " &, zero_t);" << endl;
293	293	}
294	294	}
…	…
317	317	for (auto op : pointerOperators) {
318	318	auto forall = [&op]() {
319		cout << "forall(~~dtype DT~~" << op.sized << ") ";
	319	cout << "forall(DT &" << op.sized << ") ";
320	320	};
321	321	for (auto type : { "DT"/*, "void"*/ } ) {
…	…
408	408	for (auto is_vol : { " ", "volatile" }) {
409	409	for (auto cvq : qualifiersPair) {
410		cout << "forall(~~dtype DT~~) " << cvq.first << "void * ?=?( " << cvq.first << "void * " << is_vol << " &, " << cvq.second << "DT *);" << endl;
	410	cout << "forall(DT &) " << cvq.first << "void * ?=?( " << cvq.first << "void * " << is_vol << " &, " << cvq.second << "DT *);" << endl;
411	411	}
412	412	for (auto cvq : qualifiersSingle) {
413		cout << "forall(~~dtype DT~~) " << cvq << " DT * ?=?( " << cvq << " DT * " << is_vol << " &, zero_t);" << endl;
	413	cout << "forall(DT &) " << cvq << " DT * ?=?( " << cvq << " DT * " << is_vol << " &, zero_t);" << endl;
414	414	}
415	415	}

libcfa/prelude/prelude.old.cf

-              r342af53
+              r8e4aa05
 // ------------------------------------------------------------
 trait sized(dtype T) {};
+trait sized(T &) {};
 // ------------------------------------------------------------
 …
 long double _Complex    ?--( long double _Complex & ),          ?--( volatile long double _Complex & );
 forall( dtype T | sized(T) ) T *                         ?++(                T *& );
 forall( dtype T | sized(T) ) const T *           ?++( const          T *& );
 forall( dtype T | sized(T) ) volatile T *                ?++(       volatile T *& );
 forall( dtype T | sized(T) ) const volatile T *  ?++( const volatile T *& );
 forall( dtype T | sized(T) ) T *                         ?--(                T *& );
 forall( dtype T | sized(T) ) const T *           ?--( const          T *& );
 forall( dtype T | sized(T) ) volatile T *                ?--(       volatile T *& );
 forall( dtype T | sized(T) ) const volatile T *  ?--( const volatile T *& );
 forall( dtype T | sized(T) ) T &                 ?[?](                T *,          ptrdiff_t );
 forall( dtype T | sized(T) ) const T &   ?[?]( const          T *,          ptrdiff_t );
 forall( dtype T | sized(T) ) volatile T &        ?[?](       volatile T *,          ptrdiff_t );
 forall( dtype T | sized(T) ) const volatile T & ?[?]( const volatile T *,           ptrdiff_t );
 forall( dtype T | sized(T) ) T &                 ?[?](          ptrdiff_t,                T * );
 forall( dtype T | sized(T) ) const T &   ?[?](          ptrdiff_t, const          T * );
 forall( dtype T | sized(T) ) volatile T &        ?[?](          ptrdiff_t,       volatile T * );
 forall( dtype T | sized(T) ) const volatile T & ?[?](           ptrdiff_t, const volatile T * );
+forall( T & | sized(T) ) T *                     ?++(                T *& );
+forall( T & | sized(T) ) const T *               ?++( const          T *& );
+forall( T & | sized(T) ) volatile T *            ?++(       volatile T *& );
+forall( T & | sized(T) ) const volatile T *      ?++( const volatile T *& );
+forall( T & | sized(T) ) T *                     ?--(                T *& );
+forall( T & | sized(T) ) const T *               ?--( const          T *& );
+forall( T & | sized(T) ) volatile T *            ?--(       volatile T *& );
+forall( T & | sized(T) ) const volatile T *      ?--( const volatile T *& );
+forall( T & | sized(T) ) T &             ?[?](                T *,          ptrdiff_t );
+forall( T & | sized(T) ) const T &       ?[?]( const          T *,          ptrdiff_t );
+forall( T & | sized(T) ) volatile T &    ?[?](       volatile T *,          ptrdiff_t );
+forall( T & | sized(T) ) const volatile T & ?[?]( const volatile T *,       ptrdiff_t );
+forall( T & | sized(T) ) T &             ?[?](          ptrdiff_t,                T * );
+forall( T & | sized(T) ) const T &       ?[?](          ptrdiff_t, const          T * );
+forall( T & | sized(T) ) volatile T &    ?[?](          ptrdiff_t,       volatile T * );
+forall( T & | sized(T) ) const volatile T & ?[?](               ptrdiff_t, const volatile T * );
 // ------------------------------------------------------------
 …
 long double _Complex    ++?( long double _Complex & ),          --?( long double _Complex & );
 forall( dtype T | sized(T) ) T *                         ++?(                T *& );
 forall( dtype T | sized(T) ) const T *           ++?( const          T *& );
 forall( dtype T | sized(T) ) volatile T *                ++?(       volatile T *& );
 forall( dtype T | sized(T) ) const volatile T *  ++?( const volatile T *& );
 forall( dtype T | sized(T) ) T *                         --?(                T *& );
 forall( dtype T | sized(T) ) const T *           --?( const          T *& );
 forall( dtype T | sized(T) ) volatile T *                --?(       volatile T *& );
 forall( dtype T | sized(T) ) const volatile T *  --?( const volatile T *& );
 forall( dtype T | sized(T) ) T &                 *?(                 T * );
 forall( dtype T | sized(T) ) const T &           *?( const           T * );
 forall( dtype T | sized(T) ) volatile T &        *?(       volatile  T * );
 forall( dtype T | sized(T) ) const volatile T & *?( const volatile  T * );
+forall( T & | sized(T) ) T *                     ++?(                T *& );
+forall( T & | sized(T) ) const T *               ++?( const          T *& );
+forall( T & | sized(T) ) volatile T *            ++?(       volatile T *& );
+forall( T & | sized(T) ) const volatile T *      ++?( const volatile T *& );
+forall( T & | sized(T) ) T *                     --?(                T *& );
+forall( T & | sized(T) ) const T *               --?( const          T *& );
+forall( T & | sized(T) ) volatile T *            --?(       volatile T *& );
+forall( T & | sized(T) ) const volatile T *      --?( const volatile T *& );
+forall( T & | sized(T) ) T &             *?(                 T * );
+forall( T & | sized(T) ) const T &               *?( const           T * );
+forall( T & | sized(T) ) volatile T &    *?(       volatile  T * );
+forall( T & | sized(T) ) const volatile T & *?( const volatile  T * );
 forall( ftype FT ) FT &          *?( FT * );
 …
                 !?( float _Complex ),           !?( double _Complex ),          !?( long double _Complex );
 forall( dtype DT ) int !?(                DT * );
 forall( dtype DT ) int !?( const          DT * );
 forall( dtype DT ) int !?(       volatile DT * );
 forall( dtype DT ) int !?( const volatile DT * );
+forall( DT & ) int !?(                DT * );
+forall( DT & ) int !?( const          DT * );
+forall( DT & ) int !?(       volatile DT * );
+forall( DT & ) int !?( const volatile DT * );
 forall( ftype FT ) int !?( FT * );
 …
 long double _Complex    ?+?( long double _Complex, long double _Complex ),      ?-?( long double _Complex, long double _Complex );
 forall( dtype T | sized(T) ) T *                ?+?(                T *,          ptrdiff_t );
 forall( dtype T | sized(T) ) T *                ?+?(          ptrdiff_t,                T * );
 forall( dtype T | sized(T) ) const T *          ?+?( const          T *,          ptrdiff_t );
 forall( dtype T | sized(T) ) const T *          ?+?(          ptrdiff_t, const          T * );
 forall( dtype T | sized(T) ) volatile T *       ?+?(       volatile T *,          ptrdiff_t );
 forall( dtype T | sized(T) ) volatile T *       ?+?(          ptrdiff_t,       volatile T * );
 forall( dtype T | sized(T) ) const volatile T * ?+?( const volatile T *,          ptrdiff_t );
 forall( dtype T | sized(T) ) const volatile T * ?+?(          ptrdiff_t, const volatile T * );
 forall( dtype T | sized(T) ) T *                ?-?(                T *,          ptrdiff_t );
 forall( dtype T | sized(T) ) const T *          ?-?( const          T *,          ptrdiff_t );
 forall( dtype T | sized(T) ) volatile T *       ?-?(       volatile T *,          ptrdiff_t );
 forall( dtype T | sized(T) ) const volatile T * ?-?( const volatile T *,          ptrdiff_t );
 forall( dtype T | sized(T) ) ptrdiff_t          ?-?( const volatile T *, const volatile T * );
+forall( T & | sized(T) ) T *            ?+?(                T *,          ptrdiff_t );
+forall( T & | sized(T) ) T *            ?+?(          ptrdiff_t,                T * );
+forall( T & | sized(T) ) const T *              ?+?( const          T *,          ptrdiff_t );
+forall( T & | sized(T) ) const T *              ?+?(          ptrdiff_t, const          T * );
+forall( T & | sized(T) ) volatile T *   ?+?(       volatile T *,          ptrdiff_t );
+forall( T & | sized(T) ) volatile T *   ?+?(          ptrdiff_t,       volatile T * );
+forall( T & | sized(T) ) const volatile T *     ?+?( const volatile T *,          ptrdiff_t );
+forall( T & | sized(T) ) const volatile T *     ?+?(          ptrdiff_t, const volatile T * );
+forall( T & | sized(T) ) T *            ?-?(                T *,          ptrdiff_t );
+forall( T & | sized(T) ) const T *              ?-?( const          T *,          ptrdiff_t );
+forall( T & | sized(T) ) volatile T *   ?-?(       volatile T *,          ptrdiff_t );
+forall( T & | sized(T) ) const volatile T *     ?-?( const volatile T *,          ptrdiff_t );
+forall( T & | sized(T) ) ptrdiff_t              ?-?( const volatile T *, const volatile T * );
 // ------------------------------------------------------------
 …
            ?>?( long double, long double ),                             ?>=?( long double, long double );
 forall( dtype DT ) signed int ?<?(                 DT *,                DT * );
 forall( dtype DT ) signed int ?<?(  const          DT *, const          DT * );
 forall( dtype DT ) signed int ?<?(        volatile DT *,       volatile DT * );
 forall( dtype DT ) signed int ?<?(  const volatile DT *, const volatile DT * );
 forall( dtype DT ) signed int ?>?(                 DT *,                DT * );
 forall( dtype DT ) signed int ?>?(  const          DT *, const          DT * );
 forall( dtype DT ) signed int ?>?(        volatile DT *,       volatile DT * );
 forall( dtype DT ) signed int ?>?(  const volatile DT *, const volatile DT * );
 forall( dtype DT ) signed int ?<=?(                 DT *,                DT * );
 forall( dtype DT ) signed int ?<=?(  const          DT *, const          DT * );
 forall( dtype DT ) signed int ?<=?(        volatile DT *,       volatile DT * );
 forall( dtype DT ) signed int ?<=?( const volatile DT *, const volatile DT * );
 forall( dtype DT ) signed int ?>=?(                 DT *,                DT * );
 forall( dtype DT ) signed int ?>=?(  const          DT *, const          DT * );
 forall( dtype DT ) signed int ?>=?(        volatile DT *,       volatile DT * );
 forall( dtype DT ) signed int ?>=?( const volatile DT *, const volatile DT * );
+forall( DT & ) signed int ?<?(                 DT *,                DT * );
+forall( DT & ) signed int ?<?(  const          DT *, const          DT * );
+forall( DT & ) signed int ?<?(        volatile DT *,       volatile DT * );
+forall( DT & ) signed int ?<?(  const volatile DT *, const volatile DT * );
+forall( DT & ) signed int ?>?(                 DT *,                DT * );
+forall( DT & ) signed int ?>?(  const          DT *, const          DT * );
+forall( DT & ) signed int ?>?(        volatile DT *,       volatile DT * );
+forall( DT & ) signed int ?>?(  const volatile DT *, const volatile DT * );
+forall( DT & ) signed int ?<=?(                 DT *,                DT * );
+forall( DT & ) signed int ?<=?(  const          DT *, const          DT * );
+forall( DT & ) signed int ?<=?(        volatile DT *,       volatile DT * );
+forall( DT & ) signed int ?<=?( const volatile DT *, const volatile DT * );
+forall( DT & ) signed int ?>=?(                 DT *,                DT * );
+forall( DT & ) signed int ?>=?(  const          DT *, const          DT * );
+forall( DT & ) signed int ?>=?(        volatile DT *,       volatile DT * );
+forall( DT & ) signed int ?>=?( const volatile DT *, const volatile DT * );
 // ------------------------------------------------------------
 …
 signed int ?==?( one_t, one_t ),                                                        ?!=?( one_t, one_t );
 forall( dtype DT ) signed int ?==?(                DT *,                DT * );
 forall( dtype DT ) signed int ?==?( const          DT *, const          DT * );
 forall( dtype DT ) signed int ?==?(       volatile DT *,       volatile DT * );
 forall( dtype DT ) signed int ?==?( const volatile DT *, const volatile DT * );
+forall( DT & ) signed int ?==?(            DT *,                DT * );
+forall( DT & ) signed int ?==?( const      DT *, const          DT * );
+forall( DT & ) signed int ?==?(       volatile DT *,       volatile DT * );
+forall( DT & ) signed int ?==?( const volatile DT *, const volatile DT * );
 forall( ftype FT ) signed int ?==?( FT *, FT * );
 forall( dtype DT ) signed int ?!=?(                DT *,                DT * );
 forall( dtype DT ) signed int ?!=?( const          DT *, const          DT * );
 forall( dtype DT ) signed int ?!=?(       volatile DT *,       volatile DT * );
 forall( dtype DT ) signed int ?!=?( const volatile DT *, const volatile DT * );
+forall( DT & ) signed int ?!=?(            DT *,                DT * );
+forall( DT & ) signed int ?!=?( const      DT *, const          DT * );
+forall( DT & ) signed int ?!=?(       volatile DT *,       volatile DT * );
+forall( DT & ) signed int ?!=?( const volatile DT *, const volatile DT * );
 forall( ftype FT ) signed int ?!=?( FT *, FT * );
 …
 forall( ftype FT ) FT *                 ?=?( FT *&, FT * );
 forall( ftype FT ) FT *                 ?=?( FT * volatile &, FT * );
 forall( dtype DT ) DT *                 ?=?(                 DT *          &,                   DT * );
 forall( dtype DT ) DT *                 ?=?(                 DT * volatile &,                   DT * );
 forall( dtype DT ) const DT *           ?=?( const           DT *          &,                   DT * );
 forall( dtype DT ) const DT *           ?=?( const           DT * volatile &,                   DT * );
 forall( dtype DT ) const DT *           ?=?( const           DT *          &, const             DT * );
 forall( dtype DT ) const DT *           ?=?( const           DT * volatile &, const             DT * );
 forall( dtype DT ) volatile DT *        ?=?(       volatile  DT *          &,                   DT * );
 forall( dtype DT ) volatile DT *        ?=?(       volatile  DT * volatile &,                   DT * );
 forall( dtype DT ) volatile DT *        ?=?(       volatile  DT *          &,       volatile    DT * );
 forall( dtype DT ) volatile DT *        ?=?(       volatile  DT * volatile &,       volatile    DT * );
 forall( dtype DT ) const volatile DT *  ?=?( const volatile  DT *          &,                   DT * );
 forall( dtype DT ) const volatile DT *  ?=?( const volatile  DT * volatile &,                   DT * );
 forall( dtype DT ) const volatile DT *  ?=?( const volatile  DT *          &, const             DT * );
 forall( dtype DT ) const volatile DT *  ?=?( const volatile  DT * volatile &, const             DT * );
 forall( dtype DT ) const volatile DT *  ?=?( const volatile  DT *          &,       volatile    DT * );
 forall( dtype DT ) const volatile DT *  ?=?( const volatile  DT * volatile &,       volatile    DT * );
 forall( dtype DT ) const volatile DT *  ?=?( const volatile  DT *          &, const volatile    DT * );
 forall( dtype DT ) const volatile DT *  ?=?( const volatile  DT * volatile &, const volatile    DT * );
 forall( dtype DT ) void *                ?=?(                void *          &,                 DT * );
 forall( dtype DT ) void *                ?=?(                void * volatile &,                 DT * );
 forall( dtype DT ) const void *          ?=?( const          void *          &,                 DT * );
 forall( dtype DT ) const void *          ?=?( const          void * volatile &,                 DT * );
 forall( dtype DT ) const void *          ?=?( const          void *          &, const           DT * );
 forall( dtype DT ) const void *          ?=?( const          void * volatile &, const           DT * );
 forall( dtype DT ) volatile void *       ?=?(       volatile void *          &,                 DT * );
 forall( dtype DT ) volatile void *       ?=?(       volatile void * volatile &,                 DT * );
 forall( dtype DT ) volatile void *       ?=?(       volatile void *          &,       volatile  DT * );
 forall( dtype DT ) volatile void *       ?=?(       volatile void * volatile &,       volatile  DT * );
 forall( dtype DT ) const volatile void * ?=?( const volatile void *          &,                 DT * );
 forall( dtype DT ) const volatile void * ?=?( const volatile void * volatile &,                 DT * );
 forall( dtype DT ) const volatile void * ?=?( const volatile void *          &, const           DT * );
 forall( dtype DT ) const volatile void * ?=?( const volatile void * volatile &, const           DT * );
 forall( dtype DT ) const volatile void * ?=?( const volatile void *          &,       volatile  DT * );
 forall( dtype DT ) const volatile void * ?=?( const volatile void * volatile &,       volatile  DT * );
 forall( dtype DT ) const volatile void * ?=?( const volatile void *          &, const volatile  DT * );
 forall( dtype DT ) const volatile void * ?=?( const volatile void * volatile &, const volatile  DT * );
+forall( ftype FT ) FT *                 ?=?( FT * volatile &, FT * );
+forall( DT & ) DT *                     ?=?(                 DT *          &,                   DT * );
+forall( DT & ) DT *                     ?=?(                 DT * volatile &,                   DT * );
+forall( DT & ) const DT *               ?=?( const           DT *          &,                   DT * );
+forall( DT & ) const DT *               ?=?( const           DT * volatile &,                   DT * );
+forall( DT & ) const DT *               ?=?( const           DT *          &, const             DT * );
+forall( DT & ) const DT *               ?=?( const           DT * volatile &, const             DT * );
+forall( DT & ) volatile DT *    ?=?(       volatile  DT *          &,                   DT * );
+forall( DT & ) volatile DT *    ?=?(       volatile  DT * volatile &,                   DT * );
+forall( DT & ) volatile DT *    ?=?(       volatile  DT *          &,       volatile    DT * );
+forall( DT & ) volatile DT *    ?=?(       volatile  DT * volatile &,       volatile    DT * );
+forall( DT & ) const volatile DT *      ?=?( const volatile  DT *          &,                   DT * );
+forall( DT & ) const volatile DT *  ?=?( const volatile  DT * volatile &,                       DT * );
+forall( DT & ) const volatile DT *  ?=?( const volatile  DT *      &, const             DT * );
+forall( DT & ) const volatile DT *  ?=?( const volatile  DT * volatile &, const         DT * );
+forall( DT & ) const volatile DT *  ?=?( const volatile  DT *      &,       volatile    DT * );
+forall( DT & ) const volatile DT *  ?=?( const volatile  DT * volatile &,           volatile    DT * );
+forall( DT & ) const volatile DT *  ?=?( const volatile  DT *      &, const volatile    DT * );
+forall( DT & ) const volatile DT *  ?=?( const volatile  DT * volatile &, const volatile        DT * );
+forall( DT & ) void *            ?=?(                void *          &,                 DT * );
+forall( DT & ) void *            ?=?(                void * volatile &,                 DT * );
+forall( DT & ) const void *              ?=?( const          void *          &,                 DT * );
+forall( DT & ) const void *              ?=?( const          void * volatile &,                 DT * );
+forall( DT & ) const void *              ?=?( const          void *          &, const           DT * );
+forall( DT & ) const void *              ?=?( const          void * volatile &, const           DT * );
+forall( DT & ) volatile void *   ?=?(       volatile void *          &,                 DT * );
+forall( DT & ) volatile void *   ?=?(       volatile void * volatile &,                 DT * );
+forall( DT & ) volatile void *   ?=?(       volatile void *          &,       volatile  DT * );
+forall( DT & ) volatile void *   ?=?(       volatile void * volatile &,       volatile  DT * );
+forall( DT & ) const volatile void * ?=?( const volatile void *      &,                 DT * );
+forall( DT & ) const volatile void * ?=?( const volatile void * volatile &,                     DT * );
+forall( DT & ) const volatile void * ?=?( const volatile void *      &, const           DT * );
+forall( DT & ) const volatile void * ?=?( const volatile void * volatile &, const               DT * );
+forall( DT & ) const volatile void * ?=?( const volatile void *      &,       volatile  DT * );
+forall( DT & ) const volatile void * ?=?( const volatile void * volatile &,           volatile  DT * );
+forall( DT & ) const volatile void * ?=?( const volatile void *      &, const volatile  DT * );
+forall( DT & ) const volatile void * ?=?( const volatile void * volatile &, const volatile      DT * );
 //forall( dtype DT ) DT *                       ?=?(                DT *          &, zero_t );
 //forall( dtype DT ) DT *                       ?=?(                DT * volatile &, zero_t );
 forall( dtype DT ) const DT *           ?=?( const          DT *          &, zero_t );
 forall( dtype DT ) const DT *           ?=?( const          DT * volatile &, zero_t );
+forall( DT & ) const DT *               ?=?( const          DT *          &, zero_t );
+forall( DT & ) const DT *               ?=?( const          DT * volatile &, zero_t );
 //forall( dtype DT ) volatile DT *      ?=?( volatile       DT *          &, zero_t );
 //forall( dtype DT ) volatile DT *      ?=?( volatile       DT * volatile &, zero_t );
 forall( dtype DT ) const volatile DT *  ?=?( const volatile DT *          &, zero_t );
 forall( dtype DT ) const volatile DT *  ?=?( const volatile DT * volatile &, zero_t );
+forall( DT & ) const volatile DT *      ?=?( const volatile DT *          &, zero_t );
+forall( DT & ) const volatile DT *      ?=?( const volatile DT * volatile &, zero_t );
 forall( ftype FT ) FT *                 ?=?( FT *          &, zero_t );
 forall( ftype FT ) FT *                 ?=?( FT * volatile &, zero_t );
 forall( dtype T | sized(T) ) T *                ?+=?(                T *          &, ptrdiff_t );
 forall( dtype T | sized(T) ) T *                ?+=?(                T * volatile &, ptrdiff_t );
 forall( dtype T | sized(T) ) const T *          ?+=?( const          T *          &, ptrdiff_t );
 forall( dtype T | sized(T) ) const T *          ?+=?( const          T * volatile &, ptrdiff_t );
 forall( dtype T | sized(T) ) volatile T *       ?+=?(       volatile T *          &, ptrdiff_t );
 forall( dtype T | sized(T) ) volatile T *       ?+=?(       volatile T * volatile &, ptrdiff_t );
 forall( dtype T | sized(T) ) const volatile T * ?+=?( const volatile T *          &, ptrdiff_t );
 forall( dtype T | sized(T) ) const volatile T * ?+=?( const volatile T * volatile &, ptrdiff_t );
 forall( dtype T | sized(T) ) T *                ?-=?(                T *          &, ptrdiff_t );
 forall( dtype T | sized(T) ) T *                ?-=?(                T * volatile &, ptrdiff_t );
 forall( dtype T | sized(T) ) const T *          ?-=?( const          T *          &, ptrdiff_t );
 forall( dtype T | sized(T) ) const T *          ?-=?( const          T * volatile &, ptrdiff_t );
 forall( dtype T | sized(T) ) volatile T *       ?-=?(       volatile T *          &, ptrdiff_t );
 forall( dtype T | sized(T) ) volatile T *       ?-=?(       volatile T * volatile &, ptrdiff_t );
 forall( dtype T | sized(T) ) const volatile T * ?-=?( const volatile T *          &, ptrdiff_t );
 forall( dtype T | sized(T) ) const volatile T * ?-=?( const volatile T * volatile &, ptrdiff_t );
+forall( T & | sized(T) ) T *            ?+=?(                T *          &, ptrdiff_t );
+forall( T & | sized(T) ) T *            ?+=?(                T * volatile &, ptrdiff_t );
+forall( T & | sized(T) ) const T *              ?+=?( const          T *          &, ptrdiff_t );
+forall( T & | sized(T) ) const T *              ?+=?( const          T * volatile &, ptrdiff_t );
+forall( T & | sized(T) ) volatile T *   ?+=?(       volatile T *          &, ptrdiff_t );
+forall( T & | sized(T) ) volatile T *   ?+=?(       volatile T * volatile &, ptrdiff_t );
+forall( T & | sized(T) ) const volatile T *     ?+=?( const volatile T *          &, ptrdiff_t );
+forall( T & | sized(T) ) const volatile T *     ?+=?( const volatile T * volatile &, ptrdiff_t );
+forall( T & | sized(T) ) T *            ?-=?(                T *          &, ptrdiff_t );
+forall( T & | sized(T) ) T *            ?-=?(                T * volatile &, ptrdiff_t );
+forall( T & | sized(T) ) const T *              ?-=?( const          T *          &, ptrdiff_t );
+forall( T & | sized(T) ) const T *              ?-=?( const          T * volatile &, ptrdiff_t );
+forall( T & | sized(T) ) volatile T *   ?-=?(       volatile T *          &, ptrdiff_t );
+forall( T & | sized(T) ) volatile T *   ?-=?(       volatile T * volatile &, ptrdiff_t );
+forall( T & | sized(T) ) const volatile T *     ?-=?( const volatile T *          &, ptrdiff_t );
+forall( T & | sized(T) ) const volatile T *     ?-=?( const volatile T * volatile &, ptrdiff_t );
 _Bool                   ?=?( _Bool &, _Bool ),                                  ?=?( volatile _Bool &, _Bool );
 …
 forall( ftype FT ) void ?{}( FT * volatile &, FT * );
 forall( dtype DT ) void ?{}(                 DT *          &,                   DT * );
 forall( dtype DT ) void ?{}( const           DT *          &,                   DT * );
 forall( dtype DT ) void ?{}( const           DT *          &, const             DT * );
 forall( dtype DT ) void ?{}(       volatile  DT *          &,                   DT * );
 forall( dtype DT ) void ?{}(       volatile  DT *          &,       volatile    DT * );
 forall( dtype DT ) void ?{}( const volatile  DT *          &,                   DT * );
 forall( dtype DT ) void ?{}( const volatile  DT *          &, const             DT * );
 forall( dtype DT ) void ?{}( const volatile  DT *          &,       volatile    DT * );
 forall( dtype DT ) void ?{}( const volatile  DT *          &, const volatile    DT * );
 forall( dtype DT ) void ?{}(                 void *          &,                 DT * );
 forall( dtype DT ) void ?{}( const           void *          &,                 DT * );
 forall( dtype DT ) void ?{}( const           void *          &, const           DT * );
 forall( dtype DT ) void ?{}(        volatile void *          &,                 DT * );
 forall( dtype DT ) void ?{}(        volatile void *          &,       volatile  DT * );
 forall( dtype DT ) void ?{}( const volatile void *           &,                 DT * );
 forall( dtype DT ) void ?{}( const volatile void *           &, const           DT * );
 forall( dtype DT ) void ?{}( const volatile void *           &,       volatile  DT * );
 forall( dtype DT ) void ?{}( const volatile void *           &, const volatile  DT * );
+forall( DT & ) void ?{}(                     DT *          &,                   DT * );
+forall( DT & ) void ?{}( const       DT *          &,                   DT * );
+forall( DT & ) void ?{}( const       DT *          &, const             DT * );
+forall( DT & ) void ?{}(           volatile  DT *          &,                   DT * );
+forall( DT & ) void ?{}(           volatile  DT *          &,       volatile    DT * );
+forall( DT & ) void ?{}( const volatile  DT *      &,                   DT * );
+forall( DT & ) void ?{}( const volatile  DT *      &, const             DT * );
+forall( DT & ) void ?{}( const volatile  DT *      &,       volatile    DT * );
+forall( DT & ) void ?{}( const volatile  DT *      &, const volatile    DT * );
+forall( DT & ) void ?{}(                     void *          &,                 DT * );
+forall( DT & ) void ?{}( const       void *          &,                 DT * );
+forall( DT & ) void ?{}( const       void *          &, const           DT * );
+forall( DT & ) void ?{}(            volatile void *          &,                 DT * );
+forall( DT & ) void ?{}(            volatile void *          &,       volatile  DT * );
+forall( DT & ) void ?{}( const volatile void *       &,                 DT * );
+forall( DT & ) void ?{}( const volatile void *       &, const           DT * );
+forall( DT & ) void ?{}( const volatile void *       &,       volatile  DT * );
+forall( DT & ) void ?{}( const volatile void *       &, const volatile  DT * );
 //forall( dtype DT ) void ?{}(              DT *          &, zero_t );
 //forall( dtype DT ) void ?{}(              DT * volatile &, zero_t );
 forall( dtype DT ) void ?{}( const          DT *          &, zero_t );
+forall( DT & ) void ?{}( const      DT *          &, zero_t );
 //forall( dtype DT ) void ?{}( volatile     DT *          &, zero_t );
 //forall( dtype DT ) void ?{}( volatile     DT * volatile &, zero_t );
 forall( dtype DT ) void ?{}( const volatile DT *          &, zero_t );
+forall( DT & ) void ?{}( const volatile DT *      &, zero_t );
 forall( ftype FT ) void ?{}( FT *          &, zero_t );
 …
 forall( ftype FT ) void ?{}( FT *          & );
 forall( dtype DT ) void ?{}(                 DT *          &);
 forall( dtype DT ) void ?{}( const           DT *          &);
 forall( dtype DT ) void ?{}(       volatile  DT *          &);
 forall( dtype DT ) void ?{}( const volatile  DT *          &);
+forall( DT & ) void     ?{}(                 DT *          &);
+forall( DT & ) void     ?{}( const           DT *          &);
+forall( DT & ) void     ?{}(       volatile  DT *          &);
+forall( DT & ) void ?{}( const volatile  DT *      &);
 void    ?{}(                void *          &);
 …
 forall( ftype FT ) void ^?{}( FT *         & );
 forall( dtype DT ) void ^?{}(                DT *          &);
 forall( dtype DT ) void ^?{}( const          DT *          &);
 forall( dtype DT ) void ^?{}(      volatile  DT *          &);
 forall( dtype DT ) void ^?{}( const volatile  DT *         &);
+forall( DT & ) void     ^?{}(                DT *          &);
+forall( DT & ) void     ^?{}( const          DT *          &);
+forall( DT & ) void     ^?{}(      volatile  DT *          &);
+forall( DT & ) void ^?{}( const volatile  DT *     &);
 void ^?{}(                  void *          &);

libcfa/prelude/sync-builtins.cf

-              r342af53
+              r8e4aa05
 _Bool __sync_bool_compare_and_swap(volatile unsigned __int128 *, unsigned __int128, unsigned __int128,...);
 #endif
 forall(dtype T) _Bool __sync_bool_compare_and_swap(T * volatile *, T *, T*, ...);
+forall(T &) _Bool __sync_bool_compare_and_swap(T * volatile *, T *, T*, ...);
 char __sync_val_compare_and_swap(volatile char *, char, char,...);
 …
 unsigned __int128 __sync_val_compare_and_swap(volatile unsigned __int128 *, unsigned __int128, unsigned __int128,...);
 #endif
 forall(dtype T) T * __sync_val_compare_and_swap(T * volatile *, T *, T*,...);
+forall(T &) T * __sync_val_compare_and_swap(T * volatile *, T *, T*,...);
 char __sync_lock_test_and_set(volatile char *, char,...);
 …
 void __atomic_exchange(volatile unsigned __int128 *, volatile unsigned __int128 *, volatile unsigned __int128 *, int);
 #endif
 forall(dtype T) T * __atomic_exchange_n(T * volatile *, T *, int);
 forall(dtype T) void __atomic_exchange(T * volatile *, T * volatile *, T * volatile *, int);
+forall(T &) T * __atomic_exchange_n(T * volatile *, T *, int);
+forall(T &) void __atomic_exchange(T * volatile *, T * volatile *, T * volatile *, int);
 _Bool __atomic_load_n(const volatile _Bool *, int);
 …
 void __atomic_load(const volatile unsigned __int128 *, volatile unsigned __int128 *, int);
 #endif
 forall(dtype T) T * __atomic_load_n(T * const volatile *, int);
 forall(dtype T) void __atomic_load(T * const volatile *, T **, int);
+forall(T &) T * __atomic_load_n(T * const volatile *, int);
+forall(T &) void __atomic_load(T * const volatile *, T **, int);
 _Bool __atomic_compare_exchange_n(volatile char *, char *, char, _Bool, int, int);
 …
 _Bool __atomic_compare_exchange   (volatile unsigned __int128 *, unsigned __int128 *, unsigned __int128 *, _Bool, int, int);
 #endif
 forall(dtype T) _Bool __atomic_compare_exchange_n (T * volatile *, T **, T*, _Bool, int, int);
 forall(dtype T) _Bool __atomic_compare_exchange   (T * volatile *, T **, T**, _Bool, int, int);
+forall(T &) _Bool __atomic_compare_exchange_n (T * volatile *, T **, T*, _Bool, int, int);
+forall(T &) _Bool __atomic_compare_exchange   (T * volatile *, T **, T**, _Bool, int, int);
 void __atomic_store_n(volatile _Bool *, _Bool, int);
 …
 void __atomic_store(volatile unsigned __int128 *, unsigned __int128 *, int);
 #endif
 forall(dtype T) void __atomic_store_n(T * volatile *, T *, int);
 forall(dtype T) void __atomic_store(T * volatile *, T **, int);
+forall(T &) void __atomic_store_n(T * volatile *, T *, int);
+forall(T &) void __atomic_store(T * volatile *, T **, int);
 char __atomic_add_fetch  (volatile char *, char, int);

libcfa/src/Makefile.am

r342af53	r8e4aa05
76	76	stdlib.hfa \
77	77	time.hfa \
	78	bits/weakso_locks.hfa \
78	79	containers/maybe.hfa \
79	80	containers/pair.hfa \

libcfa/src/bitmanip.hfa

-              r342af53
+              r8e4aa05
         unsigned long long int floor2( unsigned long long int n, unsigned long long int align ) { verify( is_pow2( align ) ); return n & -align; }
         // forall( otype T | { T ?&?( T, T ); T -?( T ); } )
+        // forall( T | { T ?&?( T, T ); T -?( T ); } )
         // T floor2( T n, T align ) { verify( is_pow2( align ) ); return n & -align; }
 …
         unsigned long long int ceiling2( unsigned long long int n, unsigned long long int align ) { verify( is_pow2( align ) ); return -floor2( -n, align ); }
         // forall( otype T | { T floor2( T, T ); T -?( T ); } )
+        // forall( T | { T floor2( T, T ); T -?( T ); } )
         // T ceiling2( T n, T align ) { verify( is_pow2( align ) ); return -floor2( -n, align ); }
 } // distribution

libcfa/src/bits/algorithm.hfa

-              r342af53
+              r8e4aa05
 #ifdef SAFE_SORT
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort2( T * arr );
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort3( T * arr );
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort4( T * arr );
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort5( T * arr );
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort6( T * arr );
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sortN( T * arr, size_t dim );
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort2( T * arr );
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort3( T * arr );
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort4( T * arr );
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort5( T * arr );
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort6( T * arr );
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sortN( T * arr, size_t dim );
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } )
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } )
 static inline void __libcfa_small_sort( T * arr, size_t dim ) {
         switch( dim ) {
 …
 #define SWAP(x,y) { T a = min(arr[x], arr[y]); T b = max(arr[x], arr[y]); arr[x] = a; arr[y] = b;}
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } )
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } )
 static inline void __libcfa_small_sort2( T * arr ) {
         SWAP(0, 1);
+}
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } )
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } )
 static inline void __libcfa_small_sort3( T * arr ) {
         SWAP(1, 2);
 …
+}
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } )
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } )
 static inline void __libcfa_small_sort4( T * arr ) {
         SWAP(0, 1);
 …
+}
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } )
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } )
 static inline void __libcfa_small_sort5( T * arr ) {
         SWAP(0, 1);
 …
+}
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } )
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } )
 static inline void __libcfa_small_sort6( T * arr ) {
         SWAP(1, 2);
 …
+}
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } )
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } )
 static inline void __libcfa_small_sortN( T * arr, size_t dim ) {
         int i, j;
 …
 static inline void __libcfa_small_sortN( void* * arr, size_t dim );
 forall( dtype T )
+forall( T & )
 static inline void __libcfa_small_sort( T* * arr, size_t dim ) {
         switch( dim ) {

libcfa/src/bits/collection.hfa

-              r342af53
+              r8e4aa05
+//
+// Cforall Version 1.0.0 Copyright (C) 2021 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// bits/collection.hfa -- PUBLIC
+// Intrusive singly-linked list
+//
+// Author           : Colby Alexander Parsons & Peter A. Buhr
+// Created On       : Thu Jan 21 19:46:50 2021
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
 #pragma once
-#include <stdio.h> // REMOVE THIS AFTER DEBUGGING
 struct Colable {
         struct Colable * next;                                                                          // next node in the list
+        // next node in the list
         // invariant: (next != 0) <=> listed()
+        struct Colable * next;
 };
 #ifdef __cforall
 …
         // // wrappers to make Collection have T
         // forall( dtype T ) {
+        // forall( T & ) {
         //      T *& Next( T * n ) {
         //              return (T *)Next( (Colable *)n );
 …
 } // distribution
 forall( dtype T | { T *& Next ( T * ); } ) {
+static inline forall( T & | { T *& Next ( T * ); } ) {
         bool listed( T * n ) {
                 return Next( n ) != 0p;
 …
         Collection & ?=?( const Collection & ) = void;          // no assignment
         void ?{}( Collection & collection ) with( collection ) {
+        void ?{}( Collection & collection ) with( collection ) {
                 root = 0p;
         } // post: empty()
 …
         } // post: elts = null
         forall( dtype T ) {
+        forall( T & ) {
                 T * Curr( ColIter & ci ) with( ci ) {
                         return (T *)curr;

libcfa/src/bits/containers.hfa

-              r342af53
+              r8e4aa05
 #ifdef __cforall
         forall(dtype T)
+        forall(T &)
 #else
         #define T void
 …
 #ifdef __cforall
         // forall(otype T | sized(T))
+        // forall(T | sized(T))
         // static inline void ?{}(__small_array(T) & this) {}
         forall(dtype T | sized(T))
+        forall(T & | sized(T))
         static inline T & ?[?]( __small_array(T) & this, __lock_size_t idx ) {
                 return ((typeof(this.data))this.data)[idx];
+        }
         forall(dtype T | sized(T))
+        forall(T & | sized(T))
         static inline T & ?[?]( const __small_array(T) & this, __lock_size_t idx ) {
                 return ((typeof(this.data))this.data)[idx];
+        }
         forall(dtype T)
+        forall(T &)
         static inline T * begin( const __small_array(T) & this ) {
                 return ((typeof(this.data))this.data);
+        }
         forall(dtype T | sized(T))
+        forall(T & | sized(T))
         static inline T * end( const __small_array(T) & this ) {
                 return ((typeof(this.data))this.data) + this.size;
 …
 #ifdef __cforall
         trait is_node(dtype T) {
+        trait is_node(T &) {
                 T *& get_next( T & );
         };
 …
 //-----------------------------------------------------------------------------
 #ifdef __cforall
         forall(dtype TYPE)
+        forall(TYPE &)
         #define T TYPE
 #else
 …
 #ifdef __cforall
         forall(dtype T)
+        forall(T &)
         static inline void ?{}( __stack(T) & this ) {
                 (this.top){ 0p };
+        }
         static inline forall( dtype T | is_node(T) ) {
+        static inline forall( T & | is_node(T) ) {
                 void push( __stack(T) & this, T * val ) {
                         verify( !get_next( *val ) );
 …
 //-----------------------------------------------------------------------------
 #ifdef __cforall
         forall(dtype TYPE)
+        forall(TYPE &)
         #define T TYPE
 #else
 …
 #ifdef __cforall
         static inline forall( dtype T | is_node(T) ) {
+        static inline forall( T & | is_node(T) ) {
                 void ?{}( __queue(T) & this ) with( this ) {
                         (this.head){ 1p };
 …
+                }
                 void append( __queue(T) & this, T * val ) with( this ) {
+                void append( __queue(T) & this, T * val ) with(this) {
                         verify(this.tail != 0p);
                         verify(*this.tail == 1p);
 …
                 T * peek( __queue(T) & this ) {
                         verify(*this.tail == 1p);
                         T * front = this.head;
                         if( front != 1p ) {
+                        T * frontnode = this.head;
+                        if( frontnode != 1p ) {
                                 verify(*this.tail == 1p);
                                 return front;
+                                return frontnode;
+                        }
                         verify(*this.tail == 1p);
 …
 //-----------------------------------------------------------------------------
 #ifdef __cforall
         forall(dtype TYPE)
+        forall(TYPE &)
         #define T TYPE
         #define __getter_t * [T * & next, T * & prev] ( T & )
 …
 #ifdef __cforall
         forall(dtype T )
+        forall(T & )
         static inline [void] ?{}( __dllist(T) & this, * [T * & next, T * & prev] ( T & ) __get ) {
                 (this.head){ 0p };
 …
         #define next 0
         #define prev 1
         static inline forall(dtype T) {
+        static inline forall(T &) {
                 void push_front( __dllist(T) & this, T & node ) with( this ) {
                         verify(__get);

libcfa/src/bits/defs.hfa

-              r342af53
+              r8e4aa05
 // file "LICENCE" distributed with Cforall.
 //
+// defs.hfa --
+// defs.hfa -- Common macros, functions and typedefs
+// Most files depend on them and they are always useful to have.
+//
+//  *** Must not contain code specific to libcfathread ***
 //
 // Author           : Thierry Delisle
 …
         #endif
+}
+// pause to prevent excess processor bus usage
+#if defined( __i386 ) || defined( __x86_64 )
+        #define Pause() __asm__ __volatile__ ( "pause" : : : )
+#elif defined( __ARM_ARCH )
+        #define Pause() __asm__ __volatile__ ( "YIELD" : : : )
+#else
+        #error unsupported architecture
+#endif
+#define CFA_IO_LAZY (1_l64u << 32_l64u)

libcfa/src/bits/locks.hfa

-              r342af53
+              r8e4aa05
 // file "LICENCE" distributed with Cforall.
 //
+// bits/locks.hfa -- Fast internal locks.
+// bits/locks.hfa -- Basic spinlocks that are reused in the system.
+// Used for locks that aren't specific to cforall threads and can be used anywhere
+//
+//  *** Must not contain code specific to libcfathread ***
 //
 // Author           : Thierry Delisle
 …
 #include "bits/defs.hfa"
 #include <assert.h>
-#ifdef __cforall
-        extern "C" {
-                #include <pthread.h>
+        }
-#endif
-// pause to prevent excess processor bus usage
-#if defined( __i386 ) || defined( __x86_64 )
-        #define Pause() __asm__ __volatile__ ( "pause" : : : )
-#elif defined( __ARM_ARCH )
-        #define Pause() __asm__ __volatile__ ( "YIELD" : : : )
-#else
-        #error unsupported architecture
-#endif
 struct __spinlock_t {
 …
                 enable_interrupts_noPoll();
+        }
-        #ifdef __CFA_WITH_VERIFY__
-                extern bool __cfaabi_dbg_in_kernel();
-        #endif
-        extern "C" {
-                char * strerror(int);
+        }
-        #define CHECKED(x) { int err = x; if( err != 0 ) abort("KERNEL ERROR: Operation \"" #x "\" return error %d - %s\n", err, strerror(err)); }
-        struct __bin_sem_t {
-                pthread_mutex_t         lock;
-                pthread_cond_t          cond;
-                int                     val;
-        };
-        static inline void ?{}(__bin_sem_t & this) with( this ) {
-                // Create the mutex with error checking
-                pthread_mutexattr_t mattr;
-                pthread_mutexattr_init( &mattr );
-                pthread_mutexattr_settype( &mattr, PTHREAD_MUTEX_ERRORCHECK_NP);
-                pthread_mutex_init(&lock, &mattr);
-                pthread_cond_init (&cond, (const pthread_condattr_t *)0p);  // workaround trac#208: cast should not be required
-                val = 0;
+        }
-        static inline void ^?{}(__bin_sem_t & this) with( this ) {
-                CHECKED( pthread_mutex_destroy(&lock) );
-                CHECKED( pthread_cond_destroy (&cond) );
+        }
-        static inline void wait(__bin_sem_t & this) with( this ) {
-                verify(__cfaabi_dbg_in_kernel());
-                CHECKED( pthread_mutex_lock(&lock) );
-                        while(val < 1) {
-                                pthread_cond_wait(&cond, &lock);
+                        }
-                        val -= 1;
-                CHECKED( pthread_mutex_unlock(&lock) );
+        }
-        static inline bool post(__bin_sem_t & this) with( this ) {
-                bool needs_signal = false;
-                CHECKED( pthread_mutex_lock(&lock) );
-                        if(val < 1) {
-                                val += 1;
-                                pthread_cond_signal(&cond);
-                                needs_signal = true;
+                        }
-                CHECKED( pthread_mutex_unlock(&lock) );
-                return needs_signal;
+        }
-        #undef CHECKED
-        struct $thread;
-        extern void park( void );
-        extern void unpark( struct $thread * this );
-        static inline struct $thread * active_thread ();
-        // Semaphore which only supports a single thread
-        struct single_sem {
-                struct $thread * volatile ptr;
-        };
-        static inline {
-                void  ?{}(single_sem & this) {
-                        this.ptr = 0p;
+                }
-                void ^?{}(single_sem &) {}
-                bool wait(single_sem & this) {
-                        for() {
-                                struct $thread * expected = this.ptr;
-                                if(expected == 1p) {
-                                        if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-                                                return false;
+                                        }
+                                }
-                                else {
-                                        /* paranoid */ verify( expected == 0p );
-                                        if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-                                                park();
-                                                return true;
+                                        }
+                                }
+                        }
+                }
-                bool post(single_sem & this) {
-                        for() {
-                                struct $thread * expected = this.ptr;
-                                if(expected == 1p) return false;
-                                if(expected == 0p) {
-                                        if(__atomic_compare_exchange_n(&this.ptr, &expected, 1p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-                                                return false;
+                                        }
+                                }
-                                else {
-                                        if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-                                                unpark( expected );
-                                                return true;
+                                        }
+                                }
+                        }
+                }
+        }
-        // Synchronization primitive which only supports a single thread and one post
-        // Similar to a binary semaphore with a 'one shot' semantic
-        // It is expected to be discarded after each party calls its side
-        struct oneshot {
-                // Internal state :
-                //     0p     : is initial state (wait will block)
-                //     1p     : fulfilled (wait won't block)
-                // any thread : a thread is currently waiting
-                struct $thread * volatile ptr;
-        };
-        static inline {
-                void  ?{}(oneshot & this) {
-                        this.ptr = 0p;
+                }
-                void ^?{}(oneshot &) {}
-                // Wait for the post, return immediately if it already happened.
-                // return true if the thread was parked
-                bool wait(oneshot & this) {
-                        for() {
-                                struct $thread * expected = this.ptr;
-                                if(expected == 1p) return false;
-                                /* paranoid */ verify( expected == 0p );
-                                if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-                                        park();
-                                        /* paranoid */ verify( this.ptr == 1p );
-                                        return true;
+                                }
+                        }
+                }
-                // Mark as fulfilled, wake thread if needed
-                // return true if a thread was unparked
-                bool post(oneshot & this) {
-                        struct $thread * got = __atomic_exchange_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
-                        if( got == 0p ) return false;
-                        unpark( got );
-                        return true;
+                }
+        }
-        // base types for future to build upon
-        // It is based on the 'oneshot' type to allow multiple futures
-        // to block on the same instance, permitting users to block a single
-        // thread on "any of" [a given set of] futures.
-        // does not support multiple threads waiting on the same future
-        struct future_t {
-                // Internal state :
-                //     0p      : is initial state (wait will block)
-                //     1p      : fulfilled (wait won't block)
-                //     2p      : in progress ()
-                //     3p      : abandoned, server should delete
-                // any oneshot : a context has been setup to wait, a thread could wait on it
-                struct oneshot * volatile ptr;
-        };
-        static inline {
-                void  ?{}(future_t & this) {
-                        this.ptr = 0p;
+                }
-                void ^?{}(future_t &) {}
-                void reset(future_t & this) {
-                        // needs to be in 0p or 1p
-                        __atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);
+                }
-                // check if the future is available
-                bool available( future_t & this ) {
-                        return this.ptr == 1p;
+                }
-                // Prepare the future to be waited on
-                // intended to be used by wait, wait_any, waitfor, etc. rather than used directly
-                bool setup( future_t & this, oneshot & wait_ctx ) {
-                        /* paranoid */ verify( wait_ctx.ptr == 0p );
-                        // The future needs to set the wait context
-                        for() {
-                                struct oneshot * expected = this.ptr;
-                                // Is the future already fulfilled?
-                                if(expected == 1p) return false; // Yes, just return false (didn't block)
-                                // The future is not fulfilled, try to setup the wait context
-                                /* paranoid */ verify( expected == 0p );
-                                if(__atomic_compare_exchange_n(&this.ptr, &expected, &wait_ctx, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-                                        return true;
+                                }
+                        }
+                }
-                // Stop waiting on a future
-                // When multiple futures are waited for together in "any of" pattern
-                // futures that weren't fulfilled before the thread woke up
-                // should retract the wait ctx
-                // intended to be used by wait, wait_any, waitfor, etc. rather than used directly
-                void retract( future_t & this, oneshot & wait_ctx ) {
-                        // Remove the wait context
-                        struct oneshot * got = __atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);
-                        // got == 0p: future was never actually setup, just return
-                        if( got == 0p ) return;
-                        // got == wait_ctx: since fulfil does an atomic_swap,
-                        // if we got back the original then no one else saw context
-                        // It is safe to delete (which could happen after the return)
-                        if( got == &wait_ctx ) return;
-                        // got == 1p: the future is ready and the context was fully consumed
-                        // the server won't use the pointer again
-                        // It is safe to delete (which could happen after the return)
-                        if( got == 1p ) return;
-                        // got == 2p: the future is ready but the context hasn't fully been consumed
-                        // spin until it is safe to move on
-                        if( got == 2p ) {
-                                while( this.ptr != 1p ) Pause();
-                                return;
+                        }
-                        // got == any thing else, something wen't wrong here, abort
-                        abort("Future in unexpected state");
+                }
-                // Mark the future as abandoned, meaning it will be deleted by the server
-                // Returns false when the future was still unfulfilled (got == 0p) and is left
-                // for fulfil() to free; returns true when it is freed here
-                bool abandon( future_t & this ) {
-                        /* paranoid */ verify( this.ptr != 3p );
-                        // Mark the future as abandoned
-                        struct oneshot * got = __atomic_exchange_n( &this.ptr, 3p, __ATOMIC_SEQ_CST);
-                        // If the future isn't already fulfilled, let the server delete it
-                        if( got == 0p ) return false;
-                        // got == 2p: the future is ready but the context hasn't fully been consumed
-                        // spin until it is safe to move on
-                        if( got == 2p ) {
-                                while( this.ptr != 1p ) Pause();
-                                got = 1p;
+                        }
-                        // The future is completed delete it now
-                        // NOTE(review): on the got == 2p path the spin above exits only once
-                        // this.ptr == 1p, so the check below looks contradictory there - confirm
-                        /* paranoid */ verify( this.ptr != 1p );
-                        free( &this );
-                        return true;
+                }
-                // from the server side, mark the future as fulfilled
-                // delete it if needed
-                // State transitions performed here:
-                //   3p (abandoned)        -> future is freed, returns false
-                //   0p (no waiter)        -> 1p, returns false
-                //   wait context pointer  -> 2p, post() the context, then 1p; returns post()'s result
-                bool fulfil( future_t & this ) {
-                        for() {
-                                struct oneshot * expected = this.ptr;
-                                // was this abandoned?
-                                #if defined(__GNUC__) && __GNUC__ >= 7
-                                        #pragma GCC diagnostic push
-                                        #pragma GCC diagnostic ignored "-Wfree-nonheap-object"
-                                #endif
-                                        if( expected == 3p ) { free( &this ); return false; }
-                                #if defined(__GNUC__) && __GNUC__ >= 7
-                                        #pragma GCC diagnostic pop
-                                #endif
-                                /* paranoid */ verify( expected != 1p ); // Future is already fulfilled, should not happen
-                                /* paranoid */ verify( expected != 2p ); // Future is being fulfilled by someone else, this is even less supported than the previous case.
-                                // If there is a wait context, we need to consume it and mark it as consumed after
-                                // If there is no context then we can skip the in progress phase
-                                struct oneshot * want = expected == 0p ? 1p : 2p;
-                                // On CAS failure, loop and re-read ptr
-                                if(__atomic_compare_exchange_n(&this.ptr, &expected, want, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-                                        if( expected == 0p ) { /* paranoid */ verify( this.ptr == 1p); return false; }
-                                        bool ret = post( *expected );
-                                        // Leave the in-progress state (2p) only after post() has returned
-                                        __atomic_store_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
-                                        return ret;
+                                }
+                        }
+                }
-                // Wait for the future to be fulfilled
-                // Returns false without blocking when setup() reports the future is already
-                // fulfilled; otherwise returns the result of waiting on the local oneshot
-                bool wait( future_t & this ) {
-                        oneshot temp;
-                        if( !setup(this, temp) ) return false;
-                        // Wait context is setup, just wait on it
-                        bool ret = wait( temp );
-                        // Wait for the fulfil in progress (ptr == 2p) to finish
-                        while( this.ptr == 2p ) Pause();
-                        // Make sure the state makes sense
-                        // Should be fulfilled, could be in progress but it's out of date if so
-                        // since if that is the case, the oneshot was fulfilled (unparking this thread)
-                        // and the oneshot should not be needed any more
-                        __attribute__((unused)) struct oneshot * was = this.ptr;
-                        /* paranoid */ verifyf( was == 1p, "Expected this.ptr to be 1p, was %p\n", was );
-                        // Mark the future as fulfilled, to be consistent
-                        // with potential calls to available
-                        // this.ptr = 1p;
-                        return ret;
+                }
+        }
 #endif

libcfa/src/bits/queue.hfa

-              r342af53
+              r8e4aa05
 // instead of being null.
 forall( dtype T | { T *& Next ( T * ); } ) {
+forall( T & | { T *& Next ( T * ); } ) {
         struct Queue {
                 inline Collection;                                                              // Plan 9 inheritance
 …
 } // distribution
 forall( dtype T | { T *& Next ( T * ); } ) {
+forall( T & | { T *& Next ( T * ); } ) {
         struct QueueIter {
                 inline ColIter;                                                                 // Plan 9 inheritance

libcfa/src/bits/sequence.hfa

-              r342af53
+              r8e4aa05
+//
+// Cforall Version 1.0.0 Copyright (C) 2021 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// bits/sequence.hfa -- PUBLIC
+// Intrusive doubly-linked list
+//
+// Author           : Colby Alexander Parsons & Peter A. Buhr
+// Created On       : Thu Jan 21 19:46:50 2021
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
 #pragma once
 …
 struct Seqable {
         __cfa_anonymous_object(Colable);
+        struct Seqable * back;                                                                          // pointer to previous node in the list
+        // pointer to previous node in the list
+        struct Seqable * back;
 };
 …
         // PUBLIC
         void ?{}( Seqable & sq ) with( sq ) {
+        void ?{}( Seqable & sq ) {
                 ((Colable &)sq){};
                 back = 0p;
+                sq.back = 0p;
         } // post: ! listed()
 …
                 return sq->back;
+        }
-        // // wrappers to make Collection have T
-        // forall( dtype T ) {
-        //      T *& Back( T * n ) {
-        //              return (T *)Back( (Seqable *)n );
-        //      }
-        // } // distribution
 } // distribution
 …
 // and the back field of the last node points at the first node (circular).
 forall( dtype T | { T *& Back ( T * ); T *& Next ( T * ); } ) {
+forall( T & ) {
         struct Sequence {
+                inline Collection;                                                              // Plan 9 inheritance
+                // Plan 9 inheritance
+                inline Collection;
         };
         static inline {
+                void ?{}( Sequence(T) &, const Sequence(T) & ) = void; // no copy
+                Sequence(T) & ?=?( const Sequence(T) & ) = void; // no assignment
+                void ?{}( Sequence(T) & s ) with( s ) {
+                        ((Collection &)s){};
+                }       // post: isEmpty()
+        }
+        static inline forall(| { T *& Back ( T * ); T *& Next ( T * ); }) {
                 // wrappers to make Collection have T
                 T & head( Sequence(T) & s ) with( s ) {
                         return *(T *)head( (Collection &)s );
                 } // post: empty() & head() == 0 | !empty() & head() in *s
-                void ?{}( Sequence(T) &, const Sequence(T) & ) = void; // no copy
-                Sequence(T) & ?=?( const Sequence(T) & ) = void; // no assignment
-                void ?{}( Sequence(T) & s ) with( s ) {
-                        ((Collection &)s){};
-                }       // post: isEmpty()
                 // Return a pointer to the last sequence element, without removing it.
 …
                         return n;
                 } // post: n->listed() & *n in *s & succ(n) == bef
                 // pre: n->listed() & *n in *s
                 T & remove( Sequence(T) & s, T & n ) with( s ) { // O(1)
 …
 } // distribution
 forall( dtype T | { T *& Back ( T * ); T *& Next ( T * ); } ) {
+forall( T & | { T *& Back ( T * ); T *& Next ( T * ); } ) {
         // SeqIter(T) is used to iterate over a Sequence(T) in head-to-tail order.
         struct SeqIter {
 …
         static inline {
                 void ?{}( SeqIterRev(T) & si ) with( si ) {
+                void ?{}( SeqIterRev(T) & si ) with( si ) {
                         ((ColIter &)si){};
                         seq = 0p;
 …
                 // Create a iterator active in sequence s.
                 void ?{}( SeqIterRev(T) & si, Sequence(T) & s ) with( si ) {
+                void ?{}( SeqIterRev(T) & si, Sequence(T) & s ) with( si ) {
                         ((ColIter &)si){};
                         seq = &s;
 …
                 } // post: elts = null
                 void ?{}( SeqIterRev(T) & si, Sequence(T) & s, T & start ) with( si ) {
+                void ?{}( SeqIterRev(T) & si, Sequence(T) & s, T & start ) with( si ) {
                         ((ColIter &)si){};
                         seq = &s;

libcfa/src/bits/stack.hfa

-              r342af53
+              r8e4aa05
 // instead of being null.
 forall( dtype T | { T *& Next ( T * ); } ) {
+forall( T & | { T *& Next ( T * ); } ) {
         struct Stack {
                 inline Collection;                                                              // Plan 9 inheritance
 …
 // order returned by drop().
 forall( dtype T | { T *& Next ( T * ); } ) {
+forall( T & | { T *& Next ( T * ); } ) {
         struct StackIter {
                 inline ColIter;                                                                 // Plan 9 inheritance

libcfa/src/common.cfa

r342af53	r8e4aa05
23	23	[ long int, long int ] div( long int num, long int denom ) { ldiv_t qr = ldiv( num, denom ); return [ qr.quot, qr.rem ]; }
24	24	[ long long int, long long int ] div( long long int num, long long int denom ) { lldiv_t qr = lldiv( num, denom ); return [ qr.quot, qr.rem ]; }
25		forall( ~~otype~~ T \| { T ?/?( T, T ); T ?%?( T, T ); } )
	25	forall( T \| { T ?/?( T, T ); T ?%?( T, T ); } )
26	26	[ T, T ] div( T num, T denom ) { return [ num / denom, num % denom ]; }
27	27

libcfa/src/common.hfa

-              r342af53
+              r8e4aa05
 [ long int, long int ] div( long int num, long int denom );
 [ long long int, long long int ] div( long long int num, long long int denom );
 forall( otype T | { T ?/?( T, T ); T ?%?( T, T ); } )
+forall( T | { T ?/?( T, T ); T ?%?( T, T ); } )
 [ T, T ] div( T num, T demon );
 …
 } // distribution
 forall( otype T | { void ?{}( T &, zero_t ); int ?<?( T, T ); T -?( T ); } )
+forall( T | { void ?{}( T &, zero_t ); int ?<?( T, T ); T -?( T ); } )
 T abs( T );
 …
         intptr_t min( intptr_t t1, intptr_t t2 ) { return t1 < t2 ? t1 : t2; } // optimization
         uintptr_t min( uintptr_t t1, uintptr_t t2 ) { return t1 < t2 ? t1 : t2; } // optimization
         forall( otype T | { int ?<?( T, T ); } )
+        forall( T | { int ?<?( T, T ); } )
         T min( T t1, T t2 ) { return t1 < t2 ? t1 : t2; }
 …
         intptr_t max( intptr_t t1, intptr_t t2 ) { return t1 > t2 ? t1 : t2; } // optimization
         uintptr_t max( uintptr_t t1, uintptr_t t2 ) { return t1 > t2 ? t1 : t2; } // optimization
         forall( otype T | { int ?>?( T, T ); } )
+        forall( T | { int ?>?( T, T ); } )
         T max( T t1, T t2 ) { return t1 > t2 ? t1 : t2; }
         forall( otype T | { T min( T, T ); T max( T, T ); } )
+        forall( T | { T min( T, T ); T max( T, T ); } )
         T clamp( T value, T min_val, T max_val ) { return max( min_val, min( value, max_val ) ); }
         forall( otype T )
+        forall( T )
         void swap( T & v1, T & v2 ) { T temp = v1; v1 = v2; v2 = temp; }
 } // distribution

libcfa/src/concurrency/coroutine.cfa

-              r342af53
+              r8e4aa05
 //-----------------------------------------------------------------------------
 FORALL_DATA_INSTANCE(CoroutineCancelled, (dtype coroutine_t), (coroutine_t))
 forall(dtype T)
+FORALL_DATA_INSTANCE(CoroutineCancelled, (coroutine_t &), (coroutine_t))
+forall(T &)
 void mark_exception(CoroutineCancelled(T) *) {}
 forall(dtype T)
+forall(T &)
 void copy(CoroutineCancelled(T) * dst, CoroutineCancelled(T) * src) {
         dst->virtual_table = src->virtual_table;
 …
+}
 forall(dtype T)
+forall(T &)
 const char * msg(CoroutineCancelled(T) *) {
         return "CoroutineCancelled(...)";
 …
 // This code should not be inlined. It is the error path on resume.
 forall(dtype T | is_coroutine(T))
+forall(T & | is_coroutine(T))
 void __cfaehm_cancelled_coroutine( T & cor, $coroutine * desc ) {
         verify( desc->cancellation );
 …
 // Part of the Public API
 // Not inline since only ever called once per coroutine
 forall(dtype T | is_coroutine(T))
+forall(T & | is_coroutine(T))
 void prime(T& cor) {
         $coroutine* this = get_coroutine(cor);
 …
 void __stack_clean  ( __stack_info_t * this ) {
-        size_t size = ((intptr_t)this->storage->base) - ((intptr_t)this->storage->limit) + sizeof(__stack_t);
         void * storage = this->storage->limit;
         #if CFA_COROUTINE_USE_MMAP
+                size_t size = ((intptr_t)this->storage->base) - ((intptr_t)this->storage->limit) + sizeof(__stack_t);
                 storage = (void *)(((intptr_t)storage) - __page_size);
                 if(munmap(storage, size + __page_size) == -1) {

libcfa/src/concurrency/coroutine.hfa

-              r342af53
+              r8e4aa05
 //-----------------------------------------------------------------------------
 // Exception thrown from resume when a coroutine stack is cancelled.
 FORALL_DATA_EXCEPTION(CoroutineCancelled, (dtype coroutine_t), (coroutine_t)) (
+FORALL_DATA_EXCEPTION(CoroutineCancelled, (coroutine_t &), (coroutine_t)) (
         coroutine_t * the_coroutine;
         exception_t * the_exception;
 );
 forall(dtype T)
+forall(T &)
 void copy(CoroutineCancelled(T) * dst, CoroutineCancelled(T) * src);
 forall(dtype T)
+forall(T &)
 const char * msg(CoroutineCancelled(T) *);
 …
 // Anything that implements this trait can be resumed.
 // Anything that is resumed is a coroutine.
 trait is_coroutine(dtype T | IS_RESUMPTION_EXCEPTION(CoroutineCancelled, (T))) {
+trait is_coroutine(T & | IS_RESUMPTION_EXCEPTION(CoroutineCancelled, (T))) {
         void main(T & this);
         $coroutine * get_coroutine(T & this);
 …
 //-----------------------------------------------------------------------------
 // Public coroutine API
 forall(dtype T | is_coroutine(T))
+forall(T & | is_coroutine(T))
 void prime(T & cor);
 …
         void __cfactx_invoke_coroutine(void (*main)(void *), void * this);
         forall(dtype T)
+        forall(T &)
         void __cfactx_start(void (*main)(T &), struct $coroutine * cor, T & this, void (*invoke)(void (*main)(void *), void *));
 …
+}
 forall(dtype T | is_coroutine(T))
+forall(T & | is_coroutine(T))
 void __cfaehm_cancelled_coroutine( T & cor, $coroutine * desc );
 // Resume implementation inlined for performance
 forall(dtype T | is_coroutine(T))
+forall(T & | is_coroutine(T))
 static inline T & resume(T & cor) {
         // optimization : read TLS once and reuse it

libcfa/src/concurrency/future.hfa

-              r342af53
+              r8e4aa05
 #include "monitor.hfa"
 forall( otype T ) {
+forall( T ) {
         struct future {
                 inline future_t;
 …
+}
 forall( otype T ) {
+forall( T ) {
         monitor multi_future {
                 inline future_t;

libcfa/src/concurrency/io.cfa

-              r342af53
+              r8e4aa05
         extern "C" {
                 #include <sys/syscall.h>
+                #include <sys/eventfd.h>
                 #include <linux/io_uring.h>
 …
         #include "io/types.hfa"
         static const char * opcodes[] = {
+        __attribute__((unused)) static const char * opcodes[] = {
                 "OP_NOP",
                 "OP_READV",
 …
         };
+        // returns true if acquired as leader or second leader
+        // Lock word values as used below:
+        //   0p            : unlocked
+        //   1p            : locked, no next leader registered
+        //   anything else : locked, value is the next leader's thread pointer with bit 0 set
+        // Returns false (interrupts re-enabled) when a next leader is already registered.
+        // On a true return interrupts are left disabled; a caller that had to queue as
+        // next leader is parked first and only returns after being unparked.
+        static inline bool try_lock( __leaderlock_t & this ) {
+                // Tag the current thread pointer with bit 0 so it can be stored in the lock word
+                const uintptr_t thrd = 1z | (uintptr_t)active_thread();
+                bool block;
+                disable_interrupts();
+                for() {
+                        struct $thread * expected = this.value;
+                        if( 1p != expected && 0p != expected ) {
+                                /* paranoid */ verify( thrd != (uintptr_t)expected ); // We better not already be the next leader
+                                enable_interrupts( __cfaabi_dbg_ctx );
+                                return false;
+                        }
+                        struct $thread * desired;
+                        if( 0p == expected ) {
+                                // If the lock isn't locked acquire it, no need to block
+                                desired = 1p;
+                                block = false;
+                        }
+                        else {
+                                // If the lock is already locked try becoming the next leader
+                                desired = (struct $thread *)thrd;
+                                block = true;
+                        }
+                        // On CAS failure, loop and re-evaluate against the fresh lock word
+                        if( __atomic_compare_exchange_n(&this.value, &expected, desired, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) ) break;
+                }
+                if( block ) {
+                        // Registered as next leader: park with interrupts enabled,
+                        // then disable them again before returning
+                        enable_interrupts( __cfaabi_dbg_ctx );
+                        park();
+                        disable_interrupts();
+                }
+                return true;
+        }
+        static inline bool next( __leaderlock_t & this ) {
+        static $io_context * __ioarbiter_allocate( $io_arbiter & mutex this, processor *, __u32 idxs[], __u32 want );
+        static void __ioarbiter_submit( $io_arbiter & mutex this, $io_context * , __u32 idxs[], __u32 have, bool lazy );
+        static void __ioarbiter_flush ( $io_arbiter & mutex this, $io_context * );
+        static inline void __ioarbiter_notify( $io_context & ctx );
+//=============================================================================================
+// I/O Polling
+//=============================================================================================
+        static inline unsigned __flush( struct $io_context & );
+        static inline __u32 __release_sqes( struct $io_context & );
+        void __cfa_io_drain( processor * proc ) {
                 /* paranoid */ verify( ! __preemption_enabled() );
+                struct $thread * nextt;
+                for() {
+                        struct $thread * expected = this.value;
+                        /* paranoid */ verify( (1 & (uintptr_t)expected) == 1 ); // The lock better be locked
+                        struct $thread * desired;
+                        if( 1p == expected ) {
+                                // No next leader, just unlock
+                                desired = 0p;
+                                nextt   = 0p;
+                        }
+                        else {
+                                // There is a next leader, remove but keep locked
+                                desired = 1p;
+                                nextt   = (struct $thread *)(~1z & (uintptr_t)expected);
+                        }
+                        if( __atomic_compare_exchange_n(&this.value, &expected, desired, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) ) break;
+                }
+                if(nextt) {
+                        unpark( nextt );
+                        enable_interrupts( __cfaabi_dbg_ctx );
+                        return true;
+                }
+                enable_interrupts( __cfaabi_dbg_ctx );
+                return false;
+        }
+//=============================================================================================
+// I/O Syscall
+//=============================================================================================
+        // Call io_uring_enter on ring.fd, but only when the ring's setup flags require it.
+        //   to_submit : number of entries to submit
+        //   get       : whether the caller also wants completions
+        // Returns 0 when no syscall was made, the syscall's result on success,
+        // and -1 when it failed with EAGAIN or EINTR. Any other errno aborts.
+        static int __io_uring_enter( struct __io_data & ring, unsigned to_submit, bool get ) {
+                bool need_sys_to_submit = false;
+                bool need_sys_to_complete = false;
+                unsigned flags = 0;
+                TO_SUBMIT:
+                if( to_submit > 0 ) {
+                        // Ring was not set up with SQPOLL: submitting always needs the syscall
+                        if( !(ring.ring_flags & IORING_SETUP_SQPOLL) ) {
+                                need_sys_to_submit = true;
+                                break TO_SUBMIT;
+                        }
+                        // SQPOLL ring: the syscall is only needed when the submit queue flags
+                        // request a wakeup
+                        if( (*ring.submit_q.flags) & IORING_SQ_NEED_WAKEUP ) {
+                                need_sys_to_submit = true;
+                                flags |= IORING_ENTER_SQ_WAKEUP;
+                        }
+                }
+                if( get && !(ring.ring_flags & IORING_SETUP_SQPOLL) ) {
+                        flags |= IORING_ENTER_GETEVENTS;
+                        // Only an IOPOLL ring forces a syscall just to collect completions
+                        if( (ring.ring_flags & IORING_SETUP_IOPOLL) ) {
+                                need_sys_to_complete = true;
+                        }
+                }
+                int ret = 0;
+                if( need_sys_to_submit || need_sys_to_complete ) {
+                        __cfadbg_print_safe(io_core, "Kernel I/O : IO_URING enter %d %u %u\n", ring.fd, to_submit, flags);
+                        ret = syscall( __NR_io_uring_enter, ring.fd, to_submit, 0, flags, (sigset_t *)0p, _NSIG / 8);
+                        if( ret < 0 ) {
+                                switch((int)errno) {
+                                case EAGAIN:
+                                case EINTR:
+                                        // Not fatal: report -1 to the caller
+                                        ret = -1;
+                                        break;
+                                default:
+                                        abort( "KERNEL ERROR: IO_URING SYSCALL - (%d) %s\n", (int)errno, strerror(errno) );
+                                }
+                        }
+                }
+                // Memory barrier
+                // (issued on every path, including when no syscall was made)
+                __atomic_thread_fence( __ATOMIC_SEQ_CST );
+                return ret;
+        }
+//=============================================================================================
+// I/O Polling
+//=============================================================================================
+        static unsigned __collect_submitions( struct __io_data & ring );
+        static __u32 __release_consumed_submission( struct __io_data & ring );
+        static inline void __clean( volatile struct io_uring_sqe * sqe );
+        // Process a single completion message from the io_uring
+        // This is NOT thread-safe
+        // cqe.user_data is cast back to the io_future_t pointer it carries,
+        // and that future is fulfilled with cqe.res.
+        static inline void process( volatile struct io_uring_cqe & cqe ) {
+                struct io_future_t * future = (struct io_future_t *)(uintptr_t)cqe.user_data;
+                __cfadbg_print_safe( io, "Kernel I/O : Syscall completed : cqe %p, result %d for %p\n", &cqe, cqe.res, future );
+                fulfil( *future, cqe.res );
+        }
+        static [int, bool] __drain_io( & struct __io_data ring ) {
+                /* paranoid */ verify( ! __preemption_enabled() );
+                unsigned to_submit = 0;
+                if( ring.poller_submits ) {
+                        // If the poller thread also submits, then we need to aggregate the submissions which are ready
+                        to_submit = __collect_submitions( ring );
+                }
+                int ret = __io_uring_enter(ring, to_submit, true);
+                if( ret < 0 ) {
+                        return [0, true];
+                }
+                // update statistics
+                if (to_submit > 0) {
+                        __STATS__( true,
+                                if( to_submit > 0 ) {
+                                        io.submit_q.submit_avg.rdy += to_submit;
+                                        io.submit_q.submit_avg.csm += ret;
+                                        io.submit_q.submit_avg.cnt += 1;
+                                }
+                        )
+                }
+                __atomic_thread_fence( __ATOMIC_SEQ_CST );
+                // Release the consumed SQEs
+                __release_consumed_submission( ring );
+                /* paranoid */ verify( proc );
+                /* paranoid */ verify( proc->io.ctx );
                 // Drain the queue
+                unsigned head = *ring.completion_q.head;
+                unsigned tail = *ring.completion_q.tail;
+                const __u32 mask = *ring.completion_q.mask;
+                // Nothing was new return 0
+                if (head == tail) {
+                        return [0, to_submit > 0];
+                }
+                $io_context * ctx = proc->io.ctx;
+                unsigned head = *ctx->cq.head;
+                unsigned tail = *ctx->cq.tail;
+                const __u32 mask = *ctx->cq.mask;
                 __u32 count = tail - head;
+                /* paranoid */ verify( count != 0 );
+                __STATS__( false, io.calls.drain++; io.calls.completed += count; )
                 for(i; count) {
                         unsigned idx = (head + i) & mask;
                         volatile struct io_uring_cqe & cqe = ring.completion_q.cqes[idx];
+                        volatile struct io_uring_cqe & cqe = ctx->cq.cqes[idx];
                         /* paranoid */ verify(&cqe);
+                        process( cqe );
+                }
+                        struct io_future_t * future = (struct io_future_t *)(uintptr_t)cqe.user_data;
+                        __cfadbg_print_safe( io, "Kernel I/O : Syscall completed : cqe %p, result %d for %p\n", &cqe, cqe.res, future );
+                        fulfil( *future, cqe.res );
+                }
+                __cfadbg_print_safe(io, "Kernel I/O : %u completed\n", count);
                 // Mark to the kernel that the cqe has been seen
                 // Ensure that the kernel only sees the new value of the head index after the CQEs have been read.
+                __atomic_fetch_add( ring.completion_q.head, count, __ATOMIC_SEQ_CST );
+                return [count, count > 0 || to_submit > 0];
+        }
+        void main( $io_ctx_thread & this ) {
+                __ioctx_register( this );
+                __cfadbg_print_safe(io_core, "Kernel I/O : IO poller %d (%p) ready\n", this.ring->fd, &this);
+                const int reset_cnt = 5;
+                int reset = reset_cnt;
+                // Then loop until we need to start
+                LOOP:
+                while(!__atomic_load_n(&this.done, __ATOMIC_SEQ_CST)) {
+                        // Drain the io
+                        int count;
+                        bool again;
+                        disable_interrupts();
+                                [count, again] = __drain_io( *this.ring );
+                                if(!again) reset--;
+                __atomic_store_n( ctx->cq.head, head + count, __ATOMIC_SEQ_CST );
+                /* paranoid */ verify( ! __preemption_enabled() );
+                return;
+        }
+        void __cfa_io_flush( processor * proc ) {
+                /* paranoid */ verify( ! __preemption_enabled() );
+                /* paranoid */ verify( proc );
+                /* paranoid */ verify( proc->io.ctx );
+                $io_context & ctx = *proc->io.ctx;
+                if(!ctx.ext_sq.empty) {
+                        __ioarbiter_flush( *ctx.arbiter, &ctx );
+                }
+                __STATS__( true, io.calls.flush++; )
+                int ret = syscall( __NR_io_uring_enter, ctx.fd, ctx.sq.to_submit, 0, 0, (sigset_t *)0p, _NSIG / 8);
+                if( ret < 0 ) {
+                        switch((int)errno) {
+                        case EAGAIN:
+                        case EINTR:
+                        case EBUSY:
                                 // Update statistics
+                                __STATS__( true,
+                                        io.complete_q.completed_avg.val += count;
+                                        io.complete_q.completed_avg.cnt += 1;
+                                )
+                        enable_interrupts( __cfaabi_dbg_ctx );
+                        // If we got something, just yield and check again
+                        if(reset > 1) {
+                                yield();
+                                continue LOOP;
+                                __STATS__( false, io.calls.errors.busy ++; )
+                                return;
+                        default:
+                                abort( "KERNEL ERROR: IO_URING SYSCALL - (%d) %s\n", (int)errno, strerror(errno) );
+                        }
+                        // We alread failed to find completed entries a few time.
+                        if(reset == 1) {
+                                // Rearm the context so it can block
+                                // but don't block right away
+                                // we need to retry one last time in case
+                                // something completed *just now*
+                                __ioctx_prepare_block( this );
+                                continue LOOP;
+                        }
+                                __STATS__( false,
+                                        io.complete_q.blocks += 1;
+                                )
+                                __cfadbg_print_safe(io_core, "Kernel I/O : Parking io poller %d (%p)\n", this.ring->fd, &this);
+                                // block this thread
+                                wait( this.sem );
+                        // restore counter
+                        reset = reset_cnt;
+                }
+                __cfadbg_print_safe(io_core, "Kernel I/O : Fast poller %d (%p) stopping\n", this.ring->fd, &this);
+                }
+                __cfadbg_print_safe(io, "Kernel I/O : %u submitted to io_uring %d\n", ret, ctx.fd);
+                __STATS__( true, io.calls.submitted += ret; )
+                /* paranoid */ verify( ctx.sq.to_submit <= *ctx.sq.num );
+                /* paranoid */ verify( ctx.sq.to_submit >= ret );
+                ctx.sq.to_submit -= ret;
+                /* paranoid */ verify( ctx.sq.to_submit <= *ctx.sq.num );
+                // Release the consumed SQEs
+                __release_sqes( ctx );
+                /* paranoid */ verify( ! __preemption_enabled() );
+                ctx.proc->io.pending = false;
+        }
 …
 //         head and tail must be fully filled and shouldn't ever be touched again.
 //
+        //=============================================================================================
+        // Allocation
+        // for user's convenience fill the sqes from the indexes
+        // Resolve each allocated index into a pointer to its sqe:
+        // for k in [0, want), out_sqes[k] receives the address of entry idxs[k] of ctx->sq.sqes
+        static inline void __fill(struct io_uring_sqe * out_sqes[], __u32 want, __u32 idxs[], struct $io_context * ctx)  {
+                struct io_uring_sqe * const base = ctx->sq.sqes;
+                for(__u32 k = 0; k < want; k++) {
+                        __cfadbg_print_safe(io, "Kernel I/O : filling loop\n");
+                        out_sqes[k] = base + idxs[k];
+                }
+        }
+        // Try to directly allocate from a given context
+        // Not thread-safe
+        static inline bool __alloc(struct $io_context * ctx, __u32 idxs[], __u32 want) {
+                __sub_ring_t & ring = ctx->sq;
+                const __u32 wrap = *ring.mask;
+                // Snapshot both ends of the free ring (head first, then tail)
+                const __u32 first = ring.free_ring.head;
+                const __u32 last  = ring.free_ring.tail;
+                // Number of indexes currently sitting in the free ring
+                const __u32 avail = last - first;
+                // Not enough free sqes: report failure without touching anything
+                if(avail < want) return false;
+                // Hand out the next 'want' indexes, wrapping with the ring mask
+                for(__u32 k = 0; k < want; k++) {
+                        __cfadbg_print_safe(io, "Kernel I/O : allocating loop\n");
+                        idxs[k] = ring.free_ring.array[(first + k) & wrap];
+                }
+                // Publish the new head so the copied indexes count as consumed
+                __atomic_store_n(&ring.free_ring.head, first + want, __ATOMIC_RELEASE);
+                return true;
+        }
         // Allocate a submit queue entry.
 …
         // for convenience, return both the index and the pointer to the sqe
         // sqe == &sqes[idx]
+        // Claim one free sqe of 'ring' on behalf of 'data' (which must be non-zero).
+        // Scans the *ring.submit_q.num entries starting at a thread_rand() offset and claims the
+        // first one whose user_data can be CASed from the value 3 to 'data'.
+        // Returns the claimed sqe together with its index.
+        [* volatile struct io_uring_sqe, __u32] __submit_alloc( struct __io_data & ring, __u64 data ) {
+                /* paranoid */ verify( data != 0 );
+                // Prepare the data we need
+                // len/block only feed the statistics below: entries skipped and full passes that failed
+                __attribute((unused)) int len   = 0;
+                __attribute((unused)) int block = 0;
+                __u32 cnt = *ring.submit_q.num;
+                __u32 mask = *ring.submit_q.mask;
+                __u32 off = thread_rand();
+                // Loop around looking for an available spot
+                for() {
+                        // Look through the list starting at some offset
+                        for(i; cnt) {
+                                // Value user_data is expected to hold when the entry is free
+                                __u64 expected = 3;
+                                __u32 idx = (i + off) & mask; // Get an index from a random
+                                volatile struct io_uring_sqe * sqe = &ring.submit_q.sqes[idx];
+                                volatile __u64 * udata = &sqe->user_data;
+                                // Allocate the entry by CASing the user_data field from 'expected' (3) to the future address
+                                // The plain read first skips the CAS on entries that are visibly in use
+                                if( *udata == expected &&
+                                        __atomic_compare_exchange_n( udata, &expected, data, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED ) )
+                                {
+                                        // update statistics
+                                        __STATS__( false,
+                                                io.submit_q.alloc_avg.val   += len;
+                                                io.submit_q.alloc_avg.block += block;
+                                                io.submit_q.alloc_avg.cnt   += 1;
+                                        )
+                                        // debug log
+                                        __cfadbg_print_safe( io, "Kernel I/O : allocated [%p, %u] for %p (%p)\n", sqe, idx, active_thread(), (void*)data );
+                                        // Success return the data
+                                        return [sqe, idx];
+                                }
+                                verify(expected != data);
+                                // This one was used
+                                len ++;
+                        }
+                        block++;
+                        // NOTE(review): abort() precedes yield(); presumably abort() does not return, which would
+                        // make the yield unreachable and the "yielding" wording misleading -- confirm intent
+                        abort( "Kernel I/O : all submit queue entries used, yielding\n" );
+                        yield();
+                }
+        }
+        // Store 'idx' into a slot of ring.submit_q.ready that currently holds the sentinel -1ul32.
+        // Slots are probed starting at a thread_rand() offset; when a full pass finds no such slot,
+        // consumed submissions are released and, if none were released, the thread yields before retrying.
+        // Returns the index of the ready slot that now holds 'idx'.
+        static inline __u32 __submit_to_ready_array( struct __io_data & ring, __u32 idx, const __u32 mask ) {
+                /* paranoid */ verify( idx <= mask   );
+                /* paranoid */ verify( idx != -1ul32 );
+                // We need to find a spot in the ready array
+                // len/block only feed the statistics below: slots skipped and full passes that failed
+                __attribute((unused)) int len   = 0;
+                __attribute((unused)) int block = 0;
+                // NOTE(review): masking with ready_cnt - 1 assumes ready_cnt is a power of two -- confirm
+                __u32 ready_mask = ring.submit_q.ready_cnt - 1;
+                __u32 off = thread_rand();
+                __u32 picked;
+                LOOKING: for() {
+                        for(i; ring.submit_q.ready_cnt) {
+                                picked = (i + off) & ready_mask;
+                                // A slot is claimable only while it holds the sentinel
+                                __u32 expected = -1ul32;
+                                if( __atomic_compare_exchange_n( &ring.submit_q.ready[picked], &expected, idx, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED ) ) {
+                                        break LOOKING;
+                                }
+                                verify(expected != idx);
+                                len ++;
+                        }
+                        block++;
+                        // No slot was claimed in this pass: try to release consumed entries, yield if nothing was released
+                        __u32 released = __release_consumed_submission( ring );
+                        if( released == 0 ) {
+                                yield();
+                        }
+                }
+                // update statistics
+                __STATS__( false,
+                        io.submit_q.look_avg.val   += len;
+                        io.submit_q.look_avg.block += block;
+                        io.submit_q.look_avg.cnt   += 1;
+                )
+                return picked;
+        }
+        void __submit( struct io_context * ctx, __u32 idx ) __attribute__((nonnull (1))) {
+                __io_data & ring = *ctx->thrd.ring;
+        // Allocate 'want' sqes for the caller.
+        //   sqes : filled with pointers to the allocated entries
+        //   idxs : filled with the indexes of the allocated entries
+        // First tries __alloc on the io context of the current processor with interrupts disabled;
+        // if that fails, falls back on the cluster's arbiter via __ioarbiter_allocate.
+        // Returns the context the entries were allocated from.
+        struct $io_context * cfa_io_allocate(struct io_uring_sqe * sqes[], __u32 idxs[], __u32 want) {
+                __cfadbg_print_safe(io, "Kernel I/O : attempting to allocate %u\n", want);
+                disable_interrupts();
+                processor * proc = __cfaabi_tls.this_processor;
+                $io_context * ctx = proc->io.ctx;
+                // NOTE(review): proc is dereferenced on the line above, before this check runs
+                /* paranoid */ verify( __cfaabi_tls.this_processor );
+                /* paranoid */ verify( ctx );
+                __cfadbg_print_safe(io, "Kernel I/O : attempting to fast allocation\n");
+                // We can proceed to the fast path
+                if( __alloc(ctx, idxs, want) ) {
+                        // Allocation was successful
+                        __STATS__( true, io.alloc.fast += 1; )
+                        enable_interrupts( __cfaabi_dbg_ctx );
+                        __cfadbg_print_safe(io, "Kernel I/O : fast allocation successful from ring %d\n", ctx->fd);
+                        // Convert the indexes into sqe pointers for the caller
+                        __fill( sqes, want, idxs, ctx );
+                        return ctx;
+                }
+                // The fast path failed, fallback
+                __STATS__( true, io.alloc.fail += 1; )
+                // Fast path failed, fallback on arbitration
+                __STATS__( true, io.alloc.slow += 1; )
+                enable_interrupts( __cfaabi_dbg_ctx );
+                $io_arbiter * ioarb = proc->cltr->io.arbiter;
+                /* paranoid */ verify( ioarb );
+                __cfadbg_print_safe(io, "Kernel I/O : falling back on arbiter for allocation\n");
+                // The arbiter returns the context the indexes were actually taken from
+                struct $io_context * ret = __ioarbiter_allocate(*ioarb, proc, idxs, want);
+                __cfadbg_print_safe(io, "Kernel I/O : slow allocation completed from ring %d\n", ret->fd);
+                __fill( sqes, want, idxs,ret );
+                return ret;
+        }
+        //=============================================================================================
+        // submission
+        // Publish 'have' sqe indexes on the kernel submission ring of 'ctx'.
+        //   idxs : indexes of the sqes to submit
+        //   have : number of valid entries in idxs
+        //   lazy : when false, flush right away; when true, flush only once more than 30 entries are pending
+        // Marks the owning processor's io state as pending and dirty.
+        static inline void __submit( struct $io_context * ctx, __u32 idxs[], __u32 have, bool lazy) {
+                // We can proceed to the fast path
+                // Get the right objects
+                __sub_ring_t & sq = ctx->sq;
+                const __u32 mask  = *sq.mask;
+                __u32 tail = *sq.kring.tail;
+                // Add the sqes to the array
+                for( i; have ) {
+                        __cfadbg_print_safe(io, "Kernel I/O : __submit loop\n");
+                        sq.kring.array[ (tail + i) & mask ] = idxs[i];
+                }
+                // Make the sqes visible to the submitter
+                __atomic_store_n(sq.kring.tail, tail + have, __ATOMIC_RELEASE);
+                // Count every entry just published: the flush path subtracts the number of
+                // entries consumed from to_submit, so it must track entries, not calls
+                sq.to_submit += have;
+                ctx->proc->io.pending = true;
+                ctx->proc->io.dirty   = true;
+                if(sq.to_submit > 30 || !lazy) {
+                        __cfa_io_flush( ctx->proc );
+                }
+        }
+        void cfa_io_submit( struct $io_context * inctx, __u32 idxs[], __u32 have, bool lazy ) __attribute__((nonnull (1))) {
+                __cfadbg_print_safe(io, "Kernel I/O : attempting to submit %u (%s)\n", have, lazy ? "lazy" : "eager");
+                disable_interrupts();
+                processor * proc = __cfaabi_tls.this_processor;
+                $io_context * ctx = proc->io.ctx;
+                /* paranoid */ verify( __cfaabi_tls.this_processor );
+                /* paranoid */ verify( ctx );
+                // Can we proceed to the fast path
+                if( ctx == inctx )              // We have the right instance?
+                {
+                        __attribute__((unused)) volatile struct io_uring_sqe * sqe = &ring.submit_q.sqes[idx];
+                        __cfadbg_print_safe( io,
+                                "Kernel I/O : submitting %u (%p) for %p\n"
+                                "    data: %p\n"
+                                "    opcode: %s\n"
+                                "    fd: %d\n"
+                                "    flags: %d\n"
+                                "    prio: %d\n"
+                                "    off: %p\n"
+                                "    addr: %p\n"
+                                "    len: %d\n"
+                                "    other flags: %d\n"
+                                "    splice fd: %d\n"
+                                "    pad[0]: %llu\n"
+                                "    pad[1]: %llu\n"
+                                "    pad[2]: %llu\n",
+                                idx, sqe,
+                                active_thread(),
+                                (void*)sqe->user_data,
+                                opcodes[sqe->opcode],
+                                sqe->fd,
+                                sqe->flags,
+                                sqe->ioprio,
+                                sqe->off,
+                                sqe->addr,
+                                sqe->len,
+                                sqe->accept_flags,
+                                sqe->splice_fd_in,
+                                sqe->__pad2[0],
+                                sqe->__pad2[1],
+                                sqe->__pad2[2]
+                        );
+                }
+                // Get now the data we definetely need
+                volatile __u32 * const tail = ring.submit_q.tail;
+                const __u32 mask  = *ring.submit_q.mask;
+                // There are 2 submission schemes, check which one we are using
+                if( ring.poller_submits ) {
+                        // If the poller thread submits, then we just need to add this to the ready array
+                        __submit_to_ready_array( ring, idx, mask );
+                        post( ctx->thrd.sem );
+                        __cfadbg_print_safe( io, "Kernel I/O : Added %u to ready for %p\n", idx, active_thread() );
+                }
+                else if( ring.eager_submits ) {
+                        __u32 picked = __submit_to_ready_array( ring, idx, mask );
+                        #if defined(LEADER_LOCK)
+                                if( !try_lock(ring.submit_q.submit_lock) ) {
+                                        __STATS__( false,
+                                                io.submit_q.helped += 1;
+                                        )
+                                        return;
+                                }
+                                /* paranoid */ verify( ! __preemption_enabled() );
+                                __STATS__( true,
+                                        io.submit_q.leader += 1;
+                                )
+                        #else
+                                for() {
+                                        yield();
+                                        if( try_lock(ring.submit_q.submit_lock __cfaabi_dbg_ctx2) ) {
+                                                __STATS__( false,
+                                                        io.submit_q.leader += 1;
+                                                )
+                                                break;
+                                        }
+                                        // If some one else collected our index, we are done
+                                        #warning ABA problem
+                                        if( ring.submit_q.ready[picked] != idx ) {
+                                                __STATS__( false,
+                                                        io.submit_q.helped += 1;
+                                                )
+                                                return;
+                                        }
+                                        __STATS__( false,
+                                                io.submit_q.busy += 1;
+                                        )
+                                }
+                        #endif
+                        // We got the lock
+                        // Collect the submissions
+                        unsigned to_submit = __collect_submitions( ring );
+                        // Actually submit
+                        int ret = __io_uring_enter( ring, to_submit, false );
+                        #if defined(LEADER_LOCK)
+                                /* paranoid */ verify( ! __preemption_enabled() );
+                                next(ring.submit_q.submit_lock);
+                        #else
+                                unlock(ring.submit_q.submit_lock);
+                        #endif
+                        if( ret < 0 ) {
+                                return;
+                        }
+                        // Release the consumed SQEs
+                        __release_consumed_submission( ring );
+                        // update statistics
+                        __STATS__( false,
+                                io.submit_q.submit_avg.rdy += to_submit;
+                                io.submit_q.submit_avg.csm += ret;
+                                io.submit_q.submit_avg.cnt += 1;
+                        )
+                        __cfadbg_print_safe( io, "Kernel I/O : submitted %u (among %u) for %p\n", idx, ret, active_thread() );
+                }
+                else
+                {
+                        // get mutual exclusion
+                        #if defined(LEADER_LOCK)
+                                while(!try_lock(ring.submit_q.submit_lock));
+                        #else
+                                lock(ring.submit_q.submit_lock __cfaabi_dbg_ctx2);
+                        #endif
+                        /* paranoid */ verifyf( ring.submit_q.sqes[ idx ].user_data != 3ul64,
+                        /* paranoid */  "index %u already reclaimed\n"
+                        /* paranoid */  "head %u, prev %u, tail %u\n"
+                        /* paranoid */  "[-0: %u,-1: %u,-2: %u,-3: %u]\n",
+                        /* paranoid */  idx,
+                        /* paranoid */  *ring.submit_q.head, ring.submit_q.prev_head, *tail
+                        /* paranoid */  ,ring.submit_q.array[ ((*ring.submit_q.head) - 0) & (*ring.submit_q.mask) ]
+                        /* paranoid */  ,ring.submit_q.array[ ((*ring.submit_q.head) - 1) & (*ring.submit_q.mask) ]
+                        /* paranoid */  ,ring.submit_q.array[ ((*ring.submit_q.head) - 2) & (*ring.submit_q.mask) ]
+                        /* paranoid */  ,ring.submit_q.array[ ((*ring.submit_q.head) - 3) & (*ring.submit_q.mask) ]
+                        /* paranoid */ );
+                        // Append to the list of ready entries
+                        /* paranoid */ verify( idx <= mask );
+                        ring.submit_q.array[ (*tail) & mask ] = idx;
+                        __atomic_fetch_add(tail, 1ul32, __ATOMIC_SEQ_CST);
+                        // Submit however, many entries need to be submitted
+                        int ret = __io_uring_enter( ring, 1, false );
+                        if( ret < 0 ) {
+                                switch((int)errno) {
+                                default:
+                                        abort( "KERNEL ERROR: IO_URING SUBMIT - %s\n", strerror(errno) );
+                                }
+                        }
+                        /* paranoid */ verify(ret == 1);
+                        // update statistics
+                        __STATS__( false,
+                                io.submit_q.submit_avg.csm += 1;
+                                io.submit_q.submit_avg.cnt += 1;
+                        )
+                        {
+                                __attribute__((unused)) volatile __u32 * const head = ring.submit_q.head;
+                                __attribute__((unused)) __u32 last_idx = ring.submit_q.array[ ((*head) - 1) & mask ];
+                                __attribute__((unused)) volatile struct io_uring_sqe * sqe = &ring.submit_q.sqes[last_idx];
+                                __cfadbg_print_safe( io,
+                                        "Kernel I/O : last submitted is %u (%p)\n"
+                                        "    data: %p\n"
+                                        "    opcode: %s\n"
+                                        "    fd: %d\n"
+                                        "    flags: %d\n"
+                                        "    prio: %d\n"
+                                        "    off: %p\n"
+                                        "    addr: %p\n"
+                                        "    len: %d\n"
+                                        "    other flags: %d\n"
+                                        "    splice fd: %d\n"
+                                        "    pad[0]: %llu\n"
+                                        "    pad[1]: %llu\n"
+                                        "    pad[2]: %llu\n",
+                                        last_idx, sqe,
+                                        (void*)sqe->user_data,
+                                        opcodes[sqe->opcode],
+                                        sqe->fd,
+                                        sqe->flags,
+                                        sqe->ioprio,
+                                        sqe->off,
+                                        sqe->addr,
+                                        sqe->len,
+                                        sqe->accept_flags,
+                                        sqe->splice_fd_in,
+                                        sqe->__pad2[0],
+                                        sqe->__pad2[1],
+                                        sqe->__pad2[2]
+                                );
+                        }
+                        __atomic_thread_fence( __ATOMIC_SEQ_CST );
+                        // Release the consumed SQEs
+                        __release_consumed_submission( ring );
+                        // ring.submit_q.sqes[idx].user_data = 3ul64;
+                        #if defined(LEADER_LOCK)
+                                next(ring.submit_q.submit_lock);
+                        #else
+                                unlock(ring.submit_q.submit_lock);
+                        #endif
+                        __cfadbg_print_safe( io, "Kernel I/O : submitted %u for %p\n", idx, active_thread() );
+                }
+        }
+        // #define PARTIAL_SUBMIT 32
+        // go through the list of submissions in the ready array and moved them into
+        // the ring's submit queue
+        // Drain ring.submit_q.ready: every slot holding a real index is swapped back to the
+        // sentinel -1ul32 and its index appended to ring.submit_q.array after the current tail.
+        // The tail is then advanced by the number of indexes moved, which is also the return value.
+        static unsigned __collect_submitions( struct __io_data & ring ) {
+                /* paranoid */ verify( ring.submit_q.ready != 0p );
+                /* paranoid */ verify( ring.submit_q.ready_cnt > 0 );
+                unsigned to_submit = 0;
+                __u32 tail = *ring.submit_q.tail;
+                const __u32 mask = *ring.submit_q.mask;
+                #if defined(PARTIAL_SUBMIT)
+                        #if defined(LEADER_LOCK)
+                                #error PARTIAL_SUBMIT and LEADER_LOCK cannot co-exist
+                        #endif
+                        // Scan at most PARTIAL_SUBMIT slots, resuming where the previous call stopped
+                        const __u32 cnt = ring.submit_q.ready_cnt > PARTIAL_SUBMIT ? PARTIAL_SUBMIT : ring.submit_q.ready_cnt;
+                        const __u32 offset = ring.submit_q.prev_ready;
+                        ring.submit_q.prev_ready += cnt;
+                #else
+                        // Scan the whole ready array from the start
+                        const __u32 cnt = ring.submit_q.ready_cnt;
+                        const __u32 offset = 0;
+                #endif
+                // Go through the list of ready submissions
+                for( c; cnt ) {
+                        __u32 i = (offset + c) % ring.submit_q.ready_cnt;
+                        // replace any submission with the sentinel, to consume it.
+                        __u32 idx = __atomic_exchange_n( &ring.submit_q.ready[i], -1ul32, __ATOMIC_RELAXED);
+                        // If it was already the sentinel, the slot was empty: skip it
+                        if( idx == -1ul32 ) continue;
+                        // If we got a real submission, append it to the list
+                        ring.submit_q.array[ (tail + to_submit) & mask ] = idx & mask;
+                        to_submit++;
+                }
+                // Increment the tail based on how many we are ready to submit
+                __atomic_fetch_add(ring.submit_q.tail, to_submit, __ATOMIC_SEQ_CST);
+                return to_submit;
+        }
+                        __submit(ctx, idxs, have, lazy);
+                        // Mark the instance as no longer in-use, re-enable interrupts and return
+                        __STATS__( true, io.submit.fast += 1; )
+                        enable_interrupts( __cfaabi_dbg_ctx );
+                        __cfadbg_print_safe(io, "Kernel I/O : submitted on fast path\n");
+                        return;
+                }
+                // Fast path failed, fallback on arbitration
+                __STATS__( true, io.submit.slow += 1; )
+                enable_interrupts( __cfaabi_dbg_ctx );
+                __cfadbg_print_safe(io, "Kernel I/O : falling back on arbiter for submission\n");
+                __ioarbiter_submit(*inctx->arbiter, inctx, idxs, have, lazy);
+        }
+        //=============================================================================================
+        // Flushing
         // Go through the ring's submit queue and release everything that has already been consumed
         // by io_uring
+        static __u32 __release_consumed_submission( struct __io_data & ring ) {
+                const __u32 smask = *ring.submit_q.mask;
+                // We need to get the lock to copy the old head and new head
+                if( !try_lock(ring.submit_q.release_lock __cfaabi_dbg_ctx2) ) return 0;
+        // This cannot be done by multiple threads
+        static __u32 __release_sqes( struct $io_context & ctx ) {
+                const __u32 mask = *ctx.sq.mask;
                 __attribute__((unused))
                 __u32 ctail = *ring.submit_q.tail;        // get the current tail of the queue
                 __u32 chead = *ring.submit_q.head;              // get the current head of the queue
                 __u32 phead = ring.submit_q.prev_head;  // get the head the last time we were here
+                ring.submit_q.prev_head = chead;                // note up to were we processed
                 unlock(ring.submit_q.release_lock);
+                __u32 ctail = *ctx.sq.kring.tail;    // get the current tail of the queue
+                __u32 chead = *ctx.sq.kring.head;        // get the current head of the queue
+                __u32 phead = ctx.sq.kring.released; // get the head the last time we were here
+                __u32 ftail = ctx.sq.free_ring.tail;  // get the current tail of the queue
                 // the 3 fields are organized like this diagram
 …
                 __u32 count = chead - phead;
+                if(count == 0) {
+                        return 0;
+                }
                 // We acquired an previous-head/current-head range
                 // go through the range and release the sqes
                 for( i; count ) {
+                        __u32 idx = ring.submit_q.array[ (phead + i) & smask ];
+                        /* paranoid */ verify( 0 != ring.submit_q.sqes[ idx ].user_data );
+                        __clean( &ring.submit_q.sqes[ idx ] );
+                }
+                        __cfadbg_print_safe(io, "Kernel I/O : release loop\n");
+                        __u32 idx = ctx.sq.kring.array[ (phead + i) & mask ];
+                        ctx.sq.free_ring.array[ (ftail + i) & mask ] = idx;
+                }
+                ctx.sq.kring.released = chead;          // note up to were we processed
+                __atomic_store_n(&ctx.sq.free_ring.tail, ftail + count, __ATOMIC_SEQ_CST);
+                __ioarbiter_notify(ctx);
                 return count;
+        }
+        // Non-static entry point that simply forwards to the static inline __clean
+        void __sqe_clean( volatile struct io_uring_sqe * sqe ) {
+                __clean( sqe );
+        }
+        static inline void __clean( volatile struct io_uring_sqe * sqe ) {
+                // If we are in debug mode, thrash the fields to make sure we catch reclamation errors
+                __cfaabi_dbg_debug_do(
+                        memset(sqe, 0xde, sizeof(*sqe));
+                        sqe->opcode = (sizeof(opcodes) / sizeof(const char *)) - 1;
+                );
+                // Mark the entry as unused
+                __atomic_store_n(&sqe->user_data, 3ul64, __ATOMIC_SEQ_CST);
+//=============================================================================================
+// I/O Arbiter
+//=============================================================================================
+        // Slow-path allocation through the arbiter.
+        // Raises this.pending.flag, then blocks on this.pending.blocked with 'want' attached to the wait.
+        // Once resumed, takes 'want' indexes from this.pending.ctx into idxs and returns that context.
+        // NOTE(review): 'proc' is not used in this body
+        static $io_context * __ioarbiter_allocate( $io_arbiter & mutex this, processor * proc, __u32 idxs[], __u32 want ) {
+                __cfadbg_print_safe(io, "Kernel I/O : arbiter allocating\n");
+                __STATS__( false, io.alloc.block += 1; )
+                // No one has any resources left, wait for something to finish
+                // Mark as pending
+                __atomic_store_n( &this.pending.flag, true, __ATOMIC_SEQ_CST );
+                // Wait for our turn to submit
+                wait( this.pending.blocked, want );
+                // The allocation is expected to succeed here; the result is only checked by verify
+                __attribute((unused)) bool ret =
+                __alloc( this.pending.ctx, idxs, want);
+                /* paranoid */ verify( ret );
+                return this.pending.ctx;
+        }
+        // Hand freshly released sqes of 'ctx' to the allocators blocked on the arbiter.
+        // Records 'ctx' as the context to allocate from, then wakes waiters in order for as long as
+        // the free ring of 'ctx' holds at least as many entries as the front waiter asked for.
+        // The pending flag is cleared only once no waiter remains.
+        static void __ioarbiter_notify( $io_arbiter & mutex this, $io_context * ctx ) {
+                /* paranoid */ verify( !is_empty(this.pending.blocked) );
+                this.pending.ctx = ctx;
+                while( !is_empty(this.pending.blocked) ) {
+                        __cfadbg_print_safe(io, "Kernel I/O : notifying\n");
+                        // Entries currently available in the free ring
+                        __u32 have = ctx->sq.free_ring.tail - ctx->sq.free_ring.head;
+                        // Entries requested by the waiter at the front
+                        __u32 want = front( this.pending.blocked );
+                        // Not enough for the front waiter: leave it blocked (and the flag raised);
+                        // waking it would make its __alloc fail
+                        if( have < want ) return;
+                        signal_block( this.pending.blocked );
+                }
+                this.pending.flag = false;
+        }
+        // Forward to the arbiter of 'ctx', but only when its pending flag is raised
+        static void __ioarbiter_notify( $io_context & ctx ) {
+                // Nothing is pending on the arbiter: nothing to do
+                if( !__atomic_load_n( &ctx.arbiter->pending.flag, __ATOMIC_SEQ_CST) ) return;
+                __ioarbiter_notify( *ctx.arbiter, &ctx );
+        }
+        // Simply append to the pending
+        // Slow-path submission through the arbiter of 'ctx'.
+        // Clears ctx->ext_sq.empty, blocks on ctx->ext_sq.blocked, and once resumed
+        // submits the 'have' indexes of idxs on 'ctx' with the given laziness.
+        static void __ioarbiter_submit( $io_arbiter & mutex this, $io_context * ctx, __u32 idxs[], __u32 have, bool lazy ) {
+                __cfadbg_print_safe(io, "Kernel I/O : submitting %u from the arbiter to context %u\n", have, ctx->fd);
+                /* paranoid */ verify( &this == ctx->arbiter );
+                // Mark as pending
+                __atomic_store_n( &ctx->ext_sq.empty, false, __ATOMIC_SEQ_CST );
+                __cfadbg_print_safe(io, "Kernel I/O : waiting to submit %u\n", have);
+                // Wait for our turn to submit
+                // (__ioarbiter_flush is what signals this condition)
+                wait( ctx->ext_sq.blocked );
+                // Submit our indexes
+                __submit(ctx, idxs, have, lazy);
+                __cfadbg_print_safe(io, "Kernel I/O : %u submitted from arbiter\n", have);
+        }
+        // Wake every external submitter blocked on 'ctx', then mark its external queue as empty
+        static void __ioarbiter_flush( $io_arbiter & mutex this, $io_context * ctx ) {
+                /* paranoid */ verify( &this == ctx->arbiter );
+                __STATS__( false, io.flush.external += 1; )
+                __cfadbg_print_safe(io, "Kernel I/O : arbiter flushing\n");
+                condition & waiters = ctx->ext_sq.blocked;
+                /* paranoid */ verify( ctx->ext_sq.empty == is_empty( waiters ) );
+                // Signal the blocked submitters one at a time until none remain
+                for() {
+                        if( is_empty( waiters ) ) break;
+                        signal_block( waiters );
+                }
+                ctx->ext_sq.empty = true;
+        }
 #endif

libcfa/src/concurrency/io/call.cfa.in

-              r342af53
+              r8e4aa05
                         | IOSQE_IO_DRAIN
                 #endif
+                #if defined(CFA_HAVE_IOSQE_IO_LINK)
+                        | IOSQE_IO_LINK
+                #endif
+                #if defined(CFA_HAVE_IOSQE_IO_HARDLINK)
+                        | IOSQE_IO_HARDLINK
+                #endif
                 #if defined(CFA_HAVE_IOSQE_ASYNC)
                         | IOSQE_ASYNC
                 #endif
+        ;
+        static const __u32 LINK_FLAGS = 0
+                #if defined(CFA_HAVE_IOSQE_IO_LINK)
+                        | IOSQE_IO_LINK
+                #endif
+                #if defined(CFA_HAVE_IOSQE_IO_HARDLINK)
+                        | IOSQE_IO_HARDLINK
+                #if defined(CFA_HAVE_IOSQE_BUFFER_SELECTED)
+                        | IOSQE_BUFFER_SELECTED
                 #endif
+        ;
 …
+        ;
+        extern [* volatile struct io_uring_sqe, __u32] __submit_alloc( struct __io_data & ring, __u64 data );
+        extern void __submit( struct io_context * ctx, __u32 idx ) __attribute__((nonnull (1)));
+        // Pick one of the active cluster's io contexts using thread_rand().
+        // Requires the cluster to have at least one context (cltr->io.cnt > 0).
+        // NOTE: the doubled backslashes in the messages are escapes for the enclosing generator template.
+        static inline io_context * __get_io_context( void ) {
+                cluster * cltr = active_cluster();
+                /* paranoid */ verifyf( cltr, "No active cluster for io operation\\n");
+                assertf( cltr->io.cnt > 0, "Cluster %p has no default io contexts and no context was specified\\n", cltr );
+                /* paranoid */ verifyf( cltr->io.ctxs, "default io contexts for cluster %p are missing\\n", cltr);
+                // Index is reduced modulo the number of contexts
+                return &cltr->io.ctxs[ thread_rand() % cltr->io.cnt ];
+        }
+        extern struct $io_context * cfa_io_allocate(struct io_uring_sqe * out_sqes[], __u32 out_idxs[], __u32 want)  __attribute__((nonnull (1,2)));
+        extern void cfa_io_submit( struct $io_context * in_ctx, __u32 in_idxs[], __u32 have, bool lazy ) __attribute__((nonnull (1,2)));
 #endif
 …
 extern "C" {
         #include <sys/types.h>
+        #include <asm/types.h>
         #include <sys/socket.h>
         #include <sys/syscall.h>
 …
         extern int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
         extern ssize_t splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags);
+        extern ssize_t splice(int fd_in, __off64_t *off_in, int fd_out, __off64_t *off_out, size_t len, unsigned int flags);
         extern ssize_t tee(int fd_in, int fd_out, size_t len, unsigned int flags);
+}
 …
                 return ', '.join(args_a)
 AsyncTemplate = """inline void async_{name}(io_future_t & future, {params}, int submit_flags, io_cancellation * cancellation, io_context * context) {{
+AsyncTemplate = """inline void async_{name}(io_future_t & future, {params}, __u64 submit_flags) {{
         #if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_{op})
                 ssize_t res = {name}({args});
 …
                 }}
         #else
-                // we don't support LINK yet
-                if( 0 != (submit_flags & LINK_FLAGS) ) {{
-                        errno = ENOTSUP; return -1;
-                }}
-                if( !context ) {{
-                        context = __get_io_context();
-                }}
-                if(cancellation) {{
-                        cancellation->target = (__u64)(uintptr_t)&future;
-                }}
                 __u8 sflags = REGULAR_FLAGS & submit_flags;
-                struct __io_data & ring = *context->thrd.ring;
                 __u32 idx;
                 struct io_uring_sqe * sqe;
                 [(volatile struct io_uring_sqe *) sqe, idx] = __submit_alloc( ring, (__u64)(uintptr_t)&future );
+                struct $io_context * ctx = cfa_io_allocate( &sqe, &idx, 1 );
                 sqe->opcode = IORING_OP_{op};
+                sqe->user_data = (__u64)(uintptr_t)&future;
                 sqe->flags = sflags;
                 sqe->ioprio = 0;
 …
                 verify( sqe->user_data == (__u64)(uintptr_t)&future );
                 __submit( context, idx );
+                cfa_io_submit( ctx, &idx, 1, 0 != (submit_flags & CFA_IO_LAZY) );
         #endif
 }}"""
+SyncTemplate = """{ret} cfa_{name}({params}, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context) {{
+        if( timeout >= 0 ) {{
+                errno = ENOTSUP;
+                return -1;
+        }}
+SyncTemplate = """{ret} cfa_{name}({params}, __u64 submit_flags) {{
         io_future_t future;
         async_{name}( future, {args}, submit_flags, cancellation, context );
+        async_{name}( future, {args}, submit_flags );
         wait( future );
 …
         }),
         # CFA_HAVE_IORING_OP_SPLICE
         Call('SPLICE', 'ssize_t splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags)', {
+        Call('SPLICE', 'ssize_t splice(int fd_in, __off64_t *off_in, int fd_out, __off64_t *off_out, size_t len, unsigned int flags)', {
                 'splice_fd_in': 'fd_in',
                 'splice_off_in': 'off_in ? (__u64)*off_in : (__u64)-1',
 …
         if c.define:
                 print("""#if defined({define})
         {ret} cfa_{name}({params}, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+        {ret} cfa_{name}({params}, __u64 submit_flags);
 #endif""".format(define=c.define,ret=c.ret, name=c.name, params=c.params))
         else:
                 print("{ret} cfa_{name}({params}, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);"
+                print("{ret} cfa_{name}({params}, __u64 submit_flags);"
                 .format(ret=c.ret, name=c.name, params=c.params))
 …
         if c.define:
                 print("""#if defined({define})
         void async_{name}(io_future_t & future, {params}, int submit_flags, io_cancellation * cancellation, io_context * context);
+        void async_{name}(io_future_t & future, {params}, __u64 submit_flags);
 #endif""".format(define=c.define,name=c.name, params=c.params))
         else:
                 print("void async_{name}(io_future_t & future, {params}, int submit_flags, io_cancellation * cancellation, io_context * context);"
+                print("void async_{name}(io_future_t & future, {params}, __u64 submit_flags);"
                 .format(name=c.name, params=c.params))
 print("\n")
 …
 print("""
-//-----------------------------------------------------------------------------
-bool cancel(io_cancellation & this) {
-        #if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_ASYNC_CANCEL)
-                return false;
-        #else
-                io_future_t future;
-                io_context * context = __get_io_context();
-                __u8 sflags = 0;
-                struct __io_data & ring = *context->thrd.ring;
-                __u32 idx;
-                volatile struct io_uring_sqe * sqe;
-                [sqe, idx] = __submit_alloc( ring, (__u64)(uintptr_t)&future );
-                sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
-                sqe->opcode = IORING_OP_ASYNC_CANCEL;
-                sqe->flags = sflags;
-                sqe->addr = this.target;
-                verify( sqe->user_data == (__u64)(uintptr_t)&future );
-                __submit( context, idx );
-                wait(future);
-                if( future.result == 0 ) return true; // Entry found
-                if( future.result == -EALREADY) return true; // Entry found but in progress
-                if( future.result == -ENOENT ) return false; // Entry not found
-                return false;
-        #endif
+}
 //-----------------------------------------------------------------------------
 // Check if a function is has asynchronous

libcfa/src/concurrency/io/setup.cfa

-              r342af53
+              r8e4aa05
 #if !defined(CFA_HAVE_LINUX_IO_URING_H)
-        void __kernel_io_startup() {
-                // Nothing to do without io_uring
+        }
-        void __kernel_io_shutdown() {
-                // Nothing to do without io_uring
+        }
         void ?{}(io_context_params & this) {}
+        void ?{}(io_context & this, struct cluster & cl) {}
+        void ?{}(io_context & this, struct cluster & cl, const io_context_params & params) {}
+        void ^?{}(io_context & this) {}
+        void ^?{}(io_context & this, bool cluster_context) {}
+        void  ?{}($io_context & this, struct cluster & cl) {}
+        void ^?{}($io_context & this) {}
+        void __cfa_io_start( processor * proc ) {}
+        void __cfa_io_flush( processor * proc ) {}
+        void __cfa_io_stop ( processor * proc ) {}
+        $io_arbiter * create(void) { return 0p; }
+        void destroy($io_arbiter *) {}
 #else
 …
         void ?{}(io_context_params & this) {
                 this.num_entries = 256;
-                this.num_ready = 256;
-                this.submit_aff = -1;
-                this.eager_submits = false;
-                this.poller_submits = false;
-                this.poll_submit = false;
-                this.poll_complete = false;
+        }
 …
 //=============================================================================================
-// I/O Startup / Shutdown logic + Master Poller
-//=============================================================================================
-        // IO Master poller loop forward
-        static void * iopoll_loop( __attribute__((unused)) void * args );
-        static struct {
-                pthread_t     thrd;    // pthread handle to io poller thread
-                void *        stack;   // pthread stack for io poller thread
-                int           epollfd; // file descriptor to the epoll instance
-                volatile bool run;     // Whether or not to continue
-        } iopoll;
-        void __kernel_io_startup(void) {
-                __cfadbg_print_safe(io_core, "Kernel : Creating EPOLL instance\n" );
-                iopoll.epollfd = epoll_create1(0);
-                if (iopoll.epollfd == -1) {
-                        abort( "internal error, epoll_create1\n");
+                }
-                __cfadbg_print_safe(io_core, "Kernel : Starting io poller thread\n" );
-                iopoll.run = true;
-                iopoll.stack = __create_pthread( &iopoll.thrd, iopoll_loop, 0p );
+        }
-        void __kernel_io_shutdown(void) {
-                // Notify the io poller thread of the shutdown
-                iopoll.run = false;
-                sigval val = { 1 };
-                pthread_sigqueue( iopoll.thrd, SIGUSR1, val );
-                // Wait for the io poller thread to finish
-                __destroy_pthread( iopoll.thrd, iopoll.stack, 0p );
-                int ret = close(iopoll.epollfd);
-                if (ret == -1) {
-                        abort( "internal error, close epoll\n");
+                }
-                // Io polling is now fully stopped
-                __cfadbg_print_safe(io_core, "Kernel : IO poller stopped\n" );
+        }
-        static void * iopoll_loop( __attribute__((unused)) void * args ) {
-                __processor_id_t id;
-                id.full_proc = false;
-                id.id = doregister(&id);
-                __cfaabi_tls.this_proc_id = &id;
-                __cfadbg_print_safe(io_core, "Kernel : IO poller thread starting\n" );
-                // Block signals to control when they arrive
-                sigset_t mask;
-                sigfillset(&mask);
-                if ( pthread_sigmask( SIG_BLOCK, &mask, 0p ) == -1 ) {
-                abort( "internal error, pthread_sigmask" );
+                }
-                sigdelset( &mask, SIGUSR1 );
-                // Create sufficient events
-                struct epoll_event events[10];
-                // Main loop
-                while( iopoll.run ) {
-                        __cfadbg_print_safe(io_core, "Kernel I/O - epoll : waiting on io_uring contexts\n");
-                        // Wait for events
-                        int nfds = epoll_pwait( iopoll.epollfd, events, 10, -1, &mask );
-                        __cfadbg_print_safe(io_core, "Kernel I/O - epoll : %d io contexts events, waking up\n", nfds);
-                        // Check if an error occured
-                        if (nfds == -1) {
-                                if( errno == EINTR ) continue;
-                                abort( "internal error, pthread_sigmask" );
+                        }
-                        for(i; nfds) {
-                                $io_ctx_thread * io_ctx = ($io_ctx_thread *)(uintptr_t)events[i].data.u64;
-                                /* paranoid */ verify( io_ctx );
-                                __cfadbg_print_safe(io_core, "Kernel I/O - epoll : Unparking io poller %d (%p)\n", io_ctx->ring->fd, io_ctx);
-                                #if !defined( __CFA_NO_STATISTICS__ )
-                                        __cfaabi_tls.this_stats = io_ctx->self.curr_cluster->stats;
-                                #endif
-                                eventfd_t v;
-                                eventfd_read(io_ctx->ring->efd, &v);
-                                post( io_ctx->sem );
+                        }
+                }
-                __cfadbg_print_safe(io_core, "Kernel : IO poller thread stopping\n" );
-                unregister(&id);
-                return 0p;
+        }
-//=============================================================================================
 // I/O Context Construction/Destruction
 //=============================================================================================
+        void ?{}($io_ctx_thread & this, struct cluster & cl) { (this.self){ "IO Poller", cl }; }
+        void main( $io_ctx_thread & this );
+        static inline $thread * get_thread( $io_ctx_thread & this ) { return &this.self; }
+        void ^?{}( $io_ctx_thread & mutex this ) {}
+        static void __io_create ( __io_data & this, const io_context_params & params_in );
+        static void __io_destroy( __io_data & this );
+        void ?{}(io_context & this, struct cluster & cl, const io_context_params & params) {
+                (this.thrd){ cl };
+                this.thrd.ring = malloc();
+                __cfadbg_print_safe(io_core, "Kernel I/O : Creating ring for io_context %p\n", &this);
+                __io_create( *this.thrd.ring, params );
+                __cfadbg_print_safe(io_core, "Kernel I/O : Starting poller thread for io_context %p\n", &this);
+                this.thrd.done = false;
+                __thrd_start( this.thrd, main );
+                __cfadbg_print_safe(io_core, "Kernel I/O : io_context %p ready\n", &this);
+        }
+        void ?{}(io_context & this, struct cluster & cl) {
+                io_context_params params;
+                (this){ cl, params };
+        }
+        void ^?{}(io_context & this, bool cluster_context) {
+                __cfadbg_print_safe(io_core, "Kernel I/O : tearing down io_context %p\n", &this);
+                // Notify the thread of the shutdown
+                __atomic_store_n(&this.thrd.done, true, __ATOMIC_SEQ_CST);
+                // If this is an io_context within a cluster, things get trickier
+                $thread & thrd = this.thrd.self;
+                if( cluster_context ) {
+                        // We are about to do weird things with the threads
+                        // we don't need interrupts to complicate everything
+                        disable_interrupts();
+                        // Get cluster info
+                        cluster & cltr = *thrd.curr_cluster;
+                        /* paranoid */ verify( cltr.idles.total == 0 || &cltr == mainCluster );
+                        /* paranoid */ verify( !ready_mutate_islocked() );
+                        // We need to adjust the clean-up based on where the thread is
+                        if( thrd.state == Ready || thrd.preempted != __NO_PREEMPTION ) {
+                                // This is the tricky case
+                                // The thread was preempted or ready to run and now it is on the ready queue
+                                // but the cluster is shutting down, so there aren't any processors to run the ready queue
+                                // the solution is to steal the thread from the ready-queue and pretend it was blocked all along
+                                ready_schedule_lock();
+                                        // The thread should be on the list
+                                        /* paranoid */ verify( thrd.link.next != 0p );
+                                        // Remove the thread from the ready queue of this cluster
+                                        // The thread should be the last on the list
+                                        __attribute__((unused)) bool removed = remove_head( &cltr, &thrd );
+                                        /* paranoid */ verify( removed );
+                                        thrd.link.next = 0p;
+                                        thrd.link.prev = 0p;
+                                        // Fixup the thread state
+                                        thrd.state = Blocked;
+                                        thrd.ticket = TICKET_BLOCKED;
+                                        thrd.preempted = __NO_PREEMPTION;
+                                ready_schedule_unlock();
+                                // Pretend like the thread was blocked all along
+                        }
+                        // !!! This is not an else if !!!
+                        // Ok, now the thread is blocked (whether we cheated to get here or not)
+                        if( thrd.state == Blocked ) {
+                                // This is the "easy case"
+                                // The thread is parked and can easily be moved to active cluster
+                                verify( thrd.curr_cluster != active_cluster() || thrd.curr_cluster == mainCluster );
+                                thrd.curr_cluster = active_cluster();
+                                // unpark the fast io_poller
+                                unpark( &thrd );
+                        }
+                        else {
+                                // The thread is in a weird state
+                                // I don't know what to do here
+                                abort("io_context poller thread is in unexpected state, cannot clean-up correctly\n");
+                        }
+                        // The weird thread kidnapping stuff is over, restore interrupts.
+                        enable_interrupts( __cfaabi_dbg_ctx );
+                } else {
+                        post( this.thrd.sem );
+                }
+                ^(this.thrd){};
+                __cfadbg_print_safe(io_core, "Kernel I/O : Stopped poller thread for io_context %p\n", &this);
+                __io_destroy( *this.thrd.ring );
+                __cfadbg_print_safe(io_core, "Kernel I/O : Destroyed ring for io_context %p\n", &this);
+                free(this.thrd.ring);
+        }
+        void ^?{}(io_context & this) {
+                ^(this){ false };
+        }
+        static void __io_create( __io_data & this, const io_context_params & params_in ) {
+        static void __io_uring_setup ( $io_context & this, const io_context_params & params_in, int procfd );
+        static void __io_uring_teardown( $io_context & this );
+        static void __epoll_register($io_context & ctx);
+        static void __epoll_unregister($io_context & ctx);
+        void __ioarbiter_register( $io_arbiter & mutex, $io_context & ctx );
+        void __ioarbiter_unregister( $io_arbiter & mutex, $io_context & ctx );
+        void ?{}($io_context & this, processor * proc, struct cluster & cl) {
+                /* paranoid */ verify( cl.io.arbiter );
+                this.proc = proc;
+                this.arbiter = cl.io.arbiter;
+                this.ext_sq.empty = true;
+                (this.ext_sq.blocked){};
+                __io_uring_setup( this, cl.io.params, proc->idle );
+                __cfadbg_print_safe(io_core, "Kernel I/O : Created ring for io_context %u (%p)\n", this.fd, &this);
+        }
+        void ^?{}($io_context & this) {
+                __cfadbg_print_safe(io_core, "Kernel I/O : tearing down io_context %u\n", this.fd);
+                __io_uring_teardown( this );
+                __cfadbg_print_safe(io_core, "Kernel I/O : Destroyed ring for io_context %u\n", this.fd);
+        }
+        extern void __disable_interrupts_hard();
+        extern void __enable_interrupts_hard();
+        static void __io_uring_setup( $io_context & this, const io_context_params & params_in, int procfd ) {
                 // Step 1 : call to setup
                 struct io_uring_params params;
                 memset(&params, 0, sizeof(params));
                 if( params_in.poll_submit   ) params.flags |= IORING_SETUP_SQPOLL;
                 if( params_in.poll_complete ) params.flags |= IORING_SETUP_IOPOLL;
+                // if( params_in.poll_submit   ) params.flags |= IORING_SETUP_SQPOLL;
+                // if( params_in.poll_complete ) params.flags |= IORING_SETUP_IOPOLL;
                 __u32 nentries = params_in.num_entries != 0 ? params_in.num_entries : 256;
 …
                         abort("ERROR: I/O setup 'num_entries' must be a power of 2\n");
+                }
-                if( params_in.poller_submits && params_in.eager_submits ) {
-                        abort("ERROR: I/O setup 'poller_submits' and 'eager_submits' cannot be used together\n");
+                }
                 int fd = syscall(__NR_io_uring_setup, nentries, &params );
 …
                 // Step 2 : mmap result
+                memset( &this, 0, sizeof(struct __io_data) );
+                struct __submition_data  & sq = this.submit_q;
+                struct __completion_data & cq = this.completion_q;
+                struct __sub_ring_t & sq = this.sq;
+                struct __cmp_ring_t & cq = this.cq;
                 // calculate the right ring size
 …
                 // Get the pointers from the kernel to fill the structure
                 // submit queue
+                sq.head    = (volatile __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.head);
+                sq.tail    = (volatile __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.tail);
+                sq.mask    = (   const __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_mask);
+                sq.num     = (   const __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_entries);
+                sq.flags   = (         __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.flags);
+                sq.dropped = (         __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.dropped);
+                sq.array   = (         __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);
+                sq.prev_head = *sq.head;
+                {
+                        const __u32 num = *sq.num;
+                        for( i; num ) {
+                                __sqe_clean( &sq.sqes[i] );
+                        }
+                }
+                (sq.submit_lock){};
+                (sq.release_lock){};
+                if( params_in.poller_submits || params_in.eager_submits ) {
+                        /* paranoid */ verify( is_pow2( params_in.num_ready ) || (params_in.num_ready < 8) );
+                        sq.ready_cnt = max( params_in.num_ready, 8 );
+                        sq.ready = alloc( sq.ready_cnt, 64`align );
+                        for(i; sq.ready_cnt) {
+                                sq.ready[i] = -1ul32;
+                        }
+                        sq.prev_ready = 0;
+                }
+                else {
+                        sq.ready_cnt = 0;
+                        sq.ready = 0p;
+                        sq.prev_ready = 0;
+                }
+                sq.kring.head  = (volatile __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.head);
+                sq.kring.tail  = (volatile __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.tail);
+                sq.kring.array = (         __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.array);
+                sq.mask        = (   const __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_mask);
+                sq.num         = (   const __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.ring_entries);
+                sq.flags       = (         __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.flags);
+                sq.dropped     = (         __u32 *)(((intptr_t)sq.ring_ptr) + params.sq_off.dropped);
+                sq.kring.released = 0;
+                sq.free_ring.head = 0;
+                sq.free_ring.tail = *sq.num;
+                sq.free_ring.array = alloc( *sq.num, 128`align );
+                for(i; (__u32)*sq.num) {
+                        sq.free_ring.array[i] = i;
+                }
+                sq.to_submit = 0;
                 // completion queue
 …
                 // Step 4 : eventfd
+                int efd;
+                for() {
+                        efd = eventfd(0, 0);
+                        if (efd < 0) {
+                                if (errno == EINTR) continue;
+                                abort("KERNEL ERROR: IO_URING EVENTFD - %s\n", strerror(errno));
+                        }
+                        break;
+                }
+                int ret;
+                for() {
+                        ret = syscall( __NR_io_uring_register, fd, IORING_REGISTER_EVENTFD, &efd, 1);
+                        if (ret < 0) {
+                                if (errno == EINTR) continue;
+                                abort("KERNEL ERROR: IO_URING EVENTFD REGISTER - %s\n", strerror(errno));
+                        }
+                        break;
+                }
+                // io_uring_register is so f*cking slow on some machines that it
+                // will never succeed if preemption isn't hard blocked
+                __cfadbg_print_safe(io_core, "Kernel I/O : registering %d for completion with ring %d\n", procfd, fd);
+                __disable_interrupts_hard();
+                int ret = syscall( __NR_io_uring_register, fd, IORING_REGISTER_EVENTFD, &procfd, 1);
+                if (ret < 0) {
+                        abort("KERNEL ERROR: IO_URING EVENTFD REGISTER - %s\n", strerror(errno));
+                }
+                __enable_interrupts_hard();
+                __cfadbg_print_safe(io_core, "Kernel I/O : registered %d for completion with ring %d\n", procfd, fd);
                 // some paranoid checks
 …
                 /* paranoid */ verifyf( (*sq.mask) == ((*sq.num) - 1ul32), "IO_URING Expected mask to be %u (%u entries), was %u", (*sq.num) - 1ul32, *sq.num, *sq.mask );
                 /* paranoid */ verifyf( (*sq.num) >= nentries, "IO_URING Expected %u entries, got %u", nentries, *sq.num );
                 /* paranoid */ verifyf( (*sq.head) == 0, "IO_URING Expected head to be 0, got %u", *sq.head );
                 /* paranoid */ verifyf( (*sq.tail) == 0, "IO_URING Expected tail to be 0, got %u", *sq.tail );
+                /* paranoid */ verifyf( (*sq.kring.head) == 0, "IO_URING Expected head to be 0, got %u", *sq.kring.head );
+                /* paranoid */ verifyf( (*sq.kring.tail) == 0, "IO_URING Expected tail to be 0, got %u", *sq.kring.tail );
                 // Update the global ring info
                 this.ring_flags = params.flags;
+                this.ring_flags = 0;
                 this.fd         = fd;
+                this.efd        = efd;
+                this.eager_submits  = params_in.eager_submits;
+                this.poller_submits = params_in.poller_submits;
+        }
+        static void __io_destroy( __io_data & this ) {
+        }
+        static void __io_uring_teardown( $io_context & this ) {
                 // Shutdown the io rings
                 struct __submition_data  & sq = this.submit_q;
                 struct __completion_data & cq = this.completion_q;
+                struct __sub_ring_t & sq = this.sq;
+                struct __cmp_ring_t & cq = this.cq;
                 // unmap the submit queue entries
 …
                 // close the file descriptor
                 close(this.fd);
+                close(this.efd);
+                free( this.submit_q.ready ); // Maybe null, doesn't matter
+                free( this.sq.free_ring.array ); // Maybe null, doesn't matter
+        }
+        void __cfa_io_start( processor * proc ) {
+                proc->io.ctx = alloc();
+                (*proc->io.ctx){proc, *proc->cltr};
+        }
+        void __cfa_io_stop ( processor * proc ) {
+                ^(*proc->io.ctx){};
+                free(proc->io.ctx);
+        }
 …
 // I/O Context Sleep
 //=============================================================================================
+        #define IOEVENTS EPOLLIN | EPOLLONESHOT
+        static inline void __ioctx_epoll_ctl($io_ctx_thread & ctx, int op, const char * error) {
+                struct epoll_event ev;
+                ev.events = IOEVENTS;
+                ev.data.u64 = (__u64)&ctx;
+                int ret = epoll_ctl(iopoll.epollfd, op, ctx.ring->efd, &ev);
+                if (ret < 0) {
+                        abort( "KERNEL ERROR: EPOLL %s - (%d) %s\n", error, (int)errno, strerror(errno) );
+                }
+        }
+        void __ioctx_register($io_ctx_thread & ctx) {
+                __ioctx_epoll_ctl(ctx, EPOLL_CTL_ADD, "ADD");
+        }
+        void __ioctx_prepare_block($io_ctx_thread & ctx) {
+                __cfadbg_print_safe(io_core, "Kernel I/O - epoll : Re-arming io poller %d (%p)\n", ctx.ring->fd, &ctx);
+                __ioctx_epoll_ctl(ctx, EPOLL_CTL_MOD, "REARM");
+        }
+        // static inline void __epoll_ctl($io_context & ctx, int op, const char * error) {
+        //      struct epoll_event ev;
+        //      ev.events = EPOLLIN | EPOLLONESHOT;
+        //      ev.data.u64 = (__u64)&ctx;
+        //      int ret = epoll_ctl(iopoll.epollfd, op, ctx.efd, &ev);
+        //      if (ret < 0) {
+        //              abort( "KERNEL ERROR: EPOLL %s - (%d) %s\n", error, (int)errno, strerror(errno) );
+        //      }
+        // }
+        // static void __epoll_register($io_context & ctx) {
+        //      __epoll_ctl(ctx, EPOLL_CTL_ADD, "ADD");
+        // }
+        // static void __epoll_unregister($io_context & ctx) {
+        //      // Read the current epoch so we know when to stop
+        //      size_t curr = __atomic_load_n(&iopoll.epoch, __ATOMIC_SEQ_CST);
+        //      // Remove the fd from the iopoller
+        //      __epoll_ctl(ctx, EPOLL_CTL_DEL, "REMOVE");
+        //      // Notify the io poller thread of the shutdown
+        //      iopoll.run = false;
+        //      sigval val = { 1 };
+        //      pthread_sigqueue( iopoll.thrd, SIGUSR1, val );
+        //      // Make sure all this is done
+        //      __atomic_thread_fence(__ATOMIC_SEQ_CST);
+        //      // Wait for the next epoch
+        //      while(curr == iopoll.epoch && !iopoll.stopped) Pause();
+        // }
+        // void __ioctx_prepare_block($io_context & ctx) {
+        //      __cfadbg_print_safe(io_core, "Kernel I/O - epoll : Re-arming io poller %d (%p)\n", ctx.fd, &ctx);
+        //      __epoll_ctl(ctx, EPOLL_CTL_MOD, "REARM");
+        // }
 //=============================================================================================
 // I/O Context Misc Setup
 //=============================================================================================
+        void register_fixed_files( io_context & ctx, int * files, unsigned count ) {
+                int ret = syscall( __NR_io_uring_register, ctx.thrd.ring->fd, IORING_REGISTER_FILES, files, count );
+                if( ret < 0 ) {
+                        abort( "KERNEL ERROR: IO_URING SYSCALL - (%d) %s\n", (int)errno, strerror(errno) );
+                }
+                __cfadbg_print_safe( io_core, "Kernel I/O : Performed io_register for %p, returned %d\n", active_thread(), ret );
+        }
+        void register_fixed_files( cluster & cltr, int * files, unsigned count ) {
+                for(i; cltr.io.cnt) {
+                        register_fixed_files( cltr.io.ctxs[i], files, count );
+                }
+        }
+        void ?{}( $io_arbiter & this ) {
+                this.pending.flag = false;
+        }
+        void ^?{}( $io_arbiter & mutex this ) {
+                // /* paranoid */ verify( empty(this.assigned) );
+                // /* paranoid */ verify( empty(this.available) );
+                /* paranoid */ verify( is_empty(this.pending.blocked) );
+        }
+        $io_arbiter * create(void) {
+                return new();
+        }
+        void destroy($io_arbiter * arbiter) {
+                delete(arbiter);
+        }
+//=============================================================================================
+// I/O Context Misc Setup
+//=============================================================================================
 #endif

libcfa/src/concurrency/io/types.hfa

-              r342af53
+              r8e4aa05
 // file "LICENCE" distributed with Cforall.
 //
+// io/types.hfa --
+// io/types.hfa -- PRIVATE
+// Types used by the I/O subsystem
 //
 // Author           : Thierry Delisle
 …
 #include "bits/locks.hfa"
+#include "kernel/fwd.hfa"
 #if defined(CFA_HAVE_LINUX_IO_URING_H)
+        #define LEADER_LOCK
+        struct __leaderlock_t {
+                struct $thread * volatile value;        // ($thread) next_leader | (bool:1) is_locked
+        };
+        #include "bits/sequence.hfa"
+        #include "monitor.hfa"
+        static inline void ?{}( __leaderlock_t & this ) { this.value = 0p; }
+        struct processor;
+        monitor $io_arbiter;
         //-----------------------------------------------------------------------
         // Ring Data structure
+      struct __submition_data {
+                // Head and tail of the ring (associated with array)
+                volatile __u32 * head;
+                volatile __u32 * tail;
+                volatile __u32 prev_head;
+      struct __sub_ring_t {
+                struct {
+                        // Head and tail of the ring (associated with array)
+                        volatile __u32 * head;   // one passed last index consumed by the kernel
+                        volatile __u32 * tail;   // one passed last index visible to the kernel
+                        volatile __u32 released; // one passed last index released back to the free list
+                // The actual kernel ring which uses head/tail
+                // indexes into the sqes arrays
+                __u32 * array;
+                        // The actual kernel ring which uses head/tail
+                        // indexes into the sqes arrays
+                        __u32 * array;
+                } kring;
+                struct {
+                        volatile __u32 head;
+                        volatile __u32 tail;
+                        // The ring which contains free allocations
+                        // indexes into the sqes arrays
+                        __u32 * array;
+                } free_ring;
+                // number of sqes to submit on next system call.
+                __u32 to_submit;
                 // number of entries and mask to go with it
 …
                 const __u32 * mask;
                 // Submission flags (Not sure what for)
+                // Submission flags, currently only IORING_SETUP_SQPOLL
                 __u32 * flags;
+                // number of sqes not submitted (whatever that means)
+                // number of sqes not submitted
+                // From documentation : [dropped] is incremented for each invalid submission queue entry encountered in the ring buffer.
                 __u32 * dropped;
-                // Like head/tail but not seen by the kernel
-                volatile __u32 * ready;
-                __u32 ready_cnt;
-                __u32 prev_ready;
-                #if defined(LEADER_LOCK)
-                        __leaderlock_t submit_lock;
-                #else
-                        __spinlock_t submit_lock;
-                #endif
-                __spinlock_t  release_lock;
                 // A buffer of sqes (not the actual ring)
                 volatile struct io_uring_sqe * sqes;
+                struct io_uring_sqe * sqes;
                 // The location and size of the mmaped area
 …
         };
         struct __completion_data {
+        struct __cmp_ring_t {
                 // Head and tail of the ring
                 volatile __u32 * head;
 …
                 const __u32 * num;
                 // number of cqes not submitted (whatever that means)
+                // I don't know what this value is for
                 __u32 * overflow;
 …
         };
+        struct __io_data {
+                struct __submition_data submit_q;
+                struct __completion_data completion_q;
+        struct __attribute__((aligned(128))) $io_context {
+                $io_arbiter * arbiter;
+                processor * proc;
+                struct {
+                        volatile bool empty;
+                        condition blocked;
+                } ext_sq;
+                struct __sub_ring_t sq;
+                struct __cmp_ring_t cq;
                 __u32 ring_flags;
                 int fd;
+                int efd;
+                bool eager_submits:1;
+                bool poller_submits:1;
+        };
+        monitor __attribute__((aligned(128))) $io_arbiter {
+                struct {
+                        condition blocked;
+                        $io_context * ctx;
+                        volatile bool flag;
+                } pending;
         };
 …
         #endif
+        struct $io_ctx_thread;
+        void __ioctx_register($io_ctx_thread & ctx);
+        void __ioctx_prepare_block($io_ctx_thread & ctx);
+        void __sqe_clean( volatile struct io_uring_sqe * sqe );
+        // void __ioctx_prepare_block($io_context & ctx);
 #endif

libcfa/src/concurrency/iofwd.hfa

-              r342af53
+              r8e4aa05
 #include <unistd.h>
 extern "C" {
         #include <sys/types.h>
+        #include <asm/types.h>
         #if CFA_HAVE_LINUX_IO_URING_H
                 #include <linux/io_uring.h>
 …
 struct cluster;
 struct io_future_t;
+struct io_context;
+struct io_cancellation;
+struct $io_context;
 struct iovec;
 …
 struct sockaddr;
 struct statx;
+struct epoll_event;
+//----------
+// underlying calls
+extern struct $io_context * cfa_io_allocate(struct io_uring_sqe * out_sqes[], __u32 out_idxs[], __u32 want)  __attribute__((nonnull (1,2)));
+extern void cfa_io_submit( struct $io_context * in_ctx, __u32 in_idxs[], __u32 have, bool lazy ) __attribute__((nonnull (1,2)));
 //----------
 // synchronous calls
 #if defined(CFA_HAVE_PREADV2)
         extern ssize_t cfa_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+        extern ssize_t cfa_preadv2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);
 #endif
 #if defined(CFA_HAVE_PWRITEV2)
         extern ssize_t cfa_pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+        extern ssize_t cfa_pwritev2(int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);
 #endif
 extern int cfa_fsync(int fd, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
 extern int cfa_epoll_ctl(int epfd, int op, int fd, struct epoll_event *event, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
 extern int cfa_sync_file_range(int fd, off64_t offset, off64_t nbytes, unsigned int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
 extern  ssize_t cfa_sendmsg(int sockfd, const struct msghdr *msg, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
 extern ssize_t cfa_recvmsg(int sockfd, struct msghdr *msg, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
 extern ssize_t cfa_send(int sockfd, const void *buf, size_t len, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
 extern ssize_t cfa_recv(int sockfd, void *buf, size_t len, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
 extern int cfa_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
 extern int cfa_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
 extern int cfa_fallocate(int fd, int mode, off_t offset, off_t len, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
 extern int cfa_posix_fadvise(int fd, off_t offset, off_t len, int advice, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
 extern int cfa_madvise(void *addr, size_t length, int advice, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
 extern int cfa_openat(int dirfd, const char *pathname, int flags, mode_t mode, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+extern int cfa_fsync(int fd, __u64 submit_flags);
+extern int cfa_epoll_ctl(int epfd, int op, int fd, struct epoll_event *event, __u64 submit_flags);
+extern int cfa_sync_file_range(int fd, off64_t offset, off64_t nbytes, unsigned int flags, __u64 submit_flags);
+extern  ssize_t cfa_sendmsg(int sockfd, const struct msghdr *msg, int flags, __u64 submit_flags);
+extern ssize_t cfa_recvmsg(int sockfd, struct msghdr *msg, int flags, __u64 submit_flags);
+extern ssize_t cfa_send(int sockfd, const void *buf, size_t len, int flags, __u64 submit_flags);
+extern ssize_t cfa_recv(int sockfd, void *buf, size_t len, int flags, __u64 submit_flags);
+extern int cfa_accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags, __u64 submit_flags);
+extern int cfa_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen, __u64 submit_flags);
+extern int cfa_fallocate(int fd, int mode, off_t offset, off_t len, __u64 submit_flags);
+extern int cfa_posix_fadvise(int fd, off_t offset, off_t len, int advice, __u64 submit_flags);
+extern int cfa_madvise(void *addr, size_t length, int advice, __u64 submit_flags);
+extern int cfa_openat(int dirfd, const char *pathname, int flags, mode_t mode, __u64 submit_flags);
 #if defined(CFA_HAVE_OPENAT2)
         extern int cfa_openat2(int dirfd, const char *pathname, struct open_how * how, size_t size, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+        extern int cfa_openat2(int dirfd, const char *pathname, struct open_how * how, size_t size, __u64 submit_flags);
 #endif
 extern int cfa_close(int fd, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+extern int cfa_close(int fd, __u64 submit_flags);
 #if defined(CFA_HAVE_STATX)
         extern int cfa_statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+        extern int cfa_statx(int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf, __u64 submit_flags);
 #endif
 extern ssize_t cfa_read(int fd, void * buf, size_t count, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
 extern ssize_t cfa_write(int fd, void * buf, size_t count, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
 extern ssize_t cfa_splice(int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
 extern ssize_t cfa_tee(int fd_in, int fd_out, size_t len, unsigned int flags, int submit_flags, Duration timeout, io_cancellation * cancellation, io_context * context);
+extern ssize_t cfa_read(int fd, void * buf, size_t count, __u64 submit_flags);
+extern ssize_t cfa_write(int fd, void * buf, size_t count, __u64 submit_flags);
+extern ssize_t cfa_splice(int fd_in, __off64_t *off_in, int fd_out, __off64_t *off_out, size_t len, unsigned int flags, __u64 submit_flags);
+extern ssize_t cfa_tee(int fd_in, int fd_out, size_t len, unsigned int flags, __u64 submit_flags);
 //----------
 // asynchronous calls
 #if defined(CFA_HAVE_PREADV2)
         extern void async_preadv2(io_future_t & future, int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags, io_cancellation * cancellation, io_context * context);
+        extern void async_preadv2(io_future_t & future, int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);
 #endif
 #if defined(CFA_HAVE_PWRITEV2)
         extern void async_pwritev2(io_future_t & future, int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, int submit_flags, io_cancellation * cancellation, io_context * context);
+        extern void async_pwritev2(io_future_t & future, int fd, const struct iovec *iov, int iovcnt, off_t offset, int flags, __u64 submit_flags);
 #endif
 extern void async_fsync(io_future_t & future, int fd, int submit_flags, io_cancellation * cancellation, io_context * context);
 extern void async_epoll_ctl(io_future_t & future, int epfd, int op, int fd, struct epoll_event *event, int submit_flags, io_cancellation * cancellation, io_context * context);
 extern void async_sync_file_range(io_future_t & future, int fd, off64_t offset, off64_t nbytes, unsigned int flags, int submit_flags, io_cancellation * cancellation, io_context * context);
 extern void async_sendmsg(io_future_t & future, int sockfd, const struct msghdr *msg, int flags, int submit_flags, io_cancellation * cancellation, io_context * context);
 extern void async_recvmsg(io_future_t & future, int sockfd, struct msghdr *msg, int flags, int submit_flags, io_cancellation * cancellation, io_context * context);
 extern void async_send(io_future_t & future, int sockfd, const void *buf, size_t len, int flags, int submit_flags, io_cancellation * cancellation, io_context * context);
 extern void async_recv(io_future_t & future, int sockfd, void *buf, size_t len, int flags, int submit_flags, io_cancellation * cancellation, io_context * context);
 extern void async_accept4(io_future_t & future, int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags, int submit_flags, io_cancellation * cancellation, io_context * context);
 extern void async_connect(io_future_t & future, int sockfd, const struct sockaddr *addr, socklen_t addrlen, int submit_flags, io_cancellation * cancellation, io_context * context);
 extern void async_fallocate(io_future_t & future, int fd, int mode, off_t offset, off_t len, int submit_flags, io_cancellation * cancellation, io_context * context);
 extern void async_posix_fadvise(io_future_t & future, int fd, off_t offset, off_t len, int advice, int submit_flags, io_cancellation * cancellation, io_context * context);
 extern void async_madvise(io_future_t & future, void *addr, size_t length, int advice, int submit_flags, io_cancellation * cancellation, io_context * context);
 extern void async_openat(io_future_t & future, int dirfd, const char *pathname, int flags, mode_t mode, int submit_flags, io_cancellation * cancellation, io_context * context);
+extern void async_fsync(io_future_t & future, int fd, __u64 submit_flags);
+extern void async_epoll_ctl(io_future_t & future, int epfd, int op, int fd, struct epoll_event *event, __u64 submit_flags);
+extern void async_sync_file_range(io_future_t & future, int fd, off64_t offset, off64_t nbytes, unsigned int flags, __u64 submit_flags);
+extern void async_sendmsg(io_future_t & future, int sockfd, const struct msghdr *msg, int flags, __u64 submit_flags);
+extern void async_recvmsg(io_future_t & future, int sockfd, struct msghdr *msg, int flags, __u64 submit_flags);
+extern void async_send(io_future_t & future, int sockfd, const void *buf, size_t len, int flags, __u64 submit_flags);
+extern void async_recv(io_future_t & future, int sockfd, void *buf, size_t len, int flags, __u64 submit_flags);
+extern void async_accept4(io_future_t & future, int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags, __u64 submit_flags);
+extern void async_connect(io_future_t & future, int sockfd, const struct sockaddr *addr, socklen_t addrlen, __u64 submit_flags);
+extern void async_fallocate(io_future_t & future, int fd, int mode, off_t offset, off_t len, __u64 submit_flags);
+extern void async_posix_fadvise(io_future_t & future, int fd, off_t offset, off_t len, int advice, __u64 submit_flags);
+extern void async_madvise(io_future_t & future, void *addr, size_t length, int advice, __u64 submit_flags);
+extern void async_openat(io_future_t & future, int dirfd, const char *pathname, int flags, mode_t mode, __u64 submit_flags);
 #if defined(CFA_HAVE_OPENAT2)
         extern void async_openat2(io_future_t & future, int dirfd, const char *pathname, struct open_how * how, size_t size, int submit_flags, io_cancellation * cancellation, io_context * context);
+        extern void async_openat2(io_future_t & future, int dirfd, const char *pathname, struct open_how * how, size_t size, __u64 submit_flags);
 #endif
 extern void async_close(io_future_t & future, int fd, int submit_flags, io_cancellation * cancellation, io_context * context);
+extern void async_close(io_future_t & future, int fd, __u64 submit_flags);
 #if defined(CFA_HAVE_STATX)
         extern void async_statx(io_future_t & future, int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf, int submit_flags, io_cancellation * cancellation, io_context * context);
+        extern void async_statx(io_future_t & future, int dirfd, const char *pathname, int flags, unsigned int mask, struct statx *statxbuf, __u64 submit_flags);
 #endif
 void async_read(io_future_t & future, int fd, void * buf, size_t count, int submit_flags, io_cancellation * cancellation, io_context * context);
 extern void async_write(io_future_t & future, int fd, void * buf, size_t count, int submit_flags, io_cancellation * cancellation, io_context * context);
 extern void async_splice(io_future_t & future, int fd_in, loff_t *off_in, int fd_out, loff_t *off_out, size_t len, unsigned int flags, int submit_flags, io_cancellation * cancellation, io_context * context);
 extern void async_tee(io_future_t & future, int fd_in, int fd_out, size_t len, unsigned int flags, int submit_flags, io_cancellation * cancellation, io_context * context);
+void async_read(io_future_t & future, int fd, void * buf, size_t count, __u64 submit_flags);
+extern void async_write(io_future_t & future, int fd, void * buf, size_t count, __u64 submit_flags);
+extern void async_splice(io_future_t & future, int fd_in, __off64_t *off_in, int fd_out, __off64_t *off_out, size_t len, unsigned int flags, __u64 submit_flags);
+extern void async_tee(io_future_t & future, int fd_in, int fd_out, size_t len, unsigned int flags, __u64 submit_flags);
 …
 // Check if a function blocks only the user thread
 bool has_user_level_blocking( fptr_t func );
-//-----------------------------------------------------------------------------
-void register_fixed_files( io_context & ctx , int * files, unsigned count );
-void register_fixed_files( cluster    & cltr, int * files, unsigned count );

libcfa/src/concurrency/kernel.cfa

-              r342af53
+              r8e4aa05
 #include <signal.h>
 #include <unistd.h>
+extern "C" {
+        #include <sys/eventfd.h>
+}
 //CFA Includes
 …
 static [unsigned idle, unsigned total, * processor] query( & __cluster_idles idles );
+extern void __cfa_io_start( processor * );
+extern void __cfa_io_drain( processor * );
+extern void __cfa_io_flush( processor * );
+extern void __cfa_io_stop ( processor * );
+static inline void __maybe_io_drain( processor * );
+extern void __disable_interrupts_hard();
+extern void __enable_interrupts_hard();
 //=============================================================================================
 …
         verify(this);
+        __cfa_io_start( this );
         __cfadbg_print_safe(runtime_core, "Kernel : core %p starting\n", this);
         #if !defined(__CFA_NO_STATISTICS__)
 …
                 preemption_scope scope = { this };
+                #if !defined(__CFA_NO_STATISTICS__)
+                        unsigned long long last_tally = rdtscl();
+                #endif
                 __cfadbg_print_safe(runtime_core, "Kernel : core %p started\n", this);
 …
                 MAIN_LOOP:
                 for() {
+                        // Check if there is pending io
+                        __maybe_io_drain( this );
                         // Try to get the next thread
                         readyThread = __next_thread( this->cltr );
                         if( !readyThread ) {
+                                __cfa_io_flush( this );
                                 readyThread = __next_thread_slow( this->cltr );
+                        }
 …
                                 #endif
+                                wait( this->idle );
+                                __cfadbg_print_safe(runtime_core, "Kernel : core %p waiting on eventfd %d\n", this, this->idle);
+                                __disable_interrupts_hard();
+                                eventfd_t val;
+                                eventfd_read( this->idle, &val );
+                                __enable_interrupts_hard();
                                 #if !defined(__CFA_NO_STATISTICS__)
 …
                         /* paranoid */ verify( readyThread );
+                        // Reset io dirty bit
+                        this->io.dirty = false;
                         // We found a thread run it
                         __run_thread(this, readyThread);
 …
                         // Are we done?
                         if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) break MAIN_LOOP;
+                        #if !defined(__CFA_NO_STATISTICS__)
+                                unsigned long long curr = rdtscl();
+                                if(curr > (last_tally + 500000000)) {
+                                        __tally_stats(this->cltr->stats, __cfaabi_tls.this_stats);
+                                        last_tally = curr;
+                                }
+                        #endif
+                        if(this->io.pending && !this->io.dirty) {
+                                __cfa_io_flush( this );
+                        }
+                }
 …
+        }
+        V( this->terminated );
+        __cfa_io_stop( this );
+        post( this->terminated );
         if(this == mainProcessor) {
 …
         /* paranoid */ verifyf( thrd_dst->link.next == 0p, "Expected null got %p", thrd_dst->link.next );
         __builtin_prefetch( thrd_dst->context.SP );
+        __cfadbg_print_safe(runtime_core, "Kernel : core %p running thread %p (%s)\n", this, thrd_dst, thrd_dst->self_cor.name);
         $coroutine * proc_cor = get_coroutine(this->runner);
 …
         // Just before returning to the processor, set the processor coroutine to active
         proc_cor->state = Active;
+        __cfadbg_print_safe(runtime_core, "Kernel : core %p finished running thread %p\n", this, thrd_dst);
         /* paranoid */ verify( ! __preemption_enabled() );
 …
         // We found a processor, wake it up
+        post( p->idle );
+        eventfd_t val;
+        val = 1;
+        eventfd_write( p->idle, val );
         #if !defined(__CFA_NO_STATISTICS__)
 …
         disable_interrupts();
                 /* paranoid */ verify( ! __preemption_enabled() );
+                post( this->idle );
+                eventfd_t val;
+                val = 1;
+                eventfd_write( this->idle, val );
         enable_interrupts( __cfaabi_dbg_ctx );
+}
 …
 // Unexpected Terminating logic
 //=============================================================================================
+static __spinlock_t kernel_abort_lock;
+static bool kernel_abort_called = false;
+void * kernel_abort(void) __attribute__ ((__nothrow__)) { // Abort entry point: the first caller gets back its current thread; any later caller is suspended and then exits
+        // abort cannot be recursively entered by the same or different processors because all signal handlers return when
+        // the globalAbort flag is true.
+        lock( kernel_abort_lock __cfaabi_dbg_ctx2 );
+        // disable interrupts, it no longer makes sense to try to interrupt this processor
+        disable_interrupts();
+        // is this the first task to abort ?
+        if ( kernel_abort_called ) {                    // not first task to abort ?
+                unlock( kernel_abort_lock );
+                sigset_t mask;
+                sigemptyset( &mask );
+                sigaddset( &mask, SIGALRM );            // block SIGALRM signals
+                sigaddset( &mask, SIGUSR1 );            // block SIGUSR1 signals
+                sigsuspend( &mask );                            // block the processor to prevent further damage during abort
+                _exit( EXIT_FAILURE );                          // if processor unblocks before it is killed, terminate it
+        }
+        else {
+                kernel_abort_called = true;             // first task to abort: record it so later callers take the branch above
+                unlock( kernel_abort_lock );
+        }
+        return __cfaabi_tls.this_thread; // the thread-local current thread, returned as an opaque pointer
+}
+void kernel_abort_msg( void * kernel_data, char * abort_text, int abort_text_size ) {
+        $thread * thrd = ( $thread * ) kernel_data;
+void __kernel_abort_msg( char * abort_text, int abort_text_size ) {
+        $thread * thrd = __cfaabi_tls.this_thread;
         if(thrd) {
 …
+}
 int kernel_abort_lastframe( void ) __attribute__ ((__nothrow__)) {
         return get_coroutine(kernelTLS().this_thread) == get_coroutine(mainThread) ? 4 : 2;
+int __kernel_abort_lastframe( void ) __attribute__ ((__nothrow__)) {
+        return get_coroutine(__cfaabi_tls.this_thread) == get_coroutine(mainThread) ? 4 : 2;
+}
 …
 // Kernel Utilities
 //=============================================================================================
+//-----------------------------------------------------------------------------
+// Locks
+void  ?{}( semaphore & this, int count = 1 ) { // Constructor: default-constructs the lock and the waiting queue, stores the initial count (1 unless given)
+        (this.lock){};
+        this.count = count;
+        (this.waiting){};
+}
+void ^?{}(semaphore & this) {} // Destructor: intentionally empty
+bool P(semaphore & this) with( this ){ // Acquire: decrements count and parks the caller if it went negative; returns true iff the caller blocked
+        lock( lock __cfaabi_dbg_ctx2 );
+        count -= 1;
+        if ( count < 0 ) { // negative count: the semaphore was not available
+                // queue current task
+                append( waiting, active_thread() );
+                // release the spin lock, then block
+                unlock( lock );
+                park();
+                return true;
+        }
+        else {
+            unlock( lock );
+            return false; // acquired without blocking
+        }
+}
+bool V(semaphore & this) with( this ) { // Release: increments count and wakes at most one waiter; returns true iff a waiting thread was dequeued
+        $thread * thrd = 0p;
+        lock( lock __cfaabi_dbg_ctx2 );
+        count += 1;
+        if ( count <= 0 ) { // count still non-positive after the increment: take a waiter off the queue
+                // remove task at head of waiting list
+                thrd = pop_head( waiting );
+        }
+        unlock( lock );
+        // make new owner
+        unpark( thrd ); // NOTE(review): called even when thrd is 0p - presumably unpark tolerates a null thread; confirm
+        return thrd != 0p;
+}
+bool V(semaphore & this, unsigned diff) with( this ) {
+        $thread * thrd = 0p;
+        lock( lock __cfaabi_dbg_ctx2 );
+        int release = max(-count, (int)diff);
+        count += diff;
+        for(release) {
+                unpark( pop_head( waiting ) );
+        }
+        unlock( lock );
+        return thrd != 0p;
+#if defined(CFA_HAVE_LINUX_IO_URING_H)
+#include "io/types.hfa"
+#endif
+static inline void __maybe_io_drain( processor * proc ) { // Calls __cfa_io_drain( proc ) only when the completion ring head and tail differ; compiles to an empty body without CFA_HAVE_LINUX_IO_URING_H
+        #if defined(CFA_HAVE_LINUX_IO_URING_H)
+                __cfadbg_print_safe(runtime_core, "Kernel : core %p checking io for ring %d\n", proc, proc->io.ctx->fd);
+                // Check if we should drain the queue
+                $io_context * ctx = proc->io.ctx;
+                unsigned head = *ctx->cq.head; // snapshot of the completion ring head index
+                unsigned tail = *ctx->cq.tail; // snapshot of the completion ring tail index
+                if(head != tail) __cfa_io_drain( proc ); // indices differ: presumably completions are pending
+        #endif
+}

libcfa/src/concurrency/kernel.hfa

-              r342af53
+              r8e4aa05
 // file "LICENCE" distributed with Cforall.
 //
 // kernel --
+// kernel -- Header containing the core of the kernel API
 //
 // Author           : Thierry Delisle
 …
 extern "C" {
         #include <bits/pthreadtypes.h>
+        #include <pthread.h>
         #include <linux/types.h>
+}
+//-----------------------------------------------------------------------------
+// Locks
+struct semaphore { // Counting semaphore for user threads, operated on by P() and V()
+        __spinlock_t lock; // spin lock taken by P/V around updates to count and waiting
+        int count; // decremented by P, incremented by V; P blocks the caller when it goes negative
+        __queue_t($thread) waiting; // threads parked in P(), dequeued from the head by V()
+};
+void  ?{}(semaphore & this, int count = 1);
+void ^?{}(semaphore & this);
+bool   P (semaphore & this);
+bool   V (semaphore & this);
+bool   V (semaphore & this, unsigned count);
+#ifdef __CFA_WITH_VERIFY__
+        extern bool __cfaabi_dbg_in_kernel();
+#endif
+//-----------------------------------------------------------------------------
+// I/O
+struct cluster;
+struct $io_context;
+struct $io_arbiter;
+struct io_context_params {
+        int num_entries;
+};
+void  ?{}(io_context_params & this);
 //-----------------------------------------------------------------------------
 …
         pthread_t kernel_thread;
+        struct {
+                $io_context * ctx;
+                bool pending;
+                bool dirty;
+        } io;
         // Preemption data
         // Node which is added in the discrete event simulation
 …
         // Idle lock (kernel semaphore)
         __bin_sem_t idle;
+        int idle;
         // Termination synchronisation (user semaphore)
         semaphore terminated;
+        oneshot terminated;
         // pthread Stack
 …
 DLISTED_MGD_IMPL_OUT(processor)
-//-----------------------------------------------------------------------------
-// I/O
-struct __io_data;
-// IO poller user-thread
-// Not using the "thread" keyword because we want to control
-// more carefully when to start/stop it
-struct $io_ctx_thread {
-        struct __io_data * ring;
-        single_sem sem;
-        volatile bool done;
-        $thread self;
-};
-struct io_context {
-        $io_ctx_thread thrd;
-};
-struct io_context_params {
-        int num_entries;
-        int num_ready;
-        int submit_aff;
-        bool eager_submits:1;
-        bool poller_submits:1;
-        bool poll_submit:1;
-        bool poll_complete:1;
-};
-void  ?{}(io_context_params & this);
-void  ?{}(io_context & this, struct cluster & cl);
-void  ?{}(io_context & this, struct cluster & cl, const io_context_params & params);
-void ^?{}(io_context & this);
-struct io_cancellation {
-        __u64 target;
-};
-static inline void  ?{}(io_cancellation & this) { this.target = -1u; }
-static inline void ^?{}(io_cancellation &) {}
-bool cancel(io_cancellation & this);
 //-----------------------------------------------------------------------------
 …
         struct {
                 io_context * ctxs;
                 unsigned cnt;
+                $io_arbiter * arbiter;
+                io_context_params params;
         } io;

libcfa/src/concurrency/kernel/fwd.hfa

-              r342af53
+              r8e4aa05
 // file "LICENCE" distributed with Cforall.
 //
+// kernel/fwd.hfa --
+// kernel/fwd.hfa -- PUBLIC
+// Fundamental code needed to implement threading M.E.S. algorithms.
 //
 // Author           : Thierry Delisle
 …
                 extern uint64_t thread_rand();
+                // Semaphore which only supports a single thread
+                struct single_sem {
+                        struct $thread * volatile ptr; // state word: 0p = idle, 1p = a post is pending, any other value = the thread currently waiting
+                };
+                static inline { // constructor, destructor, wait and post for single_sem
+                        void  ?{}(single_sem & this) { // Constructor: start idle (no waiter, no pending post)
+                                this.ptr = 0p;
+                        }
+                        void ^?{}(single_sem &) {} // Destructor: intentionally empty
+                        bool wait(single_sem & this) { // Consumes a pending post or parks the caller; returns true iff the caller parked
+                                for() { // retry until one of the compare-exchanges succeeds
+                                        struct $thread * expected = this.ptr;
+                                        if(expected == 1p) { // a post already happened: consume it without blocking
+                                                if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+                                                        return false;
+                                                }
+                                        }
+                                        else {
+                                                /* paranoid */ verify( expected == 0p ); // only one waiter is supported, so no other thread may be stored here
+                                                if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) { // publish ourselves as the waiter
+                                                        park();
+                                                        return true;
+                                                }
+                                        }
+                                }
+                        }
+                        bool post(single_sem & this) { // Wakes the waiter if there is one, otherwise records a pending post; returns true iff a thread was unparked
+                                for() { // retry until one of the compare-exchanges succeeds
+                                        struct $thread * expected = this.ptr;
+                                        if(expected == 1p) return false; // a post is already pending: nothing more to record
+                                        if(expected == 0p) { // no waiter yet: mark the post as pending
+                                                if(__atomic_compare_exchange_n(&this.ptr, &expected, 1p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+                                                        return false;
+                                                }
+                                        }
+                                        else { // a thread is waiting: reset to idle and wake it
+                                                if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+                                                        unpark( expected );
+                                                        return true;
+                                                }
+                                        }
+                                }
+                        }
+                }
+                // Synchronization primitive which only supports a single waiting thread and a single post
+                // Similar to a binary semaphore with 'one shot' semantics:
+                // it is expected to be discarded after each party has called its side
+                struct oneshot {
+                        // Internal state :
+                        //     0p     : is initial state (wait will block)
+                        //     1p     : fulfilled (wait won't block)
+                        // any thread : a thread is currently waiting
+                        struct $thread * volatile ptr;
+                };
+                static inline { // constructor, destructor, wait and post for oneshot
+                        void  ?{}(oneshot & this) { // start in the initial (not fulfilled) state
+                                this.ptr = 0p;
+                        }
+                        void ^?{}(oneshot &) {} // nothing to release
+                        // Wait for the post, return immediately if it already happened.
+                        // return true if the thread was parked
+                        bool wait(oneshot & this) {
+                                for() { // retry if the compare-exchange loses a race with post
+                                        struct $thread * expected = this.ptr;
+                                        if(expected == 1p) return false; // already fulfilled, nothing to wait for
+                                        /* paranoid */ verify( expected == 0p ); // any other value would mean a second waiter
+                                        if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) { // publish this thread as the waiter
+                                                park();
+                                                /* paranoid */ verify( this.ptr == 1p ); // post exchanges in 1p before unparking
+                                                return true;
+                                        }
+                                }
+                        }
+                        // Mark as fulfilled, wake thread if needed
+                        // return true if a thread was unparked
+                        bool post(oneshot & this) {
+                                struct $thread * got = __atomic_exchange_n( &this.ptr, 1p, __ATOMIC_SEQ_CST); // unconditionally mark fulfilled, fetch the previous state
+                                if( got == 0p ) return false; // nobody was waiting
+                                unpark( got ); // NOTE(review): a second post would observe got == 1p and reach this line; callers must post only once
+                                return true;
+                        }
+                }
+                // Base type for futures to build upon
+                // It is based on the 'oneshot' type to allow multiple futures
+                // to block on the same instance, permitting users to block a single
+                // thread on "any of" [a given set of] futures.
+                // Does not support multiple threads waiting on the same future
+                struct future_t {
+                        // Internal state :
+                        //     0p      : is initial state (wait will block)
+                        //     1p      : fulfilled (wait won't block)
+                        //     2p      : in progress (fulfil is posting to the wait context, 1p is stored right after)
+                        //     3p      : abandoned, server should delete
+                        // any oneshot : a context has been setup to wait, a thread could wait on it
+                        struct oneshot * volatile ptr;
+                };
+                static inline {
+                        void  ?{}(future_t & this) { // start in the initial (not fulfilled) state
+                                this.ptr = 0p;
+                        }
+                        void ^?{}(future_t &) {} // nothing to release
+                        void reset(future_t & this) { // put the future back in the initial state so it can be waited on again
+                                // needs to be in 0p or 1p (not checked here: the previous value is discarded)
+                                __atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);
+                        }
+                        // check if the future is available, i.e., in the fulfilled (1p) state; plain volatile read, never blocks
+                        bool available( future_t & this ) {
+                                return this.ptr == 1p;
+                        }
+                        // Prepare the future to be waited on: install wait_ctx as the wait context; returns false if already fulfilled, true once installed
+                        // intended to be used by wait, wait_any, waitfor, etc. rather than used directly
+                        bool setup( future_t & this, oneshot & wait_ctx ) {
+                                /* paranoid */ verify( wait_ctx.ptr == 0p ); // the context must be fresh (nobody waiting on it, not posted)
+                                // The future needs to set the wait context
+                                for() {
+                                        struct oneshot * expected = this.ptr;
+                                        // Is the future already fulfilled?
+                                        if(expected == 1p) return false; // Yes, just return false (didn't block)
+                                        // The future is not fulfilled, try to setup the wait context
+                                        /* paranoid */ verify( expected == 0p ); // any other value would mean another context is already installed
+                                        if(__atomic_compare_exchange_n(&this.ptr, &expected, &wait_ctx, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+                                                return true;
+                                        }
+                                }
+                        }
+                        // Stop waiting on a future
+                        // When multiple futures are waited for together in an "any of" pattern,
+                        // futures that weren't fulfilled before the thread woke up
+                        // should retract the wait ctx
+                        // intended to be used by wait, wait_any, waitfor, etc. rather than used directly
+                        void retract( future_t & this, oneshot & wait_ctx ) {
+                                // Remove the wait context (the future is unconditionally swapped back to 0p)
+                                struct oneshot * got = __atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);
+                                // got == 0p: future was never actually setup, just return
+                                if( got == 0p ) return;
+                                // got == wait_ctx: since fulfil does an atomic_swap,
+                                // if we got back the original then no one else saw context
+                                // It is safe to delete (which could happen after the return)
+                                if( got == &wait_ctx ) return;
+                                // got == 1p: the future is ready and the context was fully consumed
+                                // the server won't use the pointer again
+                                // It is safe to delete (which could happen after the return)
+                                if( got == 1p ) return; // NOTE(review): the exchange above has left this.ptr at 0p on this path
+                                // got == 2p: the future is ready but the context hasn't fully been consumed
+                                // spin until it is safe to move on
+                                if( got == 2p ) {
+                                        while( this.ptr != 1p ) Pause(); // fulfil stores 1p once it is done posting to the context
+                                        return;
+                                }
+                                // got == anything else: something went wrong here, abort
+                                abort("Future in unexpected state");
+                        }
+                        // Mark the future as abandoned, meaning it will be deleted by the server
+                        // if it was not fulfilled yet, or deleted right here if it already was
+                        // return true if the future was freed by this call, false if deletion is left to fulfil
+                        bool abandon( future_t & this ) {
+                                /* paranoid */ verify( this.ptr != 3p );
+                                // Mark the future as abandoned
+                                struct oneshot * got = __atomic_exchange_n( &this.ptr, 3p, __ATOMIC_SEQ_CST);
+                                // If the future isn't already fulfilled, let the server delete it
+                                if( got == 0p ) return false;
+                                // got == 2p: the future is ready but the context hasn't fully been consumed
+                                // spin until it is safe to move on (fulfil stores 1p when it is done)
+                                if( got == 2p ) {
+                                        while( this.ptr != 1p ) Pause();
+                                        got = 1p;
+                                }
+                                // The future is completed delete it now
+                                // Check the state that was observed rather than this.ptr:
+                                // after the spin above this.ptr is 1p, so checking this.ptr != 1p could never hold on that path
+                                /* paranoid */ verify( got == 1p );
+                                free( &this );
+                                return true;
+                        }
+                        // from the server side, mark the future as fulfilled
+                        // delete it if needed (abandoned future); return true only if a waiting thread was unparked
+                        bool fulfil( future_t & this ) {
+                                for() {
+                                        struct oneshot * expected = this.ptr;
+                                        // was this abandoned? if so the server is responsible for freeing the future
+                                        #if defined(__GNUC__) && __GNUC__ >= 7
+                                                #pragma GCC diagnostic push
+                                                #pragma GCC diagnostic ignored "-Wfree-nonheap-object"
+                                        #endif
+                                                if( expected == 3p ) { free( &this ); return false; }
+                                        #if defined(__GNUC__) && __GNUC__ >= 7
+                                                #pragma GCC diagnostic pop
+                                        #endif
+                                        /* paranoid */ verify( expected != 1p ); // Future is already fulfilled, should not happen
+                                        /* paranoid */ verify( expected != 2p ); // Future is being fulfilled by someone else, this is even less supported than the previous case.
+                                        // If there is a wait context, we need to consume it and mark it as consumed after
+                                        // If there is no context then we can skip the in progress phase
+                                        struct oneshot * want = expected == 0p ? 1p : 2p;
+                                        if(__atomic_compare_exchange_n(&this.ptr, &expected, want, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+                                                if( expected == 0p ) { /* paranoid */ verify( this.ptr == 1p); return false; } // no waiter, already marked fulfilled
+                                                bool ret = post( *expected ); // expected is the wait context: wake its thread if one is parked
+                                                __atomic_store_n( &this.ptr, 1p, __ATOMIC_SEQ_CST); // leave the in progress state, retract/abandon/wait spin on this
+                                                return ret;
+                                        }
+                                }
+                        }
+                        // Wait for the future to be fulfilled; returns false if it already was, otherwise the result of waiting on the oneshot
+                        bool wait( future_t & this ) {
+                                oneshot temp; // wait context local to this call
+                                if( !setup(this, temp) ) return false;
+                                // Wait context is setup, just wait on it
+                                bool ret = wait( temp );
+                                // Wait for the future to leave the in progress state, i.e., for fulfil to be done with the context
+                                while( this.ptr == 2p ) Pause();
+                                // Make sure the state makes sense
+                                // Should be fulfilled, could be in progress but it's out of date if so
+                                // since if that is the case, the oneshot was fulfilled (unparking this thread)
+                                // and the oneshot should not be needed any more
+                                __attribute__((unused)) struct oneshot * was = this.ptr;
+                                /* paranoid */ verifyf( was == 1p, "Expected this.ptr to be 1p, was %p\n", was );
+                                // Mark the future as fulfilled, to be consistent
+                                // with potential calls to available (disabled: fulfil already stores 1p)
+                                // this.ptr = 1p;
+                                return ret;
+                        }
+                }
                 //-----------------------------------------------------------------------
                 // Statics call at the end of each thread to register statistics

libcfa/src/concurrency/kernel/startup.cfa

-              r342af53
+              r8e4aa05
 extern "C" {
       #include <limits.h>       // PTHREAD_STACK_MIN
+        #include <sys/eventfd.h>  // eventfd
       #include <sys/mman.h>     // mprotect
       #include <sys/resource.h> // getrlimit
 …
 extern void __kernel_alarm_startup(void);
 extern void __kernel_alarm_shutdown(void);
-extern void __kernel_io_startup (void);
-extern void __kernel_io_shutdown(void);
 //-----------------------------------------------------------------------------
 …
 KERNEL_STORAGE($thread,              mainThread);
 KERNEL_STORAGE(__stack_t,            mainThreadCtx);
-KERNEL_STORAGE(io_context,           mainPollerThread);
 KERNEL_STORAGE(__scheduler_RWLock_t, __scheduler_lock);
 #if !defined(__CFA_NO_STATISTICS__)
 …
         void ?{}(processor & this) with( this ) {
+                ( this.idle ){};
+                ( this.terminated ){ 0 };
+                ( this.terminated ){};
                 ( this.runner ){};
                 init( this, "Main Processor", *mainCluster );
 …
         __kernel_alarm_startup();
-        // Start IO
-        __kernel_io_startup();
         // Add the main thread to the ready queue
         // once resume is called on mainProcessor->runner the mainThread needs to be scheduled like any normal thread
 …
         // THE SYSTEM IS NOW COMPLETELY RUNNING
-        // SKULLDUGGERY: The constructor for the mainCluster will call alloc with a dimension of 0
-        // malloc *can* return a non-null value, we should free it if that is the case
-        free( mainCluster->io.ctxs );
-        // Now that the system is up, finish creating systems that need threading
-        mainCluster->io.ctxs = (io_context *)&storage_mainPollerThread;
-        mainCluster->io.cnt  = 1;
-        (*mainCluster->io.ctxs){ *mainCluster };
         __cfadbg_print_safe(runtime_core, "Kernel : Started\n--------------------------------------------------\n\n");
 …
 static void __kernel_shutdown(void) {
-        //Before we start shutting things down, wait for systems that need threading to shutdown
-        ^(*mainCluster->io.ctxs){};
-        mainCluster->io.cnt  = 0;
-        mainCluster->io.ctxs = 0p;
         /* paranoid */ verify( __preemption_enabled() );
         disable_interrupts();
 …
         // Disable preemption
         __kernel_alarm_shutdown();
-        // Stop IO
-        __kernel_io_shutdown();
         // Destroy the main processor and its context in reverse order of construction
 …
         pending_preemption = false;
+        this.io.ctx = 0p;
+        this.io.pending = false;
+        this.io.dirty   = false;
+        this.idle = eventfd(0, 0);
+        if (idle < 0) {
+                abort("KERNEL ERROR: PROCESSOR EVENTFD - %s\n", strerror(errno));
+        }
         #if !defined(__CFA_NO_STATISTICS__)
                 print_stats = 0;
 …
         // Finally we don't need the read_lock any more
         unregister((__processor_id_t*)&this);
+        close(this.idle);
+}
 void ?{}(processor & this, const char name[], cluster & _cltr) {
+        ( this.idle ){};
+        ( this.terminated ){ 0 };
+        ( this.terminated ){};
         ( this.runner ){};
 …
                 __wake_proc( &this );
                 P( terminated );
+                wait( terminated );
                 /* paranoid */ verify( active_processor() != &this);
+        }
 …
         threads{ __get };
+        io.arbiter = create();
+        io.params = io_params;
         doregister(this);
 …
         ready_mutate_unlock( last_size );
         enable_interrupts_noPoll(); // Don't poll, could be in main cluster
-        this.io.cnt  = num_io;
-        this.io.ctxs = aalloc(num_io);
-        for(i; this.io.cnt) {
-                (this.io.ctxs[i]){ this, io_params };
+        }
+}
 void ^?{}(cluster & this) {
+        for(i; this.io.cnt) {
+                ^(this.io.ctxs[i]){ true };
+        }
+        free(this.io.ctxs);
+        destroy(this.io.arbiter);
         // Lock the RWlock so no-one pushes/pops while we are changing the queue
 …
+}
 #if defined(__CFA_WITH_VERIFY__)
 static bool verify_fwd_bck_rng(void) {

libcfa/src/concurrency/kernel_private.hfa

-              r342af53
+              r8e4aa05
 //-----------------------------------------------------------------------------
 // I/O
+void ^?{}(io_context & this, bool );
+$io_arbiter * create(void);
+void destroy($io_arbiter *);
 //=======================================================================

libcfa/src/concurrency/locks.cfa

-              r342af53
+              r8e4aa05
+//
+// Cforall Version 1.0.0 Copyright (C) 2021 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// locks.cfa -- LIBCFATHREAD
+// Runtime locks that are used with the runtime thread system.
+//
+// Author           : Colby Alexander Parsons
+// Created On       : Thu Jan 21 19:46:50 2021
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+#define __cforall_thread__
 #include "locks.hfa"
 #include "kernel_private.hfa"
 …
 //-----------------------------------------------------------------------------
 // info_thread
 forall(dtype L | is_blocking_lock(L)) {
+forall(L & | is_blocking_lock(L)) {
         struct info_thread {
                 // used to put info_thread on a dl queue (aka sequence)
 …
 void ^?{}( blocking_lock & this ) {}
+void  ?{}( single_acquisition_lock & this ) {((blocking_lock &)this){ false, false };}
+void ^?{}( single_acquisition_lock & this ) {}
+void  ?{}( owner_lock & this ) {((blocking_lock &)this){ true, true };}
+void ^?{}( owner_lock & this ) {}
+void  ?{}( multiple_acquisition_lock & this ) {((blocking_lock &)this){ true, false };}
+void ^?{}( multiple_acquisition_lock & this ) {}
 void lock( blocking_lock & this ) with( this ) {
 …
 //-----------------------------------------------------------------------------
-// Overloaded routines for traits
-// These routines are temporary until an inheritance bug is fixed
-void   lock      ( single_acquisition_lock & this ) { lock   ( (blocking_lock &)this ); }
-void   unlock    ( single_acquisition_lock & this ) { unlock ( (blocking_lock &)this ); }
-void   on_wait   ( single_acquisition_lock & this ) { on_wait( (blocking_lock &)this ); }
-void   on_notify ( single_acquisition_lock & this, struct $thread * t ) { on_notify( (blocking_lock &)this, t ); }
-void   set_recursion_count( single_acquisition_lock & this, size_t recursion ) { set_recursion_count( (blocking_lock &)this, recursion ); }
-size_t get_recursion_count( single_acquisition_lock & this ) { return get_recursion_count( (blocking_lock &)this ); }
-void   lock     ( owner_lock & this ) { lock   ( (blocking_lock &)this ); }
-void   unlock   ( owner_lock & this ) { unlock ( (blocking_lock &)this ); }
-void   on_wait  ( owner_lock & this ) { on_wait( (blocking_lock &)this ); }
-void   on_notify( owner_lock & this, struct $thread * t ) { on_notify( (blocking_lock &)this, t ); }
-void   set_recursion_count( owner_lock & this, size_t recursion ) { set_recursion_count( (blocking_lock &)this, recursion ); }
-size_t get_recursion_count( owner_lock & this ) { return get_recursion_count( (blocking_lock &)this ); }
-void   lock     ( multiple_acquisition_lock & this ) { lock   ( (blocking_lock &)this ); }
-void   unlock   ( multiple_acquisition_lock & this ) { unlock ( (blocking_lock &)this ); }
-void   on_wait  ( multiple_acquisition_lock & this ) { on_wait( (blocking_lock &)this ); }
-void   on_notify( multiple_acquisition_lock & this, struct $thread * t ){ on_notify( (blocking_lock &)this, t ); }
-void   set_recursion_count( multiple_acquisition_lock & this, size_t recursion ){ set_recursion_count( (blocking_lock &)this, recursion ); }
-size_t get_recursion_count( multiple_acquisition_lock & this ){ return get_recursion_count( (blocking_lock &)this ); }
-//-----------------------------------------------------------------------------
 // alarm node wrapper
 forall(dtype L | is_blocking_lock(L)) {
+forall(L & | is_blocking_lock(L)) {
         struct alarm_node_wrap {
                 alarm_node_t alarm_node;
 …
 //-----------------------------------------------------------------------------
 // condition variable
 forall(dtype L | is_blocking_lock(L)) {
+forall(L & | is_blocking_lock(L)) {
         void ?{}( condition_variable(L) & this ){
 …
         bool wait( condition_variable(L) & this, L & l, uintptr_t info, Time time         ) with(this) { WAIT_TIME( info, &l , time ) }
+}
+//-----------------------------------------------------------------------------
+// Semaphore
+void  ?{}( semaphore & this, int count = 1 ) { // build a semaphore holding `count` initial permits
+        (this.lock){}; // default-construct the spinlock
+        this.count = count;
+        (this.waiting){}; // default-construct the queue of blocked threads
+}
+void ^?{}(semaphore & this) {} // nothing to release
+// Acquire one permit, blocking the calling thread when none is left.
+// return true if the thread had to park, false if the permit was taken immediately
+bool P(semaphore & this) with( this ){
+        lock( lock __cfaabi_dbg_ctx2 );
+        count -= 1;
+        // fast path: a permit was available, nothing to wait for
+        if ( count >= 0 ) {
+                unlock( lock );
+                return false;
+        }
+        // slow path: enqueue the current thread while still holding the spin lock,
+        // then release the lock and block until a V unparks this thread
+        append( waiting, active_thread() );
+        unlock( lock );
+        park();
+        return true;
+}
+// Release one permit and hand it to the thread at the head of the waiting queue, if any.
+// return true if a thread was unparked
+bool V(semaphore & this) with( this ) {
+        $thread * woken = 0p;
+        lock( lock __cfaabi_dbg_ctx2 );
+        count += 1;
+        // a count that is still <= 0 after the increment means a thread is queued
+        if ( count <= 0 ) woken = pop_head( waiting );
+        unlock( lock );
+        // wake the new owner outside of the critical section
+        unpark( woken );
+        return woken != 0p;
+}
+// Release `diff` permits at once, waking at most as many blocked threads as there are permits released.
+// return true if at least one thread was unparked
+bool V(semaphore & this, unsigned diff) with( this ) {
+        bool woke = false;
+        lock( lock __cfaabi_dbg_ctx2 );
+        // P enqueues a thread exactly when it drives count below zero,
+        // so a negative count is the number of blocked threads
+        int blocked = count < 0 ? -count : 0;
+        // wake no more threads than are blocked and no more than permits released
+        // (taking the max here released threads that were never granted a permit)
+        int release = (int)diff < blocked ? (int)diff : blocked;
+        count += diff;
+        for(release) {
+                $thread * thrd = pop_head( waiting );
+                if( thrd != 0p ) {
+                        unpark( thrd );
+                        woke = true;
+                }
+        }
+        unlock( lock );
+        return woke;
+}

libcfa/src/concurrency/locks.hfa

-              r342af53
+              r8e4aa05
+//
+// Cforall Version 1.0.0 Copyright (C) 2021 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// locks.hfa -- PUBLIC
+// Runtime locks that are used with the runtime thread system.
+//
+// Author           : Colby Alexander Parsons
+// Created On       : Thu Jan 21 19:46:50 2021
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
 #pragma once
 #include <stdbool.h>
+#include "bits/locks.hfa"
+#include "bits/sequence.hfa"
+#include "invoke.h"
+#include "bits/weakso_locks.hfa"
 #include "time_t.hfa"
 #include "time.hfa"
+//---------- single_acquisition_lock: blocking_lock built with (false, false); every routine below forwards to the blocking_lock overload
+struct single_acquisition_lock {
+        inline blocking_lock;
+};
+static inline void  ?{}( single_acquisition_lock & this ) {((blocking_lock &)this){ false, false };} // NOTE(review): presumably (multi_acquisition, strict_owner) — confirm against the blocking_lock constructor
+static inline void ^?{}( single_acquisition_lock & this ) {}
+static inline void   lock      ( single_acquisition_lock & this ) { lock   ( (blocking_lock &)this ); }
+static inline void   unlock    ( single_acquisition_lock & this ) { unlock ( (blocking_lock &)this ); }
+static inline void   on_wait   ( single_acquisition_lock & this ) { on_wait( (blocking_lock &)this ); }
+static inline void   on_notify ( single_acquisition_lock & this, struct $thread * t ) { on_notify( (blocking_lock &)this, t ); }
+static inline void   set_recursion_count( single_acquisition_lock & this, size_t recursion ) { set_recursion_count( (blocking_lock &)this, recursion ); }
+static inline size_t get_recursion_count( single_acquisition_lock & this ) { return get_recursion_count( (blocking_lock &)this ); }
+//---------- owner_lock: blocking_lock built with (true, true); every routine below forwards to the blocking_lock overload
+struct owner_lock {
+        inline blocking_lock;
+};
+static inline void  ?{}( owner_lock & this ) {((blocking_lock &)this){ true, true };} // NOTE(review): presumably (multi_acquisition, strict_owner) — confirm against the blocking_lock constructor
+static inline void ^?{}( owner_lock & this ) {}
+static inline void   lock     ( owner_lock & this ) { lock   ( (blocking_lock &)this ); }
+static inline void   unlock   ( owner_lock & this ) { unlock ( (blocking_lock &)this ); }
+static inline void   on_wait  ( owner_lock & this ) { on_wait( (blocking_lock &)this ); }
+static inline void   on_notify( owner_lock & this, struct $thread * t ) { on_notify( (blocking_lock &)this, t ); }
+static inline void   set_recursion_count( owner_lock & this, size_t recursion ) { set_recursion_count( (blocking_lock &)this, recursion ); }
+static inline size_t get_recursion_count( owner_lock & this ) { return get_recursion_count( (blocking_lock &)this ); }
 //-----------------------------------------------------------------------------
 // is_blocking_lock
 trait is_blocking_lock(dtype L | sized(L)) {
+trait is_blocking_lock(L & | sized(L)) {
         // For synchronization locks to use when acquiring
         void on_notify( L &, struct $thread * );
 …
 // the info thread is a wrapper around a thread used
 // to store extra data for use in the condition variable
 forall(dtype L | is_blocking_lock(L)) {
+forall(L & | is_blocking_lock(L)) {
         struct info_thread;
 …
 //-----------------------------------------------------------------------------
-// Blocking Locks
-struct blocking_lock {
-        // Spin lock used for mutual exclusion
-        __spinlock_t lock;
-        // List of blocked threads
-        Sequence( $thread ) blocked_threads;
-        // Count of current blocked threads
-        size_t wait_count;
-        // Flag if the lock allows multiple acquisition
-        bool multi_acquisition;
-        // Flag if lock can be released by non owner
-        bool strict_owner;
-        // Current thread owning the lock
-        struct $thread * owner;
-        // Number of recursion level
-        size_t recursion_count;
-};
-struct single_acquisition_lock {
-        inline blocking_lock;
-};
-struct owner_lock {
-        inline blocking_lock;
-};
-struct multiple_acquisition_lock {
-        inline blocking_lock;
-};
-void  ?{}( blocking_lock & this, bool multi_acquisition, bool strict_owner );
-void ^?{}( blocking_lock & this );
-void  ?{}( single_acquisition_lock & this );
-void ^?{}( single_acquisition_lock & this );
-void  ?{}( owner_lock & this );
-void ^?{}( owner_lock & this );
-void  ?{}( multiple_acquisition_lock & this );
-void ^?{}( multiple_acquisition_lock & this );
-void lock( blocking_lock & this );
-bool try_lock( blocking_lock & this );
-void unlock( blocking_lock & this );
-void on_notify( blocking_lock & this, struct $thread * t );
-void on_wait( blocking_lock & this );
-size_t wait_count( blocking_lock & this );
-void set_recursion_count( blocking_lock & this, size_t recursion );
-size_t get_recursion_count( blocking_lock & this );
-void lock( single_acquisition_lock & this );
-void unlock( single_acquisition_lock & this );
-void on_notify( single_acquisition_lock & this, struct $thread * t );
-void on_wait( single_acquisition_lock & this );
-void set_recursion_count( single_acquisition_lock & this, size_t recursion );
-size_t get_recursion_count( single_acquisition_lock & this );
-void lock( owner_lock & this );
-void unlock( owner_lock & this );
-void on_notify( owner_lock & this, struct $thread * t );
-void on_wait( owner_lock & this );
-void set_recursion_count( owner_lock & this, size_t recursion );
-size_t get_recursion_count( owner_lock & this );
-void lock( multiple_acquisition_lock & this );
-void unlock( multiple_acquisition_lock & this );
-void on_notify( multiple_acquisition_lock & this, struct $thread * t );
-void on_wait( multiple_acquisition_lock & this );
-void set_recursion_count( multiple_acquisition_lock & this, size_t recursion );
-size_t get_recursion_count( multiple_acquisition_lock & this );
-//-----------------------------------------------------------------------------
 // Synchronization Locks
 forall(dtype L | is_blocking_lock(L)) {
+forall(L & | is_blocking_lock(L)) {
         struct condition_variable {
                 // Spin lock used for mutual exclusion
 …
         bool wait( condition_variable(L) & this, L & l, uintptr_t info, Time time );
+}
+//-----------------------------------------------------------------------------
+// Semaphore: counting semaphore whose blocked threads are kept in a queue
+struct semaphore {
+        __spinlock_t lock; // protects count and waiting
+        int count; // permits left; P blocks when its decrement makes this negative
+        __queue_t($thread) waiting; // threads parked in P, woken from the head by V
+};
+void  ?{}(semaphore & this, int count = 1); // count is the initial number of permits
+void ^?{}(semaphore & this);
+bool   P (semaphore & this); // acquire one permit, returns true if the caller had to park
+bool   V (semaphore & this); // release one permit, returns true if a thread was unparked
+bool   V (semaphore & this, unsigned count); // release `count` permits at once

libcfa/src/concurrency/monitor.cfa

-              r342af53
+              r8e4aa05
 static inline [$thread *, int] search_entry_queue( const __waitfor_mask_t &, $monitor * monitors [], __lock_size_t count );
 forall(dtype T | sized( T ))
+forall(T & | sized( T ))
 static inline __lock_size_t insert_unique( T * array [], __lock_size_t & size, T * val );
 static inline __lock_size_t count_max    ( const __waitfor_mask_t & mask );
 …
+}
 forall(dtype T | sized( T ))
+forall(T & | sized( T ))
 static inline __lock_size_t insert_unique( T * array [], __lock_size_t & size, T * val ) {
         if( !val ) return size;

libcfa/src/concurrency/monitor.hfa

-              r342af53
+              r8e4aa05
 #include "stdlib.hfa"
 trait is_monitor(dtype T) {
+trait is_monitor(T &) {
         $monitor * get_monitor( T & );
         void ^?{}( T & mutex );
 …
 void ^?{}( monitor_dtor_guard_t & this );
 static inline forall( dtype T | sized(T) | { void ^?{}( T & mutex ); } )
+static inline forall( T & | sized(T) | { void ^?{}( T & mutex ); } )
 void delete( T * th ) {
         ^(*th){};
+        if(th) ^(*th){};
         free( th );
+}

libcfa/src/concurrency/mutex.cfa

-              r342af53
+              r8e4aa05
+}
 forall(dtype L | is_lock(L))
+forall(L & | is_lock(L))
 void wait(condition_variable & this, L & l) {
         lock( this.lock __cfaabi_dbg_ctx2 );
 …
 //-----------------------------------------------------------------------------
 // Scopes
 forall(dtype L | is_lock(L))
+forall(L & | is_lock(L))
 void lock_all  ( L * locks[], size_t count) {
         // Sort locks based on addresses
 …
+}
 forall(dtype L | is_lock(L))
+forall(L & | is_lock(L))
 void unlock_all( L * locks[], size_t count) {
         // Lock all

libcfa/src/concurrency/mutex.hfa

-              r342af53
+              r8e4aa05
 };
 void ?{}(mutex_lock & this);
 void ^?{}(mutex_lock & this);
 void lock(mutex_lock & this);
 bool try_lock(mutex_lock & this);
 void unlock(mutex_lock & this);
+void ?{}(mutex_lock & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
+void ^?{}(mutex_lock & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
+void lock(mutex_lock & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
+bool try_lock(mutex_lock & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
+void unlock(mutex_lock & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
 // Exclusive lock - recursive
 …
 };
 void ?{}(recursive_mutex_lock & this);
 void ^?{}(recursive_mutex_lock & this);
 void lock(recursive_mutex_lock & this);
 bool try_lock(recursive_mutex_lock & this);
 void unlock(recursive_mutex_lock & this);
+void ?{}(recursive_mutex_lock & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
+void ^?{}(recursive_mutex_lock & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
+void lock(recursive_mutex_lock & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
+bool try_lock(recursive_mutex_lock & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
+void unlock(recursive_mutex_lock & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
 trait is_lock(dtype L | sized(L)) {
+trait is_lock(L & | sized(L)) {
         void lock  (L &);
         void unlock(L &);
 …
 };
 void ?{}(condition_variable & this);
 void ^?{}(condition_variable & this);
+void ?{}(condition_variable & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
+void ^?{}(condition_variable & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
 void notify_one(condition_variable & this);
 void notify_all(condition_variable & this);
+void notify_one(condition_variable & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
+void notify_all(condition_variable & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
 void wait(condition_variable & this);
+void wait(condition_variable & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
 forall(dtype L | is_lock(L))
 void wait(condition_variable & this, L & l);
+forall(L & | is_lock(L))
+void wait(condition_variable & this, L & l) __attribute__((deprecated("use concurrency/locks.hfa instead")));
 //-----------------------------------------------------------------------------
 // Scopes
 forall(dtype L | is_lock(L)) {
+forall(L & | is_lock(L)) {
         #if !defined( __TUPLE_ARRAYS_EXIST__ )
         void lock  ( L * locks [], size_t count);

libcfa/src/concurrency/preemption.cfa

-              r342af53
+              r8e4aa05
 static void timeout( $thread * this ) {
         unpark( this );
+}
+void __disable_interrupts_hard() {
+        sigset_t oldset;
+        int ret;
+        ret = pthread_sigmask(0, ( const sigset_t * ) 0p, &oldset);  // workaround trac#208: cast should be unnecessary
+        if(ret != 0) { abort("ERROR sigprocmask returned %d", ret); }
+        ret = sigismember(&oldset, SIGUSR1);
+        if(ret <  0) { abort("ERROR sigismember returned %d", ret); }
+        if(ret == 1) { abort("ERROR SIGUSR1 is disabled"); }
+        ret = sigismember(&oldset, SIGALRM);
+        if(ret <  0) { abort("ERROR sigismember returned %d", ret); }
+        if(ret == 0) { abort("ERROR SIGALRM is enabled"); }
+        signal_block( SIGUSR1 );
+}
+void __enable_interrupts_hard() {
+        signal_unblock( SIGUSR1 );
+        sigset_t oldset;
+        int ret;
+        ret = pthread_sigmask(0, ( const sigset_t * ) 0p, &oldset);  // workaround trac#208: cast should be unnecessary
+        if(ret != 0) { abort("ERROR sigprocmask returned %d", ret); }
+        ret = sigismember(&oldset, SIGUSR1);
+        if(ret <  0) { abort("ERROR sigismember returned %d", ret); }
+        if(ret == 1) { abort("ERROR SIGUSR1 is disabled"); }
+        ret = sigismember(&oldset, SIGALRM);
+        if(ret <  0) { abort("ERROR sigismember returned %d", ret); }
+        if(ret == 0) { abort("ERROR SIGALRM is enabled"); }
+}
 …
         // Setup proper signal handlers
         __cfaabi_sigaction( SIGUSR1, sigHandler_ctxSwitch, SA_SIGINFO | SA_RESTART ); // __cfactx_switch handler
         __cfaabi_sigaction( SIGALRM, sigHandler_alarm    , SA_SIGINFO | SA_RESTART ); // debug handler
+        __cfaabi_sigaction( SIGUSR1, sigHandler_ctxSwitch, SA_SIGINFO ); // __cfactx_switch handler
+        __cfaabi_sigaction( SIGALRM, sigHandler_alarm    , SA_SIGINFO ); // debug handler
         signal_block( SIGALRM );
 …
         __cfaabi_dbg_print_safe( "Kernel : Preemption stopped\n" );
+}
+// Prevent preemption since we are about to start terminating things
+void __kernel_abort_lock(void) {
+        signal_block( SIGUSR1 );
+}

libcfa/src/concurrency/ready_queue.cfa

r342af53	r8e4aa05
330	330	#if defined(BIAS)
331	331	// Don't bother trying locally too much
332		~~int local_tries = 8;~~
333	332	preferred = kernelTLS().this_processor->id * 4;
334	333	#endif

libcfa/src/concurrency/stats.cfa

-              r342af53
+              r8e4aa05
                 #if defined(CFA_HAVE_LINUX_IO_URING_H)
+                        stats->io.submit_q.submit_avg.rdy = 0;
+                        stats->io.submit_q.submit_avg.csm = 0;
+                        stats->io.submit_q.submit_avg.cnt = 0;
+                        stats->io.submit_q.look_avg.val   = 0;
+                        stats->io.submit_q.look_avg.cnt   = 0;
+                        stats->io.submit_q.look_avg.block = 0;
+                        stats->io.submit_q.alloc_avg.val   = 0;
+                        stats->io.submit_q.alloc_avg.cnt   = 0;
+                        stats->io.submit_q.alloc_avg.block = 0;
+                        stats->io.submit_q.helped = 0;
+                        stats->io.submit_q.leader = 0;
+                        stats->io.submit_q.busy   = 0;
+                        stats->io.complete_q.completed_avg.val = 0;
+                        stats->io.complete_q.completed_avg.cnt = 0;
+                        stats->io.complete_q.blocks = 0;
+                        stats->io.alloc.fast        = 0;
+                        stats->io.alloc.slow        = 0;
+                        stats->io.alloc.fail        = 0;
+                        stats->io.alloc.revoke      = 0;
+                        stats->io.alloc.block       = 0;
+                        stats->io.submit.fast       = 0;
+                        stats->io.submit.slow       = 0;
+                        stats->io.flush.external    = 0;
+                        stats->io.calls.flush       = 0;
+                        stats->io.calls.submitted   = 0;
+                        stats->io.calls.drain       = 0;
+                        stats->io.calls.completed   = 0;
+                        stats->io.calls.errors.busy = 0;
+                        stats->io.poller.sleeps     = 0;
                 #endif
+        }
 …
                 #if defined(CFA_HAVE_LINUX_IO_URING_H)
+                        __atomic_fetch_add( &cltr->io.submit_q.submit_avg.rdy     , proc->io.submit_q.submit_avg.rdy     , __ATOMIC_SEQ_CST ); proc->io.submit_q.submit_avg.rdy      = 0;
+                        __atomic_fetch_add( &cltr->io.submit_q.submit_avg.csm     , proc->io.submit_q.submit_avg.csm     , __ATOMIC_SEQ_CST ); proc->io.submit_q.submit_avg.csm      = 0;
+                        __atomic_fetch_add( &cltr->io.submit_q.submit_avg.avl     , proc->io.submit_q.submit_avg.avl     , __ATOMIC_SEQ_CST ); proc->io.submit_q.submit_avg.avl      = 0;
+                        __atomic_fetch_add( &cltr->io.submit_q.submit_avg.cnt     , proc->io.submit_q.submit_avg.cnt     , __ATOMIC_SEQ_CST ); proc->io.submit_q.submit_avg.cnt      = 0;
+                        __atomic_fetch_add( &cltr->io.submit_q.look_avg.val       , proc->io.submit_q.look_avg.val       , __ATOMIC_SEQ_CST ); proc->io.submit_q.look_avg.val        = 0;
+                        __atomic_fetch_add( &cltr->io.submit_q.look_avg.cnt       , proc->io.submit_q.look_avg.cnt       , __ATOMIC_SEQ_CST ); proc->io.submit_q.look_avg.cnt        = 0;
+                        __atomic_fetch_add( &cltr->io.submit_q.look_avg.block     , proc->io.submit_q.look_avg.block     , __ATOMIC_SEQ_CST ); proc->io.submit_q.look_avg.block      = 0;
+                        __atomic_fetch_add( &cltr->io.submit_q.alloc_avg.val      , proc->io.submit_q.alloc_avg.val      , __ATOMIC_SEQ_CST ); proc->io.submit_q.alloc_avg.val       = 0;
+                        __atomic_fetch_add( &cltr->io.submit_q.alloc_avg.cnt      , proc->io.submit_q.alloc_avg.cnt      , __ATOMIC_SEQ_CST ); proc->io.submit_q.alloc_avg.cnt       = 0;
+                        __atomic_fetch_add( &cltr->io.submit_q.alloc_avg.block    , proc->io.submit_q.alloc_avg.block    , __ATOMIC_SEQ_CST ); proc->io.submit_q.alloc_avg.block     = 0;
+                        __atomic_fetch_add( &cltr->io.submit_q.helped             , proc->io.submit_q.helped             , __ATOMIC_SEQ_CST ); proc->io.submit_q.helped              = 0;
+                        __atomic_fetch_add( &cltr->io.submit_q.leader             , proc->io.submit_q.leader             , __ATOMIC_SEQ_CST ); proc->io.submit_q.leader              = 0;
+                        __atomic_fetch_add( &cltr->io.submit_q.busy               , proc->io.submit_q.busy               , __ATOMIC_SEQ_CST ); proc->io.submit_q.busy                = 0;
+                        __atomic_fetch_add( &cltr->io.complete_q.completed_avg.val, proc->io.complete_q.completed_avg.val, __ATOMIC_SEQ_CST ); proc->io.complete_q.completed_avg.val = 0;
+                        __atomic_fetch_add( &cltr->io.complete_q.completed_avg.cnt, proc->io.complete_q.completed_avg.cnt, __ATOMIC_SEQ_CST ); proc->io.complete_q.completed_avg.cnt = 0;
+                        __atomic_fetch_add( &cltr->io.complete_q.blocks           , proc->io.complete_q.blocks           , __ATOMIC_SEQ_CST ); proc->io.complete_q.blocks            = 0;
+                        __atomic_fetch_add( &cltr->io.alloc.fast       , proc->io.alloc.fast       , __ATOMIC_SEQ_CST ); proc->io.alloc.fast        = 0;
+                        __atomic_fetch_add( &cltr->io.alloc.slow       , proc->io.alloc.slow       , __ATOMIC_SEQ_CST ); proc->io.alloc.slow        = 0;
+                        __atomic_fetch_add( &cltr->io.alloc.fail       , proc->io.alloc.fail       , __ATOMIC_SEQ_CST ); proc->io.alloc.fail        = 0;
+                        __atomic_fetch_add( &cltr->io.alloc.revoke     , proc->io.alloc.revoke     , __ATOMIC_SEQ_CST ); proc->io.alloc.revoke      = 0;
+                        __atomic_fetch_add( &cltr->io.alloc.block      , proc->io.alloc.block      , __ATOMIC_SEQ_CST ); proc->io.alloc.block       = 0;
+                        __atomic_fetch_add( &cltr->io.submit.fast      , proc->io.submit.fast      , __ATOMIC_SEQ_CST ); proc->io.submit.fast       = 0;
+                        __atomic_fetch_add( &cltr->io.submit.slow      , proc->io.submit.slow      , __ATOMIC_SEQ_CST ); proc->io.submit.slow       = 0;
+                        __atomic_fetch_add( &cltr->io.flush.external   , proc->io.flush.external   , __ATOMIC_SEQ_CST ); proc->io.flush.external    = 0;
+                        __atomic_fetch_add( &cltr->io.calls.flush      , proc->io.calls.flush      , __ATOMIC_SEQ_CST ); proc->io.calls.flush       = 0;
+                        __atomic_fetch_add( &cltr->io.calls.submitted  , proc->io.calls.submitted  , __ATOMIC_SEQ_CST ); proc->io.calls.submitted   = 0;
+                        __atomic_fetch_add( &cltr->io.calls.drain      , proc->io.calls.drain      , __ATOMIC_SEQ_CST ); proc->io.calls.drain       = 0;
+                        __atomic_fetch_add( &cltr->io.calls.completed  , proc->io.calls.completed  , __ATOMIC_SEQ_CST ); proc->io.calls.completed   = 0;
+                        __atomic_fetch_add( &cltr->io.calls.errors.busy, proc->io.calls.errors.busy, __ATOMIC_SEQ_CST ); proc->io.calls.errors.busy = 0;
+                        __atomic_fetch_add( &cltr->io.poller.sleeps    , proc->io.poller.sleeps    , __ATOMIC_SEQ_CST ); proc->io.poller.sleeps     = 0;
                 #endif
+        }
 …
                 if( flags & CFA_STATS_READY_Q ) {
-                        double push_sur = (100.0 * ((double)ready.pick.push.success) / ready.pick.push.attempt);
-                        double pop_sur  = (100.0 * ((double)ready.pick.pop .success) / ready.pick.pop .attempt);
                         double push_len = ((double)ready.pick.push.attempt) / ready.pick.push.success;
                         double pop_len  = ((double)ready.pick.pop .attempt) / ready.pick.pop .success;
-                        double lpush_sur = (100.0 * ((double)ready.pick.push.lsuccess) / ready.pick.push.local);
-                        double lpop_sur  = (100.0 * ((double)ready.pick.pop .lsuccess) / ready.pick.pop .local);
                         double lpush_len = ((double)ready.pick.push.local) / ready.pick.push.lsuccess;
 …
                         __cfaabi_bits_print_safe( STDOUT_FILENO,
                                 "----- %s \"%s\" (%p) - Ready Q Stats -----\n"
+                                "- total threads run      : %'15" PRIu64 "\n"
+                                "- total threads scheduled: %'15" PRIu64 "\n"
+                                "- push average probe len : %'18.2lf, %'18.2lf%% (%'15" PRIu64 " attempts)\n"
+                                "- pop  average probe len : %'18.2lf, %'18.2lf%% (%'15" PRIu64 " attempts)\n"
+                                "- local push avg prb len : %'18.2lf, %'18.2lf%% (%'15" PRIu64 " attempts)\n"
+                                "- local pop  avg prb len : %'18.2lf, %'18.2lf%% (%'15" PRIu64 " attempts)\n"
+                                "- thread migrations      : %'15" PRIu64 "\n"
+                                "- Idle Sleep -\n"
+                                "-- halts                 : %'15" PRIu64 "\n"
+                                "-- cancelled halts       : %'15" PRIu64 "\n"
+                                "-- schedule wake         : %'15" PRIu64 "\n"
+                                "-- wake on exit          : %'15" PRIu64 "\n"
+                                "- total threads  : %'15" PRIu64 "run, %'15" PRIu64 "schd (%'" PRIu64 "mig )\n"
+                                "- push avg probe : %'3.2lf, %'3.2lfl (%'15" PRIu64 " attempts, %'15" PRIu64 " locals)\n"
+                                "- pop  avg probe : %'3.2lf, %'3.2lfl (%'15" PRIu64 " attempts, %'15" PRIu64 " locals)\n"
+                                "- Idle Sleep     : %'15" PRIu64 "h, %'15" PRIu64 "c, %'15" PRIu64 "w, %'15" PRIu64 "e\n"
                                 "\n"
                                 , type, name, id
                                 , ready.pick.pop.success
                                 , ready.pick.push.success
-                                , push_len, push_sur, ready.pick.push.attempt
-                                , pop_len , pop_sur , ready.pick.pop .attempt
-                                , lpush_len, lpush_sur, ready.pick.push.local
-                                , lpop_len , lpop_sur , ready.pick.pop .local
                                 , ready.threads.migration
+                                , push_len, lpush_len, ready.pick.push.attempt, ready.pick.push.local
+                                , pop_len , lpop_len , ready.pick.pop .attempt, ready.pick.pop .local
                                 , ready.sleep.halts, ready.sleep.cancels, ready.sleep.wakes, ready.sleep.exits
                         );
 …
                 #if defined(CFA_HAVE_LINUX_IO_URING_H)
                         if( flags & CFA_STATS_IO ) {
                                 double avgrdy = ((double)io.submit_q.submit_avg.rdy) / io.submit_q.submit_avg.cnt;
                                 double avgcsm = ((double)io.submit_q.submit_avg.csm) / io.submit_q.submit_avg.cnt;
+                                uint64_t total_allocs = io.alloc.fast + io.alloc.slow;
+                                double avgfasta = ((double)io.alloc.fast) / total_allocs;
+                                double lavgv = 0;
+                                double lavgb = 0;
+                                if(io.submit_q.look_avg.cnt != 0) {
+                                        lavgv = ((double)io.submit_q.look_avg.val  ) / io.submit_q.look_avg.cnt;
+                                        lavgb = ((double)io.submit_q.look_avg.block) / io.submit_q.look_avg.cnt;
+                                }
+                                uint64_t total_submits = io.submit.fast + io.submit.slow;
+                                double avgfasts = ((double)io.submit.fast) / total_submits;
+                                double aavgv = 0;
+                                double aavgb = 0;
+                                if(io.submit_q.alloc_avg.cnt != 0) {
+                                        aavgv = ((double)io.submit_q.alloc_avg.val  ) / io.submit_q.alloc_avg.cnt;
+                                        aavgb = ((double)io.submit_q.alloc_avg.block) / io.submit_q.alloc_avg.cnt;
+                                }
+                                double avgsubs = ((double)io.calls.submitted) / io.calls.flush;
+                                double avgcomp = ((double)io.calls.completed) / io.calls.drain;
                                 __cfaabi_bits_print_safe( STDOUT_FILENO,
                                         "----- %s \"%s\" (%p) - I/O Stats -----\n"
+                                        "- total submit calls     : %'15" PRIu64 "\n"
+                                        "- avg ready entries      : %'18.2lf\n"
+                                        "- avg submitted entries  : %'18.2lf\n"
+                                        "- total helped entries   : %'15" PRIu64 "\n"
+                                        "- total leader entries   : %'15" PRIu64 "\n"
+                                        "- total busy submit      : %'15" PRIu64 "\n"
+                                        "- total ready search     : %'15" PRIu64 "\n"
+                                        "- avg ready search len   : %'18.2lf\n"
+                                        "- avg ready search block : %'18.2lf\n"
+                                        "- total alloc search     : %'15" PRIu64 "\n"
+                                        "- avg alloc search len   : %'18.2lf\n"
+                                        "- avg alloc search block : %'18.2lf\n"
+                                        "- total wait calls       : %'15" PRIu64 "\n"
+                                        "- avg completion/wait    : %'18.2lf\n"
+                                        "- total completion blocks: %'15" PRIu64 "\n"
+                                        "- total allocations : %'" PRIu64 "f, %'" PRIu64 "s (%'2.2lff) \n"
+                                        "-     failures      : %'" PRIu64 "oom, %'" PRIu64 "rvk, %'" PRIu64 "blk\n"
+                                        "- total submits     : %'" PRIu64 "f, %'" PRIu64 "s (%'2.2lf) \n"
+                                        "- flush external    : %'" PRIu64 "\n"
+                                        "- io_uring_enter    : %'" PRIu64 " (%'" PRIu64 ", %'" PRIu64 " EBUSY)\n"
+                                        "-     submits       : %'" PRIu64 " (%'.2lf) \n"
+                                        "-     completes     : %'" PRIu64 " (%'.2lf) \n"
+                                        "- poller sleeping   : %'" PRIu64 "\n"
                                         "\n"
                                         , type,  name, id
+                                        , io.submit_q.submit_avg.cnt
+                                        , avgrdy, avgcsm
+                                        , io.submit_q.helped, io.submit_q.leader, io.submit_q.busy
+                                        , io.submit_q.look_avg.cnt
+                                        , lavgv, lavgb
+                                        , io.submit_q.alloc_avg.cnt
+                                        , aavgv, aavgb
+                                        , io.complete_q.completed_avg.cnt
+                                        , ((double)io.complete_q.completed_avg.val) / io.complete_q.completed_avg.cnt
+                                        , io.complete_q.blocks
+                                        , io.alloc.fast, io.alloc.slow, avgfasta
+                                        , io.alloc.fail, io.alloc.revoke, io.alloc.block
+                                        , io.submit.fast, io.submit.slow, avgfasts
+                                        , io.flush.external
+                                        , io.calls.flush, io.calls.drain, io.calls.errors.busy
+                                        , io.calls.submitted, avgsubs
+                                        , io.calls.completed, avgcomp
+                                        , io.poller.sleeps
                                 );
+                        }

libcfa/src/concurrency/stats.hfa

-              r342af53
+              r8e4aa05
 #include <stdint.h>
+enum {
+        CFA_STATS_READY_Q  = 0x01,
+        CFA_STATS_IO = 0x02,
+};
 #if defined(__CFA_NO_STATISTICS__)
 …
         static inline void __print_stats( struct __stats_t *, int, const char *, const char *, void * ) {}
 #else
-        enum {
-                CFA_STATS_READY_Q  = 0x01,
-                #if defined(CFA_HAVE_LINUX_IO_URING_H)
-                        CFA_STATS_IO = 0x02,
-                #endif
-        };
         struct __attribute__((aligned(64))) __stats_readQ_t {
 …
                 struct __attribute__((aligned(64))) __stats_io_t{
                         struct {
+                                volatile uint64_t fast;
+                                volatile uint64_t slow;
+                                volatile uint64_t fail;
+                                volatile uint64_t revoke;
+                                volatile uint64_t block;
+                        } alloc;
+                        struct {
+                                volatile uint64_t fast;
+                                volatile uint64_t slow;
+                        } submit;
+                        struct {
+                                volatile uint64_t external;
+                        } flush;
+                        struct {
+                                volatile uint64_t drain;
+                                volatile uint64_t completed;
+                                volatile uint64_t flush;
+                                volatile uint64_t submitted;
                                 struct {
+                                        volatile uint64_t rdy;
+                                        volatile uint64_t csm;
+                                        volatile uint64_t avl;
+                                        volatile uint64_t cnt;
+                                } submit_avg;
+                                struct {
+                                        volatile uint64_t val;
+                                        volatile uint64_t cnt;
+                                        volatile uint64_t block;
+                                } look_avg;
+                                struct {
+                                        volatile uint64_t val;
+                                        volatile uint64_t cnt;
+                                        volatile uint64_t block;
+                                } alloc_avg;
+                                volatile uint64_t helped;
+                                volatile uint64_t leader;
+                                volatile uint64_t busy;
+                        } submit_q;
+                                        volatile uint64_t busy;
+                                } errors;
+                        } calls;
                         struct {
+                                struct {
+                                        volatile uint64_t val;
+                                        volatile uint64_t cnt;
+                                } completed_avg;
+                                volatile uint64_t blocks;
+                        } complete_q;
+                                volatile uint64_t sleeps;
+                        } poller;
                 };
         #endif

libcfa/src/concurrency/thread.cfa

-              r342af53
+              r8e4aa05
+}
 FORALL_DATA_INSTANCE(ThreadCancelled, (dtype thread_t), (thread_t))
+FORALL_DATA_INSTANCE(ThreadCancelled, (thread_t &), (thread_t))
 forall(dtype T)
+forall(T &)
 void copy(ThreadCancelled(T) * dst, ThreadCancelled(T) * src) {
         dst->virtual_table = src->virtual_table;
 …
+}
 forall(dtype T)
+forall(T &)
 const char * msg(ThreadCancelled(T) *) {
         return "ThreadCancelled";
+}
 forall(dtype T)
+forall(T &)
 static void default_thread_cancel_handler(ThreadCancelled(T) & ) {
         abort( "Unhandled thread cancellation.\n" );
+}
 forall(dtype T | is_thread(T) | IS_EXCEPTION(ThreadCancelled, (T)))
+forall(T & | is_thread(T) | IS_EXCEPTION(ThreadCancelled, (T)))
 void ?{}( thread_dtor_guard_t & this,
                 T & thrd, void(*defaultResumptionHandler)(ThreadCancelled(T) &)) {
         $monitor * m = get_monitor(thrd);
+                T & thrd, void(*cancelHandler)(ThreadCancelled(T) &)) {
+        $monitor * m = get_monitor(thrd);
         $thread * desc = get_thread(thrd);
         // Setup the monitor guard
         void (*dtor)(T& mutex this) = ^?{};
         bool join = defaultResumptionHandler != (void(*)(ThreadCancelled(T)&))0;
+        bool join = cancelHandler != (void(*)(ThreadCancelled(T)&))0;
         (this.mg){&m, (void(*)())dtor, join};
 …
+        }
         desc->state = Cancelled;
+        if (!join) {
+                defaultResumptionHandler = default_thread_cancel_handler;
+        }
+        void(*defaultResumptionHandler)(ThreadCancelled(T) &) =
+                join ? cancelHandler : default_thread_cancel_handler;
         ThreadCancelled(T) except;
 …
 //-----------------------------------------------------------------------------
 // Starting and stopping threads
 forall( dtype T | is_thread(T) )
+forall( T & | is_thread(T) )
 void __thrd_start( T & this, void (*main_p)(T &) ) {
         $thread * this_thrd = get_thread(this);
 …
 //-----------------------------------------------------------------------------
 // Support for threads that don't use the thread keyword
 forall( dtype T | sized(T) | is_thread(T) | { void ?{}(T&); } )
+forall( T & | sized(T) | is_thread(T) | { void ?{}(T&); } )
 void ?{}( scoped(T)& this ) with( this ) {
         handle{};
 …
+}
 forall( dtype T, ttype P | sized(T) | is_thread(T) | { void ?{}(T&, P); } )
+forall( T &, P... | sized(T) | is_thread(T) | { void ?{}(T&, P); } )
 void ?{}( scoped(T)& this, P params ) with( this ) {
         handle{ params };
 …
+}
 forall( dtype T | sized(T) | is_thread(T) )
+forall( T & | sized(T) | is_thread(T) )
 void ^?{}( scoped(T)& this ) with( this ) {
         ^handle{};
 …
 //-----------------------------------------------------------------------------
 forall(dtype T | is_thread(T) | IS_RESUMPTION_EXCEPTION(ThreadCancelled, (T)))
+forall(T & | is_thread(T) | IS_RESUMPTION_EXCEPTION(ThreadCancelled, (T)))
 T & join( T & this ) {
         thread_dtor_guard_t guard = { this, defaultResumptionHandler };

libcfa/src/concurrency/thread.hfa

-              r342af53
+              r8e4aa05
 //-----------------------------------------------------------------------------
 // thread trait
 trait is_thread(dtype T) {
+trait is_thread(T &) {
         void ^?{}(T& mutex this);
         void main(T& this);
 …
 };
 FORALL_DATA_EXCEPTION(ThreadCancelled, (dtype thread_t), (thread_t)) (
+FORALL_DATA_EXCEPTION(ThreadCancelled, (thread_t &), (thread_t)) (
         thread_t * the_thread;
         exception_t * the_exception;
 );
 forall(dtype T)
+forall(T &)
 void copy(ThreadCancelled(T) * dst, ThreadCancelled(T) * src);
 forall(dtype T)
+forall(T &)
 const char * msg(ThreadCancelled(T) *);
 …
 // Inline getters for threads/coroutines/monitors
 forall( dtype T | is_thread(T) )
+forall( T & | is_thread(T) )
 static inline $coroutine* get_coroutine(T & this) __attribute__((const)) { return &get_thread(this)->self_cor; }
 forall( dtype T | is_thread(T) )
+forall( T & | is_thread(T) )
 static inline $monitor  * get_monitor  (T & this) __attribute__((const)) { return &get_thread(this)->self_mon; }
 …
 extern struct cluster * mainCluster;
 forall( dtype T | is_thread(T) )
+forall( T & | is_thread(T) )
 void __thrd_start( T & this, void (*)(T &) );
 …
 };
 forall( dtype T | is_thread(T) | IS_EXCEPTION(ThreadCancelled, (T)) )
+forall( T & | is_thread(T) | IS_EXCEPTION(ThreadCancelled, (T)) )
 void ?{}( thread_dtor_guard_t & this, T & thrd, void(*)(ThreadCancelled(T) &) );
 void ^?{}( thread_dtor_guard_t & this );
 …
 // thread runner
 // Structure that actually start and stop threads
 forall( dtype T | sized(T) | is_thread(T) )
+forall( T & | sized(T) | is_thread(T) )
 struct scoped {
         T handle;
 };
 forall( dtype T | sized(T) | is_thread(T) | { void ?{}(T&); } )
+forall( T & | sized(T) | is_thread(T) | { void ?{}(T&); } )
 void ?{}( scoped(T)& this );
 forall( dtype T, ttype P | sized(T) | is_thread(T) | { void ?{}(T&, P); } )
+forall( T &, P... | sized(T) | is_thread(T) | { void ?{}(T&, P); } )
 void ?{}( scoped(T)& this, P params );
 forall( dtype T | sized(T) | is_thread(T) )
+forall( T & | sized(T) | is_thread(T) )
 void ^?{}( scoped(T)& this );
 …
 void unpark( $thread * this );
 forall( dtype T | is_thread(T) )
+forall( T & | is_thread(T) )
 static inline void unpark( T & this ) { if(!&this) return; unpark( get_thread( this ) );}
 …
 //----------
 // join
 forall( dtype T | is_thread(T) | IS_RESUMPTION_EXCEPTION(ThreadCancelled, (T)) )
+forall( T & | is_thread(T) | IS_RESUMPTION_EXCEPTION(ThreadCancelled, (T)) )
 T & join( T & this );

libcfa/src/containers/list.hfa

-              r342af53
+              r8e4aa05
 #define __DLISTED_MGD_JUSTIMPL(STRUCT)
 forall( dtype tE ) {
+forall( tE & ) {
         struct $mgd_link {
                 tE *elem;
 …
                 (this.is_terminator){ 1 };
+        }
         forall ( otype tInit | { void ?{}( $mgd_link(tE) &, tInit); } )
+        forall ( tInit | { void ?{}( $mgd_link(tE) &, tInit); } )
         static inline void ?=?( $mgd_link(tE) &this, tInit i ) {
                 ^?{}( this );
 …
   __DLISTED_MGD_COMMON(STRUCT, STRUCT, $links)
 trait $dlistable(dtype Tnode, dtype Telem) {
+trait $dlistable(Tnode &, Telem &) {
         $mgd_link(Telem) & $prev_link(Tnode &);
         $mgd_link(Telem) & $next_link(Tnode &);
 …
 };
 forall (dtype Tnode, dtype Telem | $dlistable(Tnode, Telem)) {
+forall (Tnode &, Telem & | $dlistable(Tnode, Telem)) {
         // implemented as a sentinel item in an underlying circular list

libcfa/src/containers/maybe.cfa

-              r342af53
+              r8e4aa05
 forall(otype T)
+forall(T)
 void ?{}(maybe(T) & this) {
         this.has_value = false;
+}
 forall(otype T)
+forall(T)
 void ?{}(maybe(T) & this, T value) {
         this.has_value = true;
 …
+}
 forall(otype T)
+forall(T)
 void ?{}(maybe(T) & this, maybe(T) other) {
         this.has_value = other.has_value;
 …
+}
 forall(otype T)
+forall(T)
 maybe(T) ?=?(maybe(T) & this, maybe(T) that) {
         if (this.has_value && that.has_value) {
 …
+}
 forall(otype T)
+forall(T)
 void ^?{}(maybe(T) & this) {
         if (this.has_value) {
 …
+}
 forall(otype T)
+forall(T)
 bool ?!=?(maybe(T) this, zero_t) {
         return this.has_value;
+}
 forall(otype T)
+forall(T)
 maybe(T) maybe_value(T value) {
         return (maybe(T)){value};
+}
 forall(otype T)
+forall(T)
 maybe(T) maybe_none() {
         return (maybe(T)){};
+}
 forall(otype T)
+forall(T)
 bool has_value(maybe(T) * this) {
         return this->has_value;
+}
 forall(otype T)
+forall(T)
 T get(maybe(T) * this) {
         assertf(this->has_value, "attempt to get from maybe without value");
 …
+}
 forall(otype T)
+forall(T)
 void set(maybe(T) * this, T value) {
         if (this->has_value) {
 …
+}
 forall(otype T)
+forall(T)
 void set_none(maybe(T) * this) {
         if (this->has_value) {

libcfa/src/containers/maybe.hfa

-              r342af53
+              r8e4aa05
 // DO NOT USE DIRECTLY!
 forall(otype T)
+forall(T)
 struct maybe {
     bool has_value;
 …
 forall(otype T)
+forall(T)
 void ?{}(maybe(T) & this);
 forall(otype T)
+forall(T)
 void ?{}(maybe(T) & this, T value);
 forall(otype T)
+forall(T)
 void ?{}(maybe(T) & this, maybe(T) other);
 forall(otype T)
+forall(T)
 void ^?{}(maybe(T) & this);
 forall(otype T)
+forall(T)
 maybe(T) ?=?(maybe(T) & this, maybe(T) other);
 forall(otype T)
+forall(T)
 bool ?!=?(maybe(T) this, zero_t);
 /* Waiting for bug#11 to be fixed.
 forall(otype T)
+forall(T)
 maybe(T) maybe_value(T value);
 forall(otype T)
+forall(T)
 maybe(T) maybe_none();
 */
 forall(otype T)
+forall(T)
 bool has_value(maybe(T) * this);
 forall(otype T)
+forall(T)
 T get(maybe(T) * this);
 forall(otype T)
+forall(T)
 void set(maybe(T) * this, T value);
 forall(otype T)
+forall(T)
 void set_none(maybe(T) * this);

libcfa/src/containers/pair.cfa

-              r342af53
+              r8e4aa05
 #include <containers/pair.hfa>
 forall(otype R, otype S
+forall(R, S
         | { int ?==?(R, R); int ?<?(R, R); int ?<?(S, S); })
 int ?<?(pair(R, S) p, pair(R, S) q) {
 …
+}
 forall(otype R, otype S
+forall(R, S
         | { int ?==?(R, R); int ?<?(R, R); int ?<=?(S, S); })
 int ?<=?(pair(R, S) p, pair(R, S) q) {
 …
+}
 forall(otype R, otype S | { int ?==?(R, R); int ?==?(S, S); })
+forall(R, S | { int ?==?(R, R); int ?==?(S, S); })
 int ?==?(pair(R, S) p, pair(R, S) q) {
         return p.first == q.first && p.second == q.second;
+}
 forall(otype R, otype S | { int ?!=?(R, R); int ?!=?(S, S); })
+forall(R, S | { int ?!=?(R, R); int ?!=?(S, S); })
 int ?!=?(pair(R, S) p, pair(R, S) q) {
         return p.first != q.first || p.second != q.second;
+}
 forall(otype R, otype S
+forall(R, S
         | { int ?==?(R, R); int ?>?(R, R); int ?>?(S, S); })
 int ?>?(pair(R, S) p, pair(R, S) q) {
 …
+}
 forall(otype R, otype S
+forall(R, S
         | { int ?==?(R, R); int ?>?(R, R); int ?>=?(S, S); })
 int ?>=?(pair(R, S) p, pair(R, S) q) {

libcfa/src/containers/pair.hfa

-              r342af53
+              r8e4aa05
 #pragma once
 forall(otype R, otype S) struct pair {
+forall(R, S) struct pair {
         R first;
         S second;
 };
 forall(otype R, otype S
+forall(R, S
         | { int ?==?(R, R); int ?<?(R, R); int ?<?(S, S); })
 int ?<?(pair(R, S) p, pair(R, S) q);
 forall(otype R, otype S
+forall(R, S
         | { int ?==?(R, R); int ?<?(R, R); int ?<=?(S, S); })
 int ?<=?(pair(R, S) p, pair(R, S) q);
 forall(otype R, otype S | { int ?==?(R, R); int ?==?(S, S); })
+forall(R, S | { int ?==?(R, R); int ?==?(S, S); })
 int ?==?(pair(R, S) p, pair(R, S) q);
 forall(otype R, otype S | { int ?!=?(R, R); int ?!=?(S, S); })
+forall(R, S | { int ?!=?(R, R); int ?!=?(S, S); })
 int ?!=?(pair(R, S) p, pair(R, S) q);
 forall(otype R, otype S
+forall(R, S
         | { int ?==?(R, R); int ?>?(R, R); int ?>?(S, S); })
 int ?>?(pair(R, S) p, pair(R, S) q);
 forall(otype R, otype S
+forall(R, S
         | { int ?==?(R, R); int ?>?(R, R); int ?>=?(S, S); })
 int ?>=?(pair(R, S) p, pair(R, S) q);

libcfa/src/containers/result.cfa

-              r342af53
+              r8e4aa05
 forall(otype T, otype E)
+forall(T, E)
 void ?{}(result(T, E) & this) {
         this.has_value = false;
 …
+}
 forall(otype T, otype E)
+forall(T, E)
 void ?{}(result(T, E) & this, one_t, T value) {
         this.has_value = true;
 …
+}
 forall(otype T, otype E)
+forall(T, E)
 void ?{}(result(T, E) & this, zero_t, E error) {
         this.has_value = false;
 …
+}
 forall(otype T, otype E)
+forall(T, E)
 void ?{}(result(T, E) & this, result(T, E) other) {
         this.has_value = other.has_value;
 …
+}
 forall(otype T, otype E)
+forall(T, E)
 result(T, E) ?=?(result(T, E) & this, result(T, E) that) {
         if (this.has_value && that.has_value) {
 …
+}
 forall(otype T, otype E)
+forall(T, E)
 void ^?{}(result(T, E) & this) {
         if (this.has_value) {
 …
+}
 forall(otype T, otype E)
+forall(T, E)
 bool ?!=?(result(T, E) this, zero_t) {
         return this.has_value;
+}
 forall(otype T, otype E)
+forall(T, E)
 result(T, E) result_value(T value) {
         return (result(T, E)){1, value};
+}
 forall(otype T, otype E)
+forall(T, E)
 result(T, E) result_error(E error) {
         return (result(T, E)){0, error};
+}
 forall(otype T, otype E)
+forall(T, E)
 bool has_value(result(T, E) * this) {
         return this->has_value;
+}
 forall(otype T, otype E)
+forall(T, E)
 T get(result(T, E) * this) {
         assertf(this->has_value, "attempt to get from result without value");
 …
+}
 forall(otype T, otype E)
+forall(T, E)
 E get_error(result(T, E) * this) {
         assertf(!this->has_value, "attempt to get from result without error");
 …
+}
 forall(otype T, otype E)
+forall(T, E)
 void set(result(T, E) * this, T value) {
         if (this->has_value) {
 …
+}
 forall(otype T, otype E)
+forall(T, E)
 void set_error(result(T, E) * this, E error) {
         if (this->has_value) {

libcfa/src/containers/result.hfa

-              r342af53
+              r8e4aa05
 // DO NOT USE DIRECTLY!
 forall(otype T, otype E)
+forall(T, E)
 union inner_result{
         T value;
 …
 };
 forall(otype T, otype E)
+forall(T, E)
 struct result {
         bool has_value;
 …
 forall(otype T, otype E)
+forall(T, E)
 void ?{}(result(T, E) & this);
 forall(otype T, otype E)
+forall(T, E)
 void ?{}(result(T, E) & this, one_t, T value);
 forall(otype T, otype E)
+forall(T, E)
 void ?{}(result(T, E) & this, zero_t, E error);
 forall(otype T, otype E)
+forall(T, E)
 void ?{}(result(T, E) & this, result(T, E) other);
 forall(otype T, otype E)
+forall(T, E)
 void ^?{}(result(T, E) & this);
 forall(otype T, otype E)
+forall(T, E)
 result(T, E) ?=?(result(T, E) & this, result(T, E) other);
 forall(otype T, otype E)
+forall(T, E)
 bool ?!=?(result(T, E) this, zero_t);
 /* Waiting for bug#11 to be fixed.
 forall(otype T, otype E)
+forall(T, E)
 result(T, E) result_value(T value);
 forall(otype T, otype E)
+forall(T, E)
 result(T, E) result_error(E error);
 */
 forall(otype T, otype E)
+forall(T, E)
 bool has_value(result(T, E) * this);
 forall(otype T, otype E)
+forall(T, E)
 T get(result(T, E) * this);
 forall(otype T, otype E)
+forall(T, E)
 E get_error(result(T, E) * this);
 forall(otype T, otype E)
+forall(T, E)
 void set(result(T, E) * this, T value);
 forall(otype T, otype E)
+forall(T, E)
 void set_error(result(T, E) * this, E error);

libcfa/src/containers/stackLockFree.hfa

-              r342af53
+              r8e4aa05
 // Created On       : Wed May 13 20:58:58 2020
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Sun Jun 14 13:25:09 2020
 // Update Count     : 64
+// Last Modified On : Wed Jan 20 20:40:03 2021
+// Update Count     : 67
 //
 …
 #include <stdint.h>
 forall( dtype T )
+forall( T & )
 union Link {
         struct {                                                                                        // 32/64-bit x 2
 …
 }; // Link
 forall( otype T | sized(T) | { Link(T) * ?`next( T * ); } ) {
+forall( T | sized(T) | { Link(T) * ?`next( T * ); } ) {
         struct StackLF {
                 Link(T) stack;
 …
                 void push( StackLF(T) & this, T & n ) with(this) {
                         *( &n )`next = stack;                                   // atomic assignment unnecessary, or use CAA
+                        *( &n )`next = stack;                                           // atomic assignment unnecessary, or use CAA
                         for () {                                                                        // busy wait
                           if ( __atomic_compare_exchange_n( &stack.atom, &( &n )`next->atom, (Link(T))@{ {&n, ( &n )`next->count + 1} }.atom, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST ) ) break; // attempt to update top node
 …
+                                }
                                 if( next == 0p ) return false;
                                 link = (next)`next;
+                                link = ( next )`next;
+                        }
+                }

libcfa/src/containers/vector.cfa

r342af53	r8e4aa05
18	18	#include <stdlib.hfa>
19	19
20		forall(~~otype T, otype~~ allocator_t \| allocator_c(T, allocator_t))
	20	forall(T, allocator_t \| allocator_c(T, allocator_t))
21	21	void copy_internal(vector(T, allocator_t)* this, vector(T, allocator_t)* other);
22	22
23	23	//------------------------------------------------------------------------------
24	24	//Initialization
25		forall(~~otype T, otype~~ allocator_t \| allocator_c(T, allocator_t))
	25	forall(T, allocator_t \| allocator_c(T, allocator_t))
26	26	void ?{}(vector(T, allocator_t)& this)
27	27	{
…	…
30	30	}
31	31
32		forall(~~otype T, otype~~ allocator_t \| allocator_c(T, allocator_t))
	32	forall(T, allocator_t \| allocator_c(T, allocator_t))
33	33	void ?{}(vector(T, allocator_t)& this, vector(T, allocator_t) rhs)
34	34	{
…	…
37	37	}
38	38
39		// forall(~~otype T, otype~~ allocator_t \| allocator_c(T, allocator_t))
	39	// forall(T, allocator_t \| allocator_c(T, allocator_t))
40	40	// vector(T, allocator_t) ?=?(vector(T, allocator_t)* this, vector(T, allocator_t) rhs)
41	41	// {
…	…
45	45	// }
46	46
47		forall(~~otype T, otype~~ allocator_t \| allocator_c(T, allocator_t))
	47	forall(T, allocator_t \| allocator_c(T, allocator_t))
48	48	void ^?{}(vector(T, allocator_t)& this)
49	49	{
…	…
54	54	//------------------------------------------------------------------------------
55	55	//Modifiers
56		forall(~~otype T, otype~~ allocator_t \| allocator_c(T, allocator_t))
	56	forall(T, allocator_t \| allocator_c(T, allocator_t))
57	57	void push_back(vector(T, allocator_t)* this, T value)
58	58	{
…	…
62	62	}
63	63
64		forall(~~otype T, otype~~ allocator_t \| allocator_c(T, allocator_t))
	64	forall(T, allocator_t \| allocator_c(T, allocator_t))
65	65	void pop_back(vector(T, allocator_t)* this)
66	66	{
…	…
69	69	}
70	70
71		forall(~~otype T, otype~~ allocator_t \| allocator_c(T, allocator_t))
	71	forall(T, allocator_t \| allocator_c(T, allocator_t))
72	72	void clear(vector(T, allocator_t)* this)
73	73	{
…	…
82	82	//Internal Helpers
83	83
84		forall(~~otype T, otype~~ allocator_t \| allocator_c(T, allocator_t))
	84	forall(T, allocator_t \| allocator_c(T, allocator_t))
85	85	void copy_internal(vector(T, allocator_t)* this, vector(T, allocator_t)* other)
86	86	{
…	…
93	93	//------------------------------------------------------------------------------
94	94	//Allocator
95		forall(~~otype~~ T)
	95	forall(T)
96	96	void ?{}(heap_allocator(T)& this)
97	97	{
…	…
100	100	}
101	101
102		forall(~~otype~~ T)
	102	forall(T)
103	103	void ?{}(heap_allocator(T)& this, heap_allocator(T) rhs)
104	104	{
…	…
107	107	}
108	108
109		forall(~~otype~~ T)
	109	forall(T)
110	110	heap_allocator(T) ?=?(heap_allocator(T)& this, heap_allocator(T) rhs)
111	111	{
…	…
115	115	}
116	116
117		forall(~~otype~~ T)
	117	forall(T)
118	118	void ^?{}(heap_allocator(T)& this)
119	119	{
…	…
121	121	}
122	122
123		forall(~~otype~~ T)
	123	forall(T)
124	124	inline void realloc_storage(heap_allocator(T)* this, size_t size)
125	125	{

libcfa/src/containers/vector.hfa

-              r342af53
+              r8e4aa05
 //------------------------------------------------------------------------------
 //Allocator
 forall(otype T)
+forall(T)
 struct heap_allocator
+{
 …
 };
 forall(otype T)
+forall(T)
 void ?{}(heap_allocator(T)& this);
 forall(otype T)
+forall(T)
 void ?{}(heap_allocator(T)& this, heap_allocator(T) rhs);
 forall(otype T)
+forall(T)
 heap_allocator(T) ?=?(heap_allocator(T)& this, heap_allocator(T) rhs);
 forall(otype T)
+forall(T)
 void ^?{}(heap_allocator(T)& this);
 forall(otype T)
+forall(T)
 void realloc_storage(heap_allocator(T)* this, size_t size);
 forall(otype T)
+forall(T)
 static inline T* data(heap_allocator(T)* this)
+{
 …
 //------------------------------------------------------------------------------
 //Declaration
 trait allocator_c(otype T, otype allocator_t)
+trait allocator_c(T, allocator_t)
+{
         void realloc_storage(allocator_t*, size_t);
 …
 };
 forall(otype T, otype allocator_t = heap_allocator(T) | allocator_c(T, allocator_t))
+forall(T, allocator_t = heap_allocator(T) | allocator_c(T, allocator_t))
 struct vector;
 //------------------------------------------------------------------------------
 //Initialization
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void ?{}(vector(T, allocator_t)& this);
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void ?{}(vector(T, allocator_t)& this, vector(T, allocator_t) rhs);
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 vector(T, allocator_t) ?=?(vector(T, allocator_t)& this, vector(T, allocator_t) rhs);
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void ^?{}(vector(T, allocator_t)& this);
 forall(otype T, otype allocator_t = heap_allocator(T) | allocator_c(T, allocator_t))
+forall(T, allocator_t = heap_allocator(T) | allocator_c(T, allocator_t))
 struct vector
+{
 …
 //------------------------------------------------------------------------------
 //Capacity
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline bool empty(vector(T, allocator_t)* this)
+{
 …
+}
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline size_t size(vector(T, allocator_t)* this)
+{
 …
+}
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline void reserve(vector(T, allocator_t)* this, size_t size)
+{
 …
 //------------------------------------------------------------------------------
 //Element access
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline T at(vector(T, allocator_t)* this, size_t index)
+{
 …
+}
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline T ?[?](vector(T, allocator_t)* this, size_t index)
+{
 …
+}
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline T front(vector(T, allocator_t)* this)
+{
 …
+}
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline T back(vector(T, allocator_t)* this)
+{
 …
 //------------------------------------------------------------------------------
 //Modifiers
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void push_back(vector(T, allocator_t)* this, T value);
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void pop_back(vector(T, allocator_t)* this);
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void clear(vector(T, allocator_t)* this);
 //------------------------------------------------------------------------------
 //Iterators
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline T* begin(vector(T, allocator_t)* this)
+{
 …
+}
 // forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+// forall(T, allocator_t | allocator_c(T, allocator_t))
 // static inline const T* cbegin(const vector(T, allocator_t)* this)
 // {
 …
 // }
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline T* end(vector(T, allocator_t)* this)
+{
 …
+}
 // forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+// forall(T, allocator_t | allocator_c(T, allocator_t))
 // static inline const T* cend(const vector(T, allocator_t)* this)
 // {

libcfa/src/exception.h

-              r342af53
+              r8e4aa05
 // implemented in the .c file either so they all have to be inline.
 trait is_exception(dtype exceptT, dtype virtualT) {
+trait is_exception(exceptT &, virtualT &) {
         /* The first field must be a pointer to a virtual table.
          * That virtual table must be a descendant of the base exception virtual table.
 …
 };
 trait is_termination_exception(dtype exceptT, dtype virtualT | is_exception(exceptT, virtualT)) {
+trait is_termination_exception(exceptT &, virtualT & | is_exception(exceptT, virtualT)) {
         void defaultTerminationHandler(exceptT &);
 };
 trait is_resumption_exception(dtype exceptT, dtype virtualT | is_exception(exceptT, virtualT)) {
+trait is_resumption_exception(exceptT &, virtualT & | is_exception(exceptT, virtualT)) {
         void defaultResumptionHandler(exceptT &);
 };
 forall(dtype exceptT, dtype virtualT | is_termination_exception(exceptT, virtualT))
+forall(exceptT &, virtualT & | is_termination_exception(exceptT, virtualT))
 static inline void $throw(exceptT & except) {
         __cfaehm_throw_terminate(
 …
+}
 forall(dtype exceptT, dtype virtualT | is_resumption_exception(exceptT, virtualT))
+forall(exceptT &, virtualT & | is_resumption_exception(exceptT, virtualT))
 static inline void $throwResume(exceptT & except) {
         __cfaehm_throw_resume(
 …
+}
 forall(dtype exceptT, dtype virtualT | is_exception(exceptT, virtualT))
+forall(exceptT &, virtualT & | is_exception(exceptT, virtualT))
 static inline void cancel_stack(exceptT & except) __attribute__((noreturn)) {
         __cfaehm_cancel_stack( (exception_t *)&except );
+}
 forall(dtype exceptT, dtype virtualT | is_exception(exceptT, virtualT))
+forall(exceptT &, virtualT & | is_exception(exceptT, virtualT))
 static inline void defaultTerminationHandler(exceptT & except) {
         return cancel_stack( except );
+}
 forall(dtype exceptT, dtype virtualT | is_exception(exceptT, virtualT))
+forall(exceptT &, virtualT & | is_exception(exceptT, virtualT))
 static inline void defaultResumptionHandler(exceptT & except) {
         throw except;

libcfa/src/executor.cfa

r342af53	r8e4aa05
7	7	#include <containers/list.hfa>
8	8
9		forall( ~~dtype T~~ \| $dlistable(T, T) ) {
	9	forall( T & \| $dlistable(T, T) ) {
10	10	monitor Buffer { // unbounded buffer
11	11	dlist( T, T ) queue; // unbounded list of work requests

libcfa/src/fstream.cfa

-              r342af53
+              r8e4aa05
 // Created On       : Wed May 27 17:56:53 2015
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Fri Jun 19 16:24:54 2020
 // Update Count     : 384
+// Last Modified On : Mon Mar  1 21:12:15 2021
+// Update Count     : 424
 //
 …
 #include <errno.h>                                                                              // errno
 // *********************************** ofstream ***********************************
 …
         os.$prt = false;
         os.$sawNL = false;
+        os.$acquired = false;
         $sepSetCur( os, sepGet( os ) );
         sepSet( os, " " );
 …
         if ( &os == &exit ) exit( EXIT_FAILURE );
         if ( &os == &abort ) abort();
+        if ( os.$acquired ) { os.$acquired = false; release( os ); }
 } // ends
 …
 } // fmt
+inline void acquire( ofstream & os ) {
+        lock( os.$lock );
+        if ( ! os.$acquired ) os.$acquired = true;
+        else unlock( os.$lock );
+} // acquire
+inline void release( ofstream & os ) {
+        unlock( os.$lock );
+} // release
+void ?{}( osacquire & acq, ofstream & os ) { &acq.os = &os; lock( os.$lock ); }
+void ^?{}( osacquire & acq ) { release( acq.os ); }
 static ofstream soutFile = { (FILE *)stdout };
 ofstream & sout = soutFile, & stdout = soutFile;
 …
 ofstream & serr = serrFile, & stderr = serrFile;
+static ofstream lsoutFile = { (FILE *)stdout };
+ofstream & lsout = lsoutFile;
 static ofstream exitFile = { (FILE *)stdout };
 ofstream & exit = exitFile;
 …
         is.$file = file;
         is.$nlOnOff = false;
+        is.$acquired = false;
 } // ?{}
 …
         return is.$file == 0p || ferror( (FILE *)(is.$file) );
 } // fail
+void ends( ifstream & is ) {
+        if ( is.$acquired ) { is.$acquired = false; release( is ); }
+} // ends
 int eof( ifstream & is ) {
 …
 } // fmt
+inline void acquire( ifstream & is ) {
+        lock( is.$lock );
+        if ( ! is.$acquired ) is.$acquired = true;
+        else unlock( is.$lock );
+} // acquire
+inline void release( ifstream & is ) {
+        unlock( is.$lock );
+} // release
+void ?{}( isacquire & acq, ifstream & is ) { &acq.is = &is; lock( is.$lock ); }
+void ^?{}( isacquire & acq ) { release( acq.is ); }
 static ifstream sinFile = { (FILE *)stdin };
 ifstream & sin = sinFile, & stdin = sinFile;

libcfa/src/fstream.hfa

-              r342af53
+              r8e4aa05
 // Created On       : Wed May 27 17:56:53 2015
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Fri Jun 19 16:29:17 2020
 // Update Count     : 189
+// Last Modified On : Mon Mar  1 22:45:08 2021
+// Update Count     : 217
 //
 #pragma once
+#include "bits/weakso_locks.hfa"                                                // mutex_lock
 #include "iostream.hfa"
 #include <exception.hfa>
 …
         char $separator[sepSize];
         char $tupleSeparator[sepSize];
+        multiple_acquisition_lock $lock;
+        bool $acquired;
 }; // ofstream
 …
 ofstream & write( ofstream &, const char data[], size_t size );
 int fmt( ofstream &, const char format[], ... ) __attribute__(( format(printf, 2, 3) ));
+void acquire( ofstream & os );
+void release( ofstream & os );
+struct osacquire {
+        ofstream & os;
+};
+void ?{}( osacquire & acq, ofstream & os );
+void ^?{}( osacquire & acq );
 void ?{}( ofstream & os );
 …
         void * $file;
         bool $nlOnOff;
+        multiple_acquisition_lock $lock;
+        bool $acquired;
 }; // ifstream
 …
 void nlOff( ifstream & );
 bool getANL( ifstream & );
+void ends( ifstream & );
 int fail( ifstream & is );
 int eof( ifstream & is );
 …
 ifstream & ungetc( ifstream & is, char c );
 int fmt( ifstream &, const char format[], ... ) __attribute__(( format(scanf, 2, 3) ));
+void acquire( ifstream & is );
+void release( ifstream & is );
+struct isacquire {
+        ifstream & is;
+};
+void ?{}( isacquire & acq, ifstream & is );
+void ^?{}( isacquire & acq );
 void ?{}( ifstream & is );

libcfa/src/gmp.hfa

-              r342af53
+              r8e4aa05
         // I/O
         forall( dtype istype | istream( istype ) )
+        forall( istype & | istream( istype ) )
                 istype & ?|?( istype & is, Int & mp ) {
                 gmp_scanf( "%Zd", &mp );
 …
         } // ?|?
         forall( dtype ostype | ostream( ostype ) ) {
+        forall( ostype & | ostream( ostype ) ) {
                 ostype & ?|?( ostype & os, Int mp ) {
                         if ( $sepPrt( os ) ) fmt( os, "%s", $sepGetCur( os ) );

libcfa/src/interpose.cfa

-              r342af53
+              r8e4aa05
                 // Failure handler
+                __cfaabi_sigaction( SIGSEGV, sigHandler_segv, SA_SIGINFO | SA_ONSTACK );
+                __cfaabi_sigaction( SIGBUS , sigHandler_segv, SA_SIGINFO | SA_ONSTACK );
+                __cfaabi_sigaction( SIGILL , sigHandler_ill , SA_SIGINFO | SA_ONSTACK );
+                __cfaabi_sigaction( SIGFPE , sigHandler_fpe , SA_SIGINFO | SA_ONSTACK );
+                __cfaabi_sigaction( SIGTERM, sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND ); // one shot handler, return to default
+                __cfaabi_sigaction( SIGINT , sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND );
+                __cfaabi_sigaction( SIGABRT, sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND );
+                __cfaabi_sigaction( SIGHUP , sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND ); // terminal hangup
+                 // internal errors
+                __cfaabi_sigaction( SIGSEGV, sigHandler_segv, SA_SIGINFO | SA_ONSTACK ); // Invalid memory reference (default: Core)
+                __cfaabi_sigaction( SIGBUS , sigHandler_segv, SA_SIGINFO | SA_ONSTACK ); // Bus error, bad memory access (default: Core)
+                __cfaabi_sigaction( SIGILL , sigHandler_ill , SA_SIGINFO | SA_ONSTACK ); // Illegal Instruction (default: Core)
+                __cfaabi_sigaction( SIGFPE , sigHandler_fpe , SA_SIGINFO | SA_ONSTACK ); // Floating-point exception (default: Core)
+                // handlers to outside errors
+                // reset in case they insist and send it over and over
+                __cfaabi_sigaction( SIGTERM, sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND ); // Termination signal (default: Term)
+                __cfaabi_sigaction( SIGINT , sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND ); // Interrupt from keyboard (default: Term)
+                __cfaabi_sigaction( SIGHUP , sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND ); // Hangup detected on controlling terminal or death of controlling process (default: Term)
+                __cfaabi_sigaction( SIGQUIT, sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND ); // Quit from keyboard (default: Core)
+                __cfaabi_sigaction( SIGABRT, sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND ); // Abort signal from abort(3) (default: Core)
+        }
+}
 …
+}
+void * kernel_abort( void ) __attribute__(( __nothrow__, __leaf__, __weak__ )) { return 0p; }
 void kernel_abort_msg( void * data, char buffer[], int size ) __attribute__(( __nothrow__, __leaf__, __weak__ )) {}
+// See concurrency/kernel.cfa for strong definition used in multi-processor mode.
 int kernel_abort_lastframe( void ) __attribute__(( __nothrow__, __leaf__, __weak__ )) { return 4; }
+// See concurrency/kernel.cfa and concurrency/preemption.cfa for strong definition used in multi-processor mode.
+void __kernel_abort_lock( void ) __attribute__(( __nothrow__, __leaf__, __weak__ )) {}
+void __kernel_abort_msg( char buffer[], int size ) __attribute__(( __nothrow__, __leaf__, __weak__ )) {}
+int __kernel_abort_lastframe( void ) __attribute__(( __nothrow__, __leaf__, __weak__ )) { return 4; }
 enum { abort_text_size = 1024 };
 …
 static void __cfaabi_backtrace( int start ) {
         enum { Frames = 50, };                                                          // maximum number of stack frames
         int last = kernel_abort_lastframe();                            // skip last N stack frames
+        int last = __kernel_abort_lastframe();                          // skip last N stack frames
         void * array[Frames];
 …
+}
 static volatile int __abort_stage = 0;
+static volatile bool __abort_first = 0;
 // Cannot forward va_list.
 void __abort( bool signalAbort, const char fmt[], va_list args ) {
+        int stage = __atomic_add_fetch( &__abort_stage, 1, __ATOMIC_SEQ_CST );
+        // First stage: stop the cforall kernel and print
+        if(stage == 1) {
+                // increment stage
+                stage = __atomic_add_fetch( &__abort_stage, 1, __ATOMIC_SEQ_CST );
+                // must be done here to lock down kernel
+                void * kernel_data = kernel_abort();
+                int len;
+                signal( SIGABRT, SIG_DFL );                                                     // prevent final "real" abort from recursing to handler
+                len = snprintf( abort_text, abort_text_size, "Cforall Runtime error (UNIX pid:%ld) ", (long int)getpid() ); // use UNIX pid (versus getPid)
+                __cfaabi_bits_write( STDERR_FILENO, abort_text, len );
+                assert( fmt );
+                len = vsnprintf( abort_text, abort_text_size, fmt, args );
+                __cfaabi_bits_write( STDERR_FILENO, abort_text, len );
+                // add optional newline if missing at the end of the format text
+                if ( fmt[strlen( fmt ) - 1] != '\n' ) {
+                        __cfaabi_bits_write( STDERR_FILENO, "\n", 1 );
+                } // if
+                kernel_abort_msg( kernel_data, abort_text, abort_text_size );
+        }
+        // Second stage: print the backtrace
+        if(stage == 2) {
+                // increment stage
+                stage = __atomic_add_fetch( &__abort_stage, 1, __ATOMIC_SEQ_CST );
+                // print stack trace in handler
+                __cfaabi_backtrace( signalAbort ? 4 : 2 );
+        }
+        do {
+                // Finally call abort
+        // Multiple threads can come here from multiple paths
+        // To make sure this is safe any concurrent/subsequent call to abort is redirected to libc-abort
+        bool first = ! __atomic_test_and_set( &__abort_first, __ATOMIC_SEQ_CST);
+        // Prevent preemption from kicking-in and messing with the abort
+        __kernel_abort_lock();
+        // first to abort ?
+        if ( !first ) {
+                // We aren't the first to abort just let C handle it
+                signal( SIGABRT, SIG_DFL );     // restore default in case we came here through the function.
                 __cabi_libc.abort();
+                // Loop so that we never return
+        } while(true);
+        }
+        int len = snprintf( abort_text, abort_text_size, "Cforall Runtime error (UNIX pid:%ld) ", (long int)getpid() ); // use UNIX pid (versus getPid)
+        __cfaabi_bits_write( STDERR_FILENO, abort_text, len );
+        // print the cause of the error
+        assert( fmt );
+        len = vsnprintf( abort_text, abort_text_size, fmt, args );
+        __cfaabi_bits_write( STDERR_FILENO, abort_text, len );
+        // add optional newline if missing at the end of the format text
+        if ( fmt[strlen( fmt ) - 1] != '\n' ) {
+                __cfaabi_bits_write( STDERR_FILENO, "\n", 1 );
+        } // if
+        // Give the kernel the chance to add some data in here
+        __kernel_abort_msg( abort_text, abort_text_size );
+        // print stack trace in handler
+        __cfaabi_backtrace( signalAbort ? 4 : 2 );
+        // Finally call abort
+        __cabi_libc.abort();
+}

libcfa/src/iostream.cfa

-              r342af53
+              r8e4aa05
 // Created On       : Wed May 27 17:56:53 2015
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Mon Aug 24 08:31:35 2020
 // Update Count     : 1130
+// Last Modified On : Tue Mar  2 14:51:30 2021
+// Update Count     : 1151
 //
 …
 forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
         ostype & ?|?( ostype & os, bool b ) {
                 if ( $sepPrt( os ) ) fmt( os, "%s", $sepGetCur( os ) );
 …
         } // ?|?
         ostype & ?|?( ostype & os, const char str[] ) {
+        ostype & ?|?( ostype & os, const char s[] ) {
                 enum { Open = 1, Close, OpenClose };
                 static const unsigned char mask[256] @= {
 …
                 }; // mask
           if ( str[0] == '\0' ) { sepOff( os ); return os; } // null string => no separator
+          if ( s[0] == '\0' ) { sepOff( os ); return os; } // null string => no separator
                 // first character IS NOT spacing or closing punctuation => add left separator
                 unsigned char ch = str[0];                                              // must make unsigned
+                unsigned char ch = s[0];                                                // must make unsigned
                 if ( $sepPrt( os ) && mask[ ch ] != Close && mask[ ch ] != OpenClose ) {
                         fmt( os, "%s", $sepGetCur( os ) );
 …
                 // last character IS spacing or opening punctuation => turn off separator for next item
                 size_t len = strlen( str );
                 ch = str[len - 1];                                                              // must make unsigned
+                size_t len = strlen( s );
+                ch = s[len - 1];                                                                // must make unsigned
                 if ( $sepPrt( os ) && mask[ ch ] != Open && mask[ ch ] != OpenClose ) {
                         sepOn( os );
 …
                 } // if
                 if ( ch == '\n' ) $setNL( os, true );                   // check *AFTER* $sepPrt call above as it resets NL flag
+                return write( os, str, len );
+        } // ?|?
+        void ?|?( ostype & os, const char str[] ) {
+                (ostype &)(os | str); ends( os );
+        } // ?|?
+//      ostype & ?|?( ostype & os, const char16_t * str ) {
+                return write( os, s, len );
+        } // ?|?
+        void ?|?( ostype & os, const char s[] ) {
+                (ostype &)(os | s); ends( os );
+        } // ?|?
+//      ostype & ?|?( ostype & os, const char16_t * s ) {
 //              if ( $sepPrt( os ) ) fmt( os, "%s", $sepGetCur( os ) );
 //              fmt( os, "%ls", str );
+//              fmt( os, "%ls", s );
 //              return os;
 //      } // ?|?
 // #if ! ( __ARM_ARCH_ISA_ARM == 1 && __ARM_32BIT_STATE == 1 ) // char32_t == wchar_t => ambiguous
 //      ostype & ?|?( ostype & os, const char32_t * str ) {
+//      ostype & ?|?( ostype & os, const char32_t * s ) {
 //              if ( $sepPrt( os ) ) fmt( os, "%s", $sepGetCur( os ) );
 //              fmt( os, "%ls", str );
+//              fmt( os, "%ls", s );
 //              return os;
 //      } // ?|?
 // #endif // ! ( __ARM_ARCH_ISA_ARM == 1 && __ARM_32BIT_STATE == 1 )
 //      ostype & ?|?( ostype & os, const wchar_t * str ) {
+//      ostype & ?|?( ostype & os, const wchar_t * s ) {
 //              if ( $sepPrt( os ) ) fmt( os, "%s", $sepGetCur( os ) );
 //              fmt( os, "%ls", str );
+//              fmt( os, "%ls", s );
 //              return os;
 //      } // ?|?
 …
         // manipulators
         ostype & ?|?( ostype & os, ostype & (* manip)( ostype & ) ) {
+                (ostype &)(manip( os ));
+                return os;
+                return manip( os );
         } // ?|?
         void ?|?( ostype & os, ostype & (* manip)( ostype & ) ) {
                 (ostype &)(manip( os ));
+                manip( os );
                 if ( $getPrt( os ) ) ends( os );                                // something printed ?
                 $setPrt( os, false );                                                   // turn off
 …
                 return os;
         } // nlOff
+        ostype & acquire( ostype & os ) {                                               // stream-returning wrapper; has the ostype & (*)( ostype & ) shape accepted by the manipulator ?|? overloads
+                acquire( os );                                                                  // call the void-returning acquire required by trait ostream (not this function)
+                return os;                                                                      // hand back the same stream so the expression can continue
+        } // acquire
 } // distribution
 // tuples
 forall( dtype ostype, otype T, ttype Params | writeable( T, ostype ) | { ostype & ?|?( ostype &, Params ); } ) {
+forall( ostype &, T, Params... | writeable( T, ostype ) | { ostype & ?|?( ostype &, Params ); } ) {
         ostype & ?|?( ostype & os, T arg, Params rest ) {
                 (ostype &)(os | arg);                                                   // print first argument
 …
 // writes the range [begin, end) to the given stream
 forall( dtype ostype, otype elt_type | writeable( elt_type, ostype ), otype iterator_type | iterator( iterator_type, elt_type ) ) {
+forall( ostype &, elt_type | writeable( elt_type, ostype ), iterator_type | iterator( iterator_type, elt_type ) ) {
         void write( iterator_type begin, iterator_type end, ostype & os ) {
                 void print( elt_type i ) { os | i; }
 …
 // Default prefix for non-decimal prints is 0b, 0, 0x.
 #define IntegralFMTImpl( T, IFMTNP, IFMTP ) \
 forall( dtype ostype | ostream( ostype ) ) { \
+forall( ostype & | ostream( ostype ) ) { \
         ostype & ?|?( ostype & os, _Ostream_Manip(T) f ) { \
                 if ( $sepPrt( os ) ) fmt( os, "%s", $sepGetCur( os ) ); \
 …
                 return os; \
         } /* ?|? */ \
+        void ?|?( ostype & os, _Ostream_Manip(T) f ) { (ostype &)(os | f); ends( os ); } \
+        void ?|?( ostype & os, _Ostream_Manip(T) f ) { \
+                (ostype &)(os | f); ends( os ); \
+        } /* ?|? */ \
 } // distribution
 …
 // Default prefix for non-decimal prints is 0b, 0, 0x.
 #define IntegralFMTImpl128( T, SIGNED, CODE, IFMTNP, IFMTP ) \
 forall( dtype ostype | ostream( ostype ) ) \
+forall( ostype & | ostream( ostype ) ) \
 static void base10_128( ostype & os, _Ostream_Manip(T) f ) { \
         if ( f.val > UINT64_MAX ) { \
 …
         } /* if */ \
 } /* base10_128 */ \
 forall( dtype ostype | ostream( ostype ) ) { \
+forall( ostype & | ostream( ostype ) ) { \
         ostype & ?|?( ostype & os, _Ostream_Manip(T) f ) { \
                 if ( $sepPrt( os ) ) fmt( os, "%s", $sepGetCur( os ) ); \
 …
 #if defined( __SIZEOF_INT128__ )
 // Default prefix for non-decimal prints is 0b, 0, 0x.
 forall( dtype ostype | ostream( ostype ) )
+forall( ostype & | ostream( ostype ) )
 static inline void base_128( ostype & os, unsigned int128 val, unsigned int128 power, _Ostream_Manip(uint64_t) & f, unsigned int maxdig, unsigned int bits, unsigned int cnt = 0 ) {
         int wd = 1;                                                                                     // f.wd is never 0 because 0 implies left-pad
 …
 #define IntegralFMTImpl128( T ) \
 forall( dtype ostype | ostream( ostype ) ) { \
+forall( ostype & | ostream( ostype ) ) { \
         ostype & ?|?( ostype & os, _Ostream_Manip(T) f ) { \
                 _Ostream_Manip(uint64_t) fmt; \
 …
 #define FloatingPointFMTImpl( T, DFMTNP, DFMTP ) \
 forall( dtype ostype | ostream( ostype ) ) { \
+forall( ostype & | ostream( ostype ) ) { \
         ostype & ?|?( ostype & os, _Ostream_Manip(T) f ) { \
                 if ( $sepPrt( os ) ) fmt( os, "%s", $sepGetCur( os ) ); \
 …
 // *********************************** character ***********************************
 forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
         ostype & ?|?( ostype & os, _Ostream_Manip(char) f ) {
                 if ( f.base != 'c' ) {                                                  // bespoke binary/octal/hex format
 …
 // *********************************** C string ***********************************
 forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
         ostype & ?|?( ostype & os, _Ostream_Manip(const char *) f ) {
                 if ( ! f.val ) return os;                                               // null pointer ?
 …
 forall( dtype istype | istream( istype ) ) {
+forall( istype & | istream( istype ) ) {
         istype & ?|?( istype & is, bool & b ) {
                 char val[6];
 …
                 return is;
         } // ?|?
+        void ?|?( istype & is, bool & b ) {
+                (istype &)(is | b); ends( is );
+        } // ?|?
         istype & ?|?( istype & is, char & c ) {
 …
                 return is;
         } // ?|?
+        void ?|?( istype & is, char & c ) {
+                (istype &)(is | c); ends( is );
+        } // ?|?
         istype & ?|?( istype & is, signed char & sc ) {
 …
                 return is;
         } // ?|?
+        void ?|?( istype & is, signed char & sc ) {
+                (istype &)(is | sc); ends( is );
+        } // ?|?
         istype & ?|?( istype & is, unsigned char & usc ) {
 …
                 return is;
         } // ?|?
+        void ?|?( istype & is, unsigned char & usc ) {
+                (istype &)(is | usc); ends( is );
+        } // ?|?
         istype & ?|?( istype & is, short int & si ) {
 …
                 return is;
         } // ?|?
+        void ?|?( istype & is, short int & si ) {
+                (istype &)(is | si); ends( is );
+        } // ?|?
         istype & ?|?( istype & is, unsigned short int & usi ) {
 …
                 return is;
         } // ?|?
+        void ?|?( istype & is, unsigned short int & usi ) {
+                (istype &)(is | usi); ends( is );
+        } // ?|?
         istype & ?|?( istype & is, int & i ) {
 …
                 return is;
         } // ?|?
+        void ?|?( istype & is, int & i ) {
+                (istype &)(is | i); ends( is );
+        } // ?|?
         istype & ?|?( istype & is, unsigned int & ui ) {
 …
                 return is;
         } // ?|?
+        void ?|?( istype & is, unsigned int & ui ) {
+                (istype &)(is | ui); ends( is );
+        } // ?|?
         istype & ?|?( istype & is, long int & li ) {
 …
                 return is;
         } // ?|?
+        void ?|?( istype & is, long int & li ) {
+                (istype &)(is | li); ends( is );
+        } // ?|?
         istype & ?|?( istype & is, unsigned long int & ulli ) {
 …
                 return is;
         } // ?|?
+        void ?|?( istype & is, unsigned long int & ulli ) {
+                (istype &)(is | ulli); ends( is );
+        } // ?|?
         istype & ?|?( istype & is, long long int & lli ) {
 …
                 return is;
         } // ?|?
+        void ?|?( istype & is, long long int & lli ) {
+                (istype &)(is | lli); ends( is );
+        } // ?|?
         istype & ?|?( istype & is, unsigned long long int & ulli ) {
 …
                 return is;
         } // ?|?
+        // Statement form of reading an unsigned long long int: performs the read through the
+        // istype &-returning overload, then calls ends( is ) to finish the statement.
+        // Returns void (previously written "void &") so the definition agrees with the
+        // declaration in iostream.hfa and with every other void-returning input operator.
+        void ?|?( istype & is, unsigned long long int & ulli ) {
+                (istype &)(is | ulli); ends( is );              // cast to istype & picks the reading overload rather than this one
+        } // ?|?
 #if defined( __SIZEOF_INT128__ )
+        istype & ?|?( istype & is, int128 & i128 ) {
+                return (istype &)(is | (unsigned int128 &)i128);
+        } // ?|?
+        istype & ?|?( istype & is, unsigned int128 & ui128 ) {
+        istype & ?|?( istype & is, int128 & llli ) {
+                return (istype &)(is | (unsigned int128 &)llli);
+        } // ?|?
+        void ?|?( istype & is, int128 & llli ) {
+                (istype &)(is | llli); ends( is );
+        } // ?|?
+        istype & ?|?( istype & is, unsigned int128 & ullli ) {
                 char s[40];
                 bool sign = false;
 …
                 // If the input is too large, the value returned is undefined. If there is no input, no value is returned
                 if ( fmt( is, "%39[0-9]%*[0-9]", s ) == 1 ) {   // take first 39 characters, ignore remaining
                         ui128 = 0;
+                        ullli = 0;
                         for ( unsigned int i = 0; s[i] != '\0'; i += 1 ) {
                                 ui128 = ui128 * 10 + s[i] - '0';
+                                ullli = ullli * 10 + s[i] - '0';
                         } // for
                         if ( sign ) ui128 = -ui128;
+                        if ( sign ) ullli = -ullli;
                 } else if ( sign ) ungetc( is, '-' );                   // return minus when no digits
                 return is;
+        } // ?|?
+        void ?|?( istype & is, unsigned int128 & ullli ) {
+                (istype &)(is | ullli); ends( is );
         } // ?|?
 #endif // __SIZEOF_INT128__
 …
                 return is;
         } // ?|?
+        void ?|?( istype & is, float & f ) {
+                (istype &)(is | f); ends( is );
+        } // ?|?
         istype & ?|?( istype & is, double & d ) {
 …
                 return is;
         } // ?|?
+        void ?|?( istype & is, double & d ) {
+                (istype &)(is | d); ends( is );
+        } // ?|?
         istype & ?|?( istype & is, long double & ld ) {
 …
                 return is;
         } // ?|?
+        void ?|?( istype & is, long double & ld ) {
+                (istype &)(is | ld); ends( is );
+        } // ?|?
         istype & ?|?( istype & is, float _Complex & fc ) {
 …
                 return is;
         } // ?|?
+        void ?|?( istype & is, float _Complex & fc ) {
+                (istype &)(is | fc); ends( is );
+        } // ?|?
         istype & ?|?( istype & is, double _Complex & dc ) {
 …
                 return is;
         } // ?|?
+        void ?|?( istype & is, double _Complex & dc ) {
+                (istype &)(is | dc); ends( is );
+        } // ?|?
         istype & ?|?( istype & is, long double _Complex & ldc ) {
 …
                 return is;
         } // ?|?
+        void ?|?( istype & is, long double _Complex & ldc ) {
+                (istype &)(is | ldc); ends( is );
+        } // ?|?
         // istype & ?|?( istype & is, const char fmt[] ) {
 …
         // } // ?|?
         istype & ?|?( istype & is, char * s ) {
+        istype & ?|?( istype & is, char s[] ) {
                 fmt( is, "%s", s );
                 return is;
+        } // ?|?
+        void ?|?( istype & is, char s[] ) {
+                (istype &)(is | s); ends( is );
         } // ?|?
 …
                 return manip( is );
         } // ?|?
+        void ?|?( istype & is, istype & (* manip)( istype & ) ) {
+                manip( is ); ends( is );
+        } // ?|?
         istype & nl( istype & is ) {
 …
                 return is;
         } // nlOff
+        istype & acquire( istype & is ) {                                               // stream-returning wrapper; has the istype & (*)( istype & ) shape accepted by the manipulator ?|? overloads
+                acquire( is );                                                                  // call the void-returning acquire required by trait istream (not this function)
+                return is;                                                                      // hand back the same stream so the expression can continue
+        } // acquire
 } // distribution
 // *********************************** manipulators ***********************************
+forall( dtype istype | istream( istype ) )
+istype & ?|?( istype & is, _Istream_Cstr f ) {
+        // skip xxx
+        if ( ! f.s ) {
+                // printf( "skip %s %d\n", f.scanset, f.wd );
+                if ( f.wd == -1 ) fmt( is, f.scanset, "" );             // no input arguments
+                else for ( f.wd ) fmt( is, "%*c" );
+                return is;
+        } // if
+        size_t len = 0;
+        if ( f.scanset ) len = strlen( f.scanset );
+        char fmtstr[len + 16];
+        int start = 1;
+        fmtstr[0] = '%';
+        if ( f.flags.ignore ) { fmtstr[1] = '*'; start += 1; }
+        if ( f.wd != -1 ) { start += sprintf( &fmtstr[start], "%d", f.wd ); }
+        // cstr %s, %*s, %ws, %*ws
+        if ( ! f.scanset ) {
+                fmtstr[start] = 's'; fmtstr[start + 1] = '\0';
+                // printf( "cstr %s\n", fmtstr );
+forall( istype & | istream( istype ) ) {
+        istype & ?|?( istype & is, _Istream_Cstr f ) {
+                // skip xxx
+                if ( ! f.s ) {
+                        // printf( "skip %s %d\n", f.scanset, f.wd );
+                        if ( f.wd == -1 ) fmt( is, f.scanset, "" );             // no input arguments
+                        else for ( f.wd ) fmt( is, "%*c" );
+                        return is;
+                } // if
+                size_t len = 0;
+                if ( f.scanset ) len = strlen( f.scanset );
+                char fmtstr[len + 16];
+                int start = 1;
+                fmtstr[0] = '%';
+                if ( f.flags.ignore ) { fmtstr[1] = '*'; start += 1; }
+                if ( f.wd != -1 ) { start += sprintf( &fmtstr[start], "%d", f.wd ); }
+                // cstr %s, %*s, %ws, %*ws
+                if ( ! f.scanset ) {
+                        fmtstr[start] = 's'; fmtstr[start + 1] = '\0';
+                        // printf( "cstr %s\n", fmtstr );
+                        fmt( is, fmtstr, f.s );
+                        return is;
+                } // if
+                // incl %[xxx],  %*[xxx],  %w[xxx],  %*w[xxx]
+                // excl %[^xxx], %*[^xxx], %w[^xxx], %*w[^xxx]
+                fmtstr[start] = '['; start += 1;
+                if ( f.flags.inex ) { fmtstr[start] = '^'; start += 1; }
+                strcpy( &fmtstr[start], f.scanset );                            // copy includes '\0'
+                len += start;
+                fmtstr[len] = ']'; fmtstr[len + 1] = '\0';
+                // printf( "incl/excl %s\n", fmtstr );
                 fmt( is, fmtstr, f.s );
                 return is;
+        } // if
+        // incl %[xxx],  %*[xxx],  %w[xxx],  %*w[xxx]
+        // excl %[^xxx], %*[^xxx], %w[^xxx], %*w[^xxx]
+        fmtstr[start] = '['; start += 1;
+        if ( f.flags.inex ) { fmtstr[start] = '^'; start += 1; }
+        strcpy( &fmtstr[start], f.scanset );                            // copy includes '\0'
+        len += start;
+        fmtstr[len] = ']'; fmtstr[len + 1] = '\0';
+        // printf( "incl/excl %s\n", fmtstr );
+        fmt( is, fmtstr, f.s );
+        return is;
+} // ?|?
+forall( dtype istype | istream( istype ) )
+istype & ?|?( istype & is, _Istream_Char f ) {
+        fmt( is, "%*c" );                                                                       // argument variable unused
+        return is;
+} // ?|?
+        } // ?|?
+        void ?|?( istype & is, _Istream_Cstr f ) {
+                (istype &)(is | f); ends( is );
+        } // ?|?
+        istype & ?|?( istype & is, _Istream_Char f ) {
+                fmt( is, "%*c" );                                                                       // argument variable unused
+                return is;
+        } // ?|?
+        void ?|?( istype & is, _Istream_Char f ) {
+                (istype &)(is | f); ends( is );
+        } // ?|?
+} // distribution
 #define InputFMTImpl( T, CODE ) \
+forall( dtype istype | istream( istype ) ) \
+istype & ?|?( istype & is, _Istream_Manip(T) f ) { \
+        enum { size = 16 }; \
+        char fmtstr[size]; \
+        if ( f.wd == -1 ) { \
+                snprintf( fmtstr, size, "%%%s%s", f.ignore ? "*" : "", CODE ); \
+        } else { \
+                snprintf( fmtstr, size, "%%%s%d%s", f.ignore ? "*" : "", f.wd, CODE ); \
+        } /* if */ \
+        /* printf( "%d %s %p\n", f.wd, fmtstr, &f.val ); */ \
+        fmt( is, fmtstr, &f.val ); \
+        return is; \
+} // ?|?
+forall( istype & | istream( istype ) ) { \
+        istype & ?|?( istype & is, _Istream_Manip(T) f ) { \
+                enum { size = 16 }; \
+                char fmtstr[size]; \
+                if ( f.wd == -1 ) { \
+                        snprintf( fmtstr, size, "%%%s%s", f.ignore ? "*" : "", CODE ); \
+                } else { \
+                        snprintf( fmtstr, size, "%%%s%d%s", f.ignore ? "*" : "", f.wd, CODE ); \
+                } /* if */ \
+                /* printf( "%d %s %p\n", f.wd, fmtstr, &f.val ); */ \
+                fmt( is, fmtstr, &f.val ); \
+                return is; \
+        } /* ?|? */ \
+        void ?|?( istype & is, _Istream_Manip(T) f ) { \
+                (istype &)(is | f); ends( is ); \
+        } /* ?|? */ \
+} // distribution
 InputFMTImpl( signed char, "hhi" )
 …
 InputFMTImpl( long double, "Lf" )
+forall( dtype istype | istream( istype ) )
+istype & ?|?( istype & is, _Istream_Manip(float _Complex) fc ) {
+        float re, im;
+        _Istream_Manip(float) fmtuc @= { re, fc.wd, fc.ignore };
+        is | fmtuc;
+        &fmtuc.val = &im;
+        is | fmtuc;
+        if ( ! fc.ignore ) fc.val = re + im * _Complex_I;       // re/im are uninitialized for ignore
+        return is;
+} // ?|?
+forall( dtype istype | istream( istype ) )
+istype & ?|?( istype & is, _Istream_Manip(double _Complex) dc ) {
+        double re, im;
+        _Istream_Manip(double) fmtuc @= { re, dc.wd, dc.ignore };
+        is | fmtuc;
+        &fmtuc.val = &im;
+        is | fmtuc;
+        if ( ! dc.ignore ) dc.val = re + im * _Complex_I;       // re/im are uninitialized for ignore
+        return is;
+} // ?|?
+forall( dtype istype | istream( istype ) )
+istype & ?|?( istype & is, _Istream_Manip(long double _Complex) ldc ) {
+        long double re, im;
+        _Istream_Manip(long double) fmtuc @= { re, ldc.wd, ldc.ignore };
+        is | fmtuc;
+        &fmtuc.val = &im;
+        is | fmtuc;
+        if ( ! ldc.ignore ) ldc.val = re + im * _Complex_I;     // re/im are uninitialized for ignore
+        return is;
+} // ?|?
+forall( istype & | istream( istype ) ) { // input of complex values through _Istream_Manip: each value is read as two real-typed fields (re then im)
+        istype & ?|?( istype & is, _Istream_Manip(float _Complex) fc ) { // expression form: returns the stream
+                float re, im;
+                _Istream_Manip(float) fmtuc @= { re, fc.wd, fc.ignore }; // float manipulator bound to re, reusing the caller's width and ignore settings
+                is | fmtuc; // first field goes to re
+                &fmtuc.val = &im; // rebind the manipulator's reference field so the next read targets im
+                is | fmtuc; // second field goes to im
+                if ( ! fc.ignore ) fc.val = re + im * _Complex_I; // re/im are uninitialized for ignore
+                return is;
+        } // ?|?
+        void ?|?( istype & is, _Istream_Manip(float _Complex) fc ) { // statement form: read, then end the statement
+                (istype &)(is | fc); ends( is ); // cast to istype & picks the reading overload rather than this one
+        } // ?|?
+        istype & ?|?( istype & is, _Istream_Manip(double _Complex) dc ) { // expression form: returns the stream
+                double re, im;
+                _Istream_Manip(double) fmtuc @= { re, dc.wd, dc.ignore }; // double manipulator bound to re, reusing the caller's width and ignore settings
+                is | fmtuc; // first field goes to re
+                &fmtuc.val = &im; // rebind the manipulator's reference field so the next read targets im
+                is | fmtuc; // second field goes to im
+                if ( ! dc.ignore ) dc.val = re + im * _Complex_I; // re/im are uninitialized for ignore
+                return is;
+        } // ?|?
+        void ?|?( istype & is, _Istream_Manip(double _Complex) dc ) { // statement form: read, then end the statement
+                (istype &)(is | dc); ends( is ); // cast to istype & picks the reading overload rather than this one
+        } // ?|?
+        istype & ?|?( istype & is, _Istream_Manip(long double _Complex) ldc ) { // expression form: returns the stream
+                long double re, im;
+                _Istream_Manip(long double) fmtuc @= { re, ldc.wd, ldc.ignore }; // long double manipulator bound to re, reusing the caller's width and ignore settings
+                is | fmtuc; // first field goes to re
+                &fmtuc.val = &im; // rebind the manipulator's reference field so the next read targets im
+                is | fmtuc; // second field goes to im
+                if ( ! ldc.ignore ) ldc.val = re + im * _Complex_I;     // re/im are uninitialized for ignore
+                return is;
+        } // ?|?
+        void ?|?( istype & is, _Istream_Manip(long double _Complex) ldc ) { // statement form: read, then end the statement
+                (istype &)(is | ldc); ends( is ); // cast to istype & picks the reading overload rather than this one
+        } // ?|?
+} // distribution
 // Local Variables: //

libcfa/src/iostream.hfa

-              r342af53
+              r8e4aa05
 // Created On       : Wed May 27 17:56:53 2015
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Aug 11 22:16:14 2020
 // Update Count     : 350
+// Last Modified On : Tue Mar  2 14:05:08 2021
+// Update Count     : 369
 //
 …
 trait ostream( dtype ostype ) {
+trait ostream( ostype & ) {
         // private
         bool $sepPrt( ostype & );                                                       // get separator state (on/off)
 …
         ostype & write( ostype &, const char [], size_t );
         int fmt( ostype &, const char format[], ... ) __attribute__(( format(printf, 2, 3) ));
+        void acquire( ostype & );
 }; // ostream
 // trait writeable( otype T ) {
 //      forall( dtype ostype | ostream( ostype ) ) ostype & ?|?( ostype &, T );
+// trait writeable( T ) {
+//      forall( ostype & | ostream( ostype ) ) ostype & ?|?( ostype &, T );
 // }; // writeable
 trait writeable( otype T, dtype ostype | ostream( ostype ) ) {
+trait writeable( T, ostype & | ostream( ostype ) ) {
         ostype & ?|?( ostype &, T );
 }; // writeable
 …
 // implement writeable for intrinsic types
 forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
         ostype & ?|?( ostype &, bool );
         void ?|?( ostype &, bool );
 …
         ostype & nlOn( ostype & );
         ostype & nlOff( ostype & );
+        ostype & acquire( ostype & );
 } // distribution
 // tuples
 forall( dtype ostype, otype T, ttype Params | writeable( T, ostype ) | { ostype & ?|?( ostype &, Params ); } ) {
+forall( ostype &, T, Params... | writeable( T, ostype ) | { ostype & ?|?( ostype &, Params ); } ) {
         ostype & ?|?( ostype & os, T arg, Params rest );
         void ?|?( ostype & os, T arg, Params rest );
 …
 // writes the range [begin, end) to the given stream
 forall( dtype ostype, otype elt_type | writeable( elt_type, ostype ), otype iterator_type | iterator( iterator_type, elt_type ) ) {
+forall( ostype &, elt_type | writeable( elt_type, ostype ), iterator_type | iterator( iterator_type, elt_type ) ) {
         void write( iterator_type begin, iterator_type end, ostype & os );
         void write_reverse( iterator_type begin, iterator_type end, ostype & os );
 …
 // *********************************** manipulators ***********************************
 forall( otype T )
+forall( T )
 struct _Ostream_Manip {
         T val;                                                                                          // polymorphic base-type
 …
         _Ostream_Manip(T) & sign( _Ostream_Manip(T) & fmt ) { fmt.flags.sign = true; return fmt; } \
 } /* distribution */ \
 forall( dtype ostype | ostream( ostype ) ) { \
+forall( ostype & | ostream( ostype ) ) { \
         ostype & ?|?( ostype & os, _Ostream_Manip(T) f ); \
         void ?|?( ostype & os, _Ostream_Manip(T) f ); \
 …
         _Ostream_Manip(T) & nodp( _Ostream_Manip(T) & fmt ) { fmt.flags.nobsdp = true; return fmt; } \
 } /* distribution */ \
 forall( dtype ostype | ostream( ostype ) ) { \
+forall( ostype & | ostream( ostype ) ) { \
         ostype & ?|?( ostype & os, _Ostream_Manip(T) f ); \
         void ?|?( ostype & os, _Ostream_Manip(T) f ); \
 …
         _Ostream_Manip(char) & nobase( _Ostream_Manip(char) & fmt ) { fmt.flags.nobsdp = true; return fmt; }
 } // distribution
 forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
         ostype & ?|?( ostype & os, _Ostream_Manip(char) f );
         void ?|?( ostype & os, _Ostream_Manip(char) f );
 …
         _Ostream_Manip(const char *) & nobase( _Ostream_Manip(const char *) & fmt ) { fmt.flags.nobsdp = true; return fmt; }
 } // distribution
 forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
         ostype & ?|?( ostype & os, _Ostream_Manip(const char *) f );
         void ?|?( ostype & os, _Ostream_Manip(const char *) f );
 …
 trait istream( dtype istype ) {
+trait istream( istype & ) {
         void nlOn( istype & );                                                          // read newline
         void nlOff( istype & );                                                         // scan newline
         bool getANL( istype & );                                                        // get scan newline (on/off)
+        void ends( istype & os );                                                       // end of input statement
         int fail( istype & );
         int eof( istype & );
 …
         istype & ungetc( istype &, char );
         int fmt( istype &, const char format[], ... ) __attribute__(( format(scanf, 2, 3) ));
+        void acquire( istype & );
 }; // istream
 trait readable( otype T ) {
         forall( dtype istype | istream( istype ) ) istype & ?|?( istype &, T );
+trait readable( T ) {
+        forall( istype & | istream( istype ) ) istype & ?|?( istype &, T );
 }; // readable
 forall( dtype istype | istream( istype ) ) {
+forall( istype & | istream( istype ) ) {
         istype & ?|?( istype &, bool & );
+        void ?|?( istype &, bool & );
         istype & ?|?( istype &, char & );
+        void ?|?( istype &, char & );
         istype & ?|?( istype &, signed char & );
+        void ?|?( istype &, signed char & );
         istype & ?|?( istype &, unsigned char & );
+        void ?|?( istype &, unsigned char & );
         istype & ?|?( istype &, short int & );
+        void ?|?( istype &, short int & );
         istype & ?|?( istype &, unsigned short int & );
+        void ?|?( istype &, unsigned short int & );
         istype & ?|?( istype &, int & );
+        void ?|?( istype &, int & );
         istype & ?|?( istype &, unsigned int & );
+        void ?|?( istype &, unsigned int & );
         istype & ?|?( istype &, long int & );
+        void ?|?( istype &, long int & );
         istype & ?|?( istype &, unsigned long int & );
+        void ?|?( istype &, unsigned long int & );
         istype & ?|?( istype &, long long int & );
+        void ?|?( istype &, long long int & );
         istype & ?|?( istype &, unsigned long long int & );
+        void ?|?( istype &, unsigned long long int & );
 #if defined( __SIZEOF_INT128__ )
         istype & ?|?( istype &, int128 & );
+        void ?|?( istype &, int128 & );
         istype & ?|?( istype &, unsigned int128 & );
+        void ?|?( istype &, unsigned int128 & );
 #endif // __SIZEOF_INT128__
         istype & ?|?( istype &, float & );
+        void ?|?( istype &, float & );
         istype & ?|?( istype &, double & );
+        void ?|?( istype &, double & );
         istype & ?|?( istype &, long double & );
+        void ?|?( istype &, long double & );
         istype & ?|?( istype &, float _Complex & );
+        void ?|?( istype &, float _Complex & );
         istype & ?|?( istype &, double _Complex & );
+        void ?|?( istype &, double _Complex & );
         istype & ?|?( istype &, long double _Complex & );
+        void ?|?( istype &, long double _Complex & );
 //      istype & ?|?( istype &, const char [] );
+        istype & ?|?( istype &, char * );
+        istype & ?|?( istype &, char [] );
+        void ?|?( istype &, char [] );
         // manipulators
         istype & ?|?( istype &, istype & (*)( istype & ) );
+        void ?|?( istype &, istype & (*)( istype & ) );
         istype & nl( istype & is );
         istype & nlOn( istype & );
         istype & nlOff( istype & );
+        istype & acquire( istype & );
 } // distribution
 …
         _Istream_Cstr & wdi( unsigned int w, _Istream_Cstr & fmt ) { fmt.wd = w; return fmt; }
 } // distribution
+forall( dtype istype | istream( istype ) ) istype & ?|?( istype & is, _Istream_Cstr f );
+forall( istype & | istream( istype ) ) {
+        istype & ?|?( istype & is, _Istream_Cstr f );
+        void ?|?( istype & is, _Istream_Cstr f );
+}
 struct _Istream_Char {
 …
         _Istream_Char & ignore( _Istream_Char & fmt ) { fmt.ignore = true; return fmt; }
 } // distribution
+forall( dtype istype | istream( istype ) ) istype & ?|?( istype & is, _Istream_Char f );
+forall( dtype T | sized( T ) )
+forall( istype & | istream( istype ) ) {
+        istype & ?|?( istype & is, _Istream_Char f );
+        void ?|?( istype & is, _Istream_Char f );
+}
+forall( T & | sized( T ) )
 struct _Istream_Manip {
         T & val;                                                                                        // polymorphic base-type
 …
         _Istream_Manip(T) & wdi( unsigned int w, _Istream_Manip(T) & fmt ) { fmt.wd = w; return fmt; } \
 } /* distribution */ \
 forall( dtype istype | istream( istype ) ) { \
+forall( istype & | istream( istype ) ) { \
         istype & ?|?( istype & is, _Istream_Manip(T) f ); \
+        void ?|?( istype & is, _Istream_Manip(T) f ); \
 } // ?|?
 …
 #include <time_t.hfa>                                                                   // Duration (constructors) / Time (constructors)
 forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
         ostype & ?|?( ostype & os, Duration dur );
         void ?|?( ostype & os, Duration dur );

libcfa/src/iterator.cfa

-              r342af53
+              r8e4aa05
 #include "iterator.hfa"
 forall( otype iterator_type, otype elt_type | iterator( iterator_type, elt_type ) )
+forall( iterator_type, elt_type | iterator( iterator_type, elt_type ) )
 void for_each( iterator_type begin, iterator_type end, void (* func)( elt_type ) ) {
         for ( iterator_type i = begin; i != end; ++i ) {
 …
 } // for_each
 forall( otype iterator_type, otype elt_type | iterator( iterator_type, elt_type ) )
+forall( iterator_type, elt_type | iterator( iterator_type, elt_type ) )
 void for_each_reverse( iterator_type begin, iterator_type end, void (* func)( elt_type ) ) {
         for ( iterator_type i = end; i != begin; ) {

libcfa/src/iterator.hfa

-              r342af53
+              r8e4aa05
 // An iterator can be used to traverse a data structure.
 trait iterator( otype iterator_type, otype elt_type ) {
+trait iterator( iterator_type, elt_type ) {
         // point to the next element
 //      iterator_type ?++( iterator_type & );
 …
 };
 trait iterator_for( otype iterator_type, otype collection_type, otype elt_type | iterator( iterator_type, elt_type ) ) {
+trait iterator_for( iterator_type, collection_type, elt_type | iterator( iterator_type, elt_type ) ) {
 //      [ iterator_type begin, iterator_type end ] get_iterators( collection_type );
         iterator_type begin( collection_type );
 …
 };
 forall( otype iterator_type, otype elt_type | iterator( iterator_type, elt_type ) )
+forall( iterator_type, elt_type | iterator( iterator_type, elt_type ) )
 void for_each( iterator_type begin, iterator_type end, void (* func)( elt_type ) );
 forall( otype iterator_type, otype elt_type | iterator( iterator_type, elt_type ) )
+forall( iterator_type, elt_type | iterator( iterator_type, elt_type ) )
 void for_each_reverse( iterator_type begin, iterator_type end, void (* func)( elt_type ) );

libcfa/src/math.hfa

-              r342af53
+              r8e4aa05
         unsigned long long int floor( unsigned long long int n, unsigned long long int align ) { return n / align * align; }
         // forall( otype T | { T ?/?( T, T ); T ?*?( T, T ); } )
+        // forall( T | { T ?/?( T, T ); T ?*?( T, T ); } )
         // T floor( T n, T align ) { return n / align * align; }
 …
         unsigned long long int ceiling_div( unsigned long long int n, unsigned long long int align ) { return (n + (align - 1)) / align; }
         // forall( otype T | { T ?+?( T, T ); T ?-?( T, T ); T ?%?( T, T ); } )
+        // forall( T | { T ?+?( T, T ); T ?-?( T, T ); T ?%?( T, T ); } )
         // T ceiling_div( T n, T align ) { verify( is_pow2( align ) );return (n + (align - 1)) / align; }
 …
         unsigned long long int ceiling( unsigned long long int n, unsigned long long int align ) { return floor( n + (n % align != 0 ? align - 1 : 0), align ); }
         // forall( otype T | { void ?{}( T &, one_t ); T ?+?( T, T ); T ?-?( T, T ); T ?/?( T, T ); } )
+        // forall( T | { void ?{}( T &, one_t ); T ?+?( T, T ); T ?-?( T, T ); T ?/?( T, T ); } )
         // T ceiling( T n, T align ) { return floor( n + (n % align != 0 ? align - 1 : 0), align ); }
 …
 static inline {
         forall( otype T | { void ?{}( T &, one_t ); T ?+?( T, T ); T ?-?( T, T );T ?*?( T, T ); } )
+        forall( T | { void ?{}( T &, one_t ); T ?+?( T, T ); T ?-?( T, T );T ?*?( T, T ); } )
         T lerp( T x, T y, T a ) { return x * ((T){1} - a) + y * a; }
         forall( otype T | { void ?{}( T &, zero_t ); void ?{}( T &, one_t ); int ?<?( T, T ); } )
+        forall( T | { void ?{}( T &, zero_t ); void ?{}( T &, one_t ); int ?<?( T, T ); } )
         T step( T edge, T x ) { return x < edge ? (T){0} : (T){1}; }
         forall( otype T | { void ?{}( T &, int ); T clamp( T, T, T ); T ?-?( T, T ); T ?*?( T, T ); T ?/?( T, T ); } )
+        forall( T | { void ?{}( T &, int ); T clamp( T, T, T ); T ?-?( T, T ); T ?*?( T, T ); T ?/?( T, T ); } )
         T smoothstep( T edge0, T edge1, T x ) { T t = clamp( (x - edge0) / (edge1 - edge0), (T){0}, (T){1} ); return t * t * ((T){3} - (T){2} * t); }
 } // distribution

libcfa/src/memory.cfa

-              r342af53
+              r8e4aa05
 // Created On       : Tue Jun  2 16:48:00 2020
 // Last Modified By : Andrew Beach
 // Last Modified On : Tue Jun  3 12:30:00 2020
 // Update Count     : 0
+// Last Modified On : Mon Feb  1 16:10:00 2021
+// Update Count     : 1
 //
 …
 // Internal data object.
 forall(dtype T | sized(T), ttype Args | { void ?{}(T &, Args); })
+forall(T & | sized(T), Args... | { void ?{}(T &, Args); })
 void ?{}(counter_data(T) & this, Args args) {
         (this.counter){1};
 …
+}
 forall(dtype T | sized(T) | { void ^?{}(T &); })
+forall(T & | sized(T) | { void ^?{}(T &); })
 void ^?{}(counter_data(T) & this) {
         assert(0 == this.counter);
 …
 // This is one of many pointers keeping this alive.
 forall(dtype T | sized(T))
+forall(T & | sized(T))
 void ?{}(counter_ptr(T) & this) {
         this.data = 0p;
+}
 forall(dtype T | sized(T))
+forall(T & | sized(T))
 void ?{}(counter_ptr(T) & this, zero_t) {
         this.data = 0p;
+}
 forall(dtype T | sized(T) | { void ^?{}(T &); })
+forall(T & | sized(T) | { void ^?{}(T &); })
 static void internal_decrement(counter_ptr(T) & this) {
         if (this.data && 0 == --this.data->counter) {
 …
+}
 forall(dtype T | sized(T))
+forall(T & | sized(T))
 static void internal_copy(counter_ptr(T) & this, counter_ptr(T) & that) {
         this.data = that.data;
 …
+}
 forall(dtype T | sized(T) | { void ^?{}(T &); })
+forall(T & | sized(T))
 void ?{}(counter_ptr(T) & this, counter_ptr(T) that) {
         // `that` is a copy but it should have neither a constructor
         // nor destructor run on it so it shouldn't need adjustment.
-        internal_decrement(this);
         internal_copy(this, that);
+}
 forall(dtype T | sized(T), ttype Args | { void ?{}(T&, Args); })
+forall(T & | sized(T), Args... | { void ?{}(T&, Args); })
 void ?{}(counter_ptr(T) & this, Args args) {
+        this.data = (counter_data(T)*)new(args);
+        this.data = malloc();
+        this.data->counter = 1;
+        (this.data->object){args};
+}
 forall(dtype T | sized(T) | { void ^?{}(T &); })
+forall(T & | sized(T) | { void ^?{}(T &); })
 void ^?{}(counter_ptr(T) & this) {
         internal_decrement(this);
+}
 forall(dtype T | sized(T))
+forall(T & | sized(T))
 T & *?(counter_ptr(T) & this) {
         return *((this.data) ? &this.data->object : 0p);
+}
 forall(dtype T | sized(T) | { void ^?{}(T &); })
+forall(T & | sized(T) | { void ^?{}(T &); })
 void ?=?(counter_ptr(T) & this, counter_ptr(T) that) {
         if (this.data != that.data) {
 …
+}
 forall(dtype T | sized(T) | { void ^?{}(T &); })
+forall(T & | sized(T) | { void ^?{}(T &); })
 void ?=?(counter_ptr(T) & this, zero_t) {
         internal_decrement(this);
 …
+}
 forall(dtype T | sized(T))
+forall(T & | sized(T))
 int ?==?(counter_ptr(T) const & this, counter_ptr(T) const & that) {
         return this.data == that.data;
+}
 forall(dtype T | sized(T))
+forall(T & | sized(T))
 int ?!=?(counter_ptr(T) const & this, counter_ptr(T) const & that) {
         return !?==?(this, that);
+}
 forall(dtype T | sized(T))
+forall(T & | sized(T))
 int ?==?(counter_ptr(T) const & this, zero_t) {
         return this.data == 0;
+}
 forall(dtype T | sized(T))
+forall(T & | sized(T))
 int ?!=?(counter_ptr(T) const & this, zero_t) {
         return !?==?(this, (zero_t)0);
 …
 // This is the only pointer that keeps this alive.
 forall(dtype T)
+forall(T &)
 void ?{}(unique_ptr(T) & this) {
         this.data = 0p;
+}
 forall(dtype T)
+forall(T &)
 void ?{}(unique_ptr(T) & this, zero_t) {
         this.data = 0p;
+}
 forall(dtype T | sized(T), ttype Args | { void ?{}(T &, Args); })
+forall(T & | sized(T), Args... | { void ?{}(T &, Args); })
 void ?{}(unique_ptr(T) & this, Args args) {
+        this.data = (T *)new(args);
+        this.data = malloc();
+        (*this.data){args};
+}
 forall(dtype T | { void ^?{}(T &); })
+forall(T & | { void ^?{}(T &); })
 void ^?{}(unique_ptr(T) & this) {
         delete(this.data);
+}
 forall(dtype T)
+forall(T &)
 T & *?(unique_ptr(T) & this) {
         return *this.data;
+}
 forall(dtype T | { void ^?{}(T &); })
+forall(T & | { void ^?{}(T &); })
 void ?=?(unique_ptr(T) & this, zero_t) {
         delete(this.data);
 …
+}
 forall(dtype T | { void ^?{}(T &); })
+forall(T & | { void ^?{}(T &); })
 void move(unique_ptr(T) & this, unique_ptr(T) & that) {
         delete(this.data);
 …
+}
 forall(dtype T)
+forall(T &)
 int ?==?(unique_ptr(T) const & this, unique_ptr(T) const & that) {
         return this.data == that.data;
+}
 forall(dtype T)
+forall(T &)
 int ?!=?(unique_ptr(T) const & this, unique_ptr(T) const & that) {
         return !?==?(this, that);
+}
 forall(dtype T)
+forall(T &)
 int ?==?(unique_ptr(T) const & this, zero_t) {
         return this.data == 0;
+}
 forall(dtype T)
+forall(T &)
 int ?!=?(unique_ptr(T) const & this, zero_t) {
         return !?==?(this, (zero_t)0);

libcfa/src/memory.hfa

-              r342af53
+              r8e4aa05
 // Created On       : Tue Jun  2 16:48:00 2020
 // Last Modified By : Andrew Beach
 // Last Modified On : Tue Jun  3 12:29:00 2020
 // Update Count     : 0
+// Last Modified On : Fri Jan 29 15:52:00 2021
+// Update Count     : 1
 //
 …
 // Internal data object.
 forall(dtype T | sized(T)) {
         struct counter_data {
                 unsigned int counter;
                 T object;
         };
+forall(T & | sized(T))
+struct counter_data {
+        unsigned int counter;
+        T object;
+};
         forall(ttype Args | { void ?{}(T &, Args); })
         void ?{}(counter_data(T) & this, Args args);
+forall(T & | sized(T), Args... | { void ?{}(T &, Args); })
+void ?{}(counter_data(T) & this, Args args);
+        forall( | { void ^?{}(T &); })
+        void ^?{}(counter_data(T) & this);
+}
+forall(T & | sized(T) | { void ^?{}(T &); })
+void ^?{}(counter_data(T) & this);
 // This is one of many pointers keeping this alive.
 forall(dtype T | sized(T)) {
         struct counter_ptr {
                 counter_data(T) * data;
         };
+forall(T & | sized(T))
+struct counter_ptr {
+        counter_data(T) * data;
+};
+        void ?{}(counter_ptr(T) & this);
+        void ?{}(counter_ptr(T) & this, zero_t);
+        forall( | { void ^?{}(T &); })
+        void ?{}(counter_ptr(T) & this, counter_ptr(T) that);
+        forall(ttype Args | { void ?{}(T&, Args); })
+        void ?{}(counter_ptr(T) & this, Args args);
+forall(T & | sized(T))
+void ?{}(counter_ptr(T) & this);
+forall(T & | sized(T))
+void ?{}(counter_ptr(T) & this, zero_t);
+forall(T & | sized(T))
+void ?{}(counter_ptr(T) & this, counter_ptr(T) that);
+forall(T & | sized(T), Args... | { void ?{}(T&, Args); })
+void ?{}(counter_ptr(T) & this, Args args);
         forall( | { void ^?{}(T &); })
         void ^?{}(counter_ptr(T) & this);
+forall(T & | sized(T) | { void ^?{}(T &); })
+void ^?{}(counter_ptr(T) & this);
+        T & *?(counter_ptr(T) & this);
+forall(T & | sized(T))
+T & *?(counter_ptr(T) & this);
         forall( | { void ^?{}(T &); })
         void ?=?(counter_ptr(T) & this, counter_ptr(T) that);
         forall( | { void ^?{}(T &); })
         void ?=?(counter_ptr(T) & this, zero_t);
+forall(T & | sized(T) | { void ^?{}(T &); })
+void ?=?(counter_ptr(T) & this, counter_ptr(T) that);
+forall(T & | sized(T) | { void ^?{}(T &); })
+void ?=?(counter_ptr(T) & this, zero_t);
+        int ?==?(counter_ptr(T) const & this, counter_ptr(T) const & that);
+        int ?!=?(counter_ptr(T) const & this, counter_ptr(T) const & that);
+        int ?==?(counter_ptr(T) const & this, zero_t);
+        int ?!=?(counter_ptr(T) const & this, zero_t);
+}
+forall(T & | sized(T))
+int ?==?(counter_ptr(T) const & this, counter_ptr(T) const & that);
+forall(T & | sized(T))
+int ?!=?(counter_ptr(T) const & this, counter_ptr(T) const & that);
+forall(T & | sized(T))
+int ?==?(counter_ptr(T) const & this, zero_t);
+forall(T & | sized(T))
+int ?!=?(counter_ptr(T) const & this, zero_t);
 // This is the only pointer that keeps this alive.
 forall(dtype T) {
         struct unique_ptr {
                 T * data;
         };
+forall(T &)
+struct unique_ptr {
+        T * data;
+};
+        void ?{}(unique_ptr(T) & this);
+        void ?{}(unique_ptr(T) & this, zero_t);
+        void ?{}(unique_ptr(T) & this, unique_ptr(T) that) = void;
+        forall( | sized(T), ttype Args | { void ?{}(T &, Args); })
+        void ?{}(unique_ptr(T) & this, Args args);
+forall(T &)
+void ?{}(unique_ptr(T) & this);
+forall(T &)
+void ?{}(unique_ptr(T) & this, zero_t);
+forall(T &)
+void ?{}(unique_ptr(T) & this, unique_ptr(T) that) = void;
+forall(T & | sized(T), Args... | { void ?{}(T &, Args); })
+void ?{}(unique_ptr(T) & this, Args args);
         forall( | { void ^?{}(T &); })
         void ^?{}(unique_ptr(T) & this);
+forall(T & | { void ^?{}(T &); })
+void ^?{}(unique_ptr(T) & this);
+        T & *?(unique_ptr(T) & this);
+forall(T & )
+T & *?(unique_ptr(T) & this);
+        void ?=?(unique_ptr(T) & this, unique_ptr(T) that) = void;
+        forall( | { void ^?{}(T &); })
+        void ?=?(unique_ptr(T) & this, zero_t);
+forall(T &)
+void ?=?(unique_ptr(T) & this, unique_ptr(T) that) = void;
+forall(T & | { void ^?{}(T &); })
+void ?=?(unique_ptr(T) & this, zero_t);
         forall( | { void ^?{}(T &); })
         void move(unique_ptr(T) & this, unique_ptr(T) & that);
+forall(T & | { void ^?{}(T &); })
+void move(unique_ptr(T) & this, unique_ptr(T) & that);
+        int ?==?(unique_ptr(T) const & this, unique_ptr(T) const & that);
+        int ?!=?(unique_ptr(T) const & this, unique_ptr(T) const & that);
+        int ?==?(unique_ptr(T) const & this, zero_t);
+        int ?!=?(unique_ptr(T) const & this, zero_t);
+}
+forall(T &)
+int ?==?(unique_ptr(T) const & this, unique_ptr(T) const & that);
+forall(T &)
+int ?!=?(unique_ptr(T) const & this, unique_ptr(T) const & that);
+forall(T &)
+int ?==?(unique_ptr(T) const & this, zero_t);
+forall(T &)
+int ?!=?(unique_ptr(T) const & this, zero_t);

libcfa/src/parseargs.cfa

-              r342af53
+              r8e4aa05
 static void usage(char * cmd, cfa_option options[], size_t opt_count, const char * usage, FILE * out)  __attribute__ ((noreturn));
+//-----------------------------------------------------------------------------
+// checking
+// Sanity-check an option table; called by parse_args before the getopt tables are built.
+// Aborts the program when two distinct entries share the same short name (other than
+// '\0', which marks "no short name") or the same long name.
+//   options   : option table to validate
+//   opt_count : number of entries in options
+static void check_args(cfa_option options[], size_t opt_count) {
+        for(i; opt_count) {
+                for(j; opt_count) {
+                        if(i == j) continue;            // an entry never conflicts with itself
+                        // '\0' means the option has no short name, so it cannot collide
+                        if( options[i].short_name != '\0'
+                        && options[i].short_name == options[j].short_name)
+                                abort("Parse Args error: two options have short name '%c' (%zu & %zu)", options[i].short_name, i, j);
+                        // long_name may be null (parse_args tests for it), and strcmp on a null
+                        // pointer is undefined behaviour, so only compare names that are present
+                        if( options[i].long_name && options[j].long_name
+                        && 0 == strcmp(options[i].long_name, options[j].long_name))
+                                abort("Parse Args error: two options have long name '%s' (%zu & %zu)", options[i].long_name, i, j);
+                }
+        }
+}
+//-----------------------------------------------------------------------------
+// Parsing args
 void parse_args( cfa_option options[], size_t opt_count, const char * usage, char ** & left ) {
         if( 0p != &cfa_args_argc ) {
 …
+}
-//-----------------------------------------------------------------------------
-// getopt_long wrapping
 void parse_args(
         int argc,
 …
         char ** & left
 ) {
+        check_args(options, opt_count);
+        int maxv = 'h';
+        // Short-option string (presumably handed to getopt_long - the call is not visible here).
+        // Each option contributes at most two characters (its short name plus an optional ':'),
+        // followed by the built-in 'h' and the terminating '\0'; hence 2 * opt_count + 2 bytes.
+        // The former size of opt_count * 3 overflowed for opt_count of 0 or 1.
+        char optstring[opt_count * 2 + 2] = { '\0' };
+        {
+                int idx = 0;
+                for(i; opt_count) {
+                        // options without a short name do not appear in the short-option string
+                        if (options[i].short_name) {
+                                // track the largest short name so long-only options can be given
+                                // unique values above it (see ++maxv below)
+                                maxv = max(options[i].short_name, maxv);
+                                optstring[idx] = options[i].short_name;
+                                idx++;
+                                // every parser except the two flag setters takes an argument => ':'
+                                if(    ((intptr_t)options[i].parse) != ((intptr_t)parse_settrue)
+                                && ((intptr_t)options[i].parse) != ((intptr_t)parse_setfalse) ) {
+                                        optstring[idx] = ':';
+                                        idx++;
+                                }
+                        }
+                }
+                // built-in help option, then terminator
+                optstring[idx+0] = 'h';
+                optstring[idx+1] = '\0';
+        }
         struct option optarr[opt_count + 2];
+        {
 …
                 for(i; opt_count) {
                         if(options[i].long_name) {
+                                options[i].val = (options[i].short_name != '\0') ? ((int)options[i].short_name) : ++maxv;
                                 optarr[idx].name = options[i].long_name;
                                 optarr[idx].flag = 0p;
                                 optarr[idx].val  = options[i].short_name;
+                                optarr[idx].val  = options[i].val;
                                 if(    ((intptr_t)options[i].parse) == ((intptr_t)parse_settrue)
                                     || ((intptr_t)options[i].parse) == ((intptr_t)parse_setfalse) ) {
 …
                 optarr[idx+0].[name, has_arg, flag, val] = ["help", no_argument, 0, 'h'];
                 optarr[idx+1].[name, has_arg, flag, val] = [0, no_argument, 0, 0];
+        }
-        char optstring[opt_count * 3] = { '\0' };
+        {
-                int idx = 0;
-                for(i; opt_count) {
-                        optstring[idx] = options[i].short_name;
-                        idx++;
-                        if(    ((intptr_t)options[i].parse) != ((intptr_t)parse_settrue)
-                            && ((intptr_t)options[i].parse) != ((intptr_t)parse_setfalse) ) {
-                                optstring[idx] = ':';
-                                idx++;
+                        }
+                }
-                optstring[idx+0] = 'h';
-                optstring[idx+1] = '\0';
+        }
 …
                         default:
                                 for(i; opt_count) {
                                         if(opt == options[i].short_name) {
+                                        if(opt == options[i].val) {
                                                 const char * arg = optarg ? optarg : "";
                                                 if( arg[0] == '=' ) { arg++; }
 …
         if(hwidth <= 0) hwidth = max;
+        fprintf(out, "  -%c, --%-*s   %.*s\n", sn, width, ln, hwidth, help);
+        char sname[4] = { ' ', ' ', ' ', '\0' };
+        if(sn != '\0') {
+                sname[0] = '-';
+                sname[1] = sn;
+                sname[2] = ',';
+        }
+        fprintf(out, "  %s --%-*s   %.*s\n", sname, width, ln, hwidth, help);
         for() {
                 help += min(strlen(help), hwidth);

libcfa/src/parseargs.hfa

-              r342af53
+              r8e4aa05
 struct cfa_option {
+      int val; // reserved
       char short_name;
       const char * long_name;
 …
 static inline void ?{}( cfa_option & this ) {}
 forall(dtype T | { bool parse(const char *, T & ); })
+forall(T & | { bool parse(const char *, T & ); })
 static inline void ?{}( cfa_option & this, char short_name, const char * long_name, const char * help, T & variable ) {
+      this.val        = 0;
       this.short_name = short_name;
       this.long_name  = long_name;
 …
+}
 forall(dtype T)
+forall(T &)
 static inline void ?{}( cfa_option & this, char short_name, const char * long_name, const char * help, T & variable, bool (*parse)(const char *, T & )) {
+      this.val        = 0;
       this.short_name = short_name;
       this.long_name  = long_name;

libcfa/src/rational.cfa

-              r342af53
+              r8e4aa05
 #include "stdlib.hfa"
 forall( otype RationalImpl | arithmetic( RationalImpl ) ) {
+forall( RationalImpl | arithmetic( RationalImpl ) ) {
         // helper routines
 …
         // I/O
         forall( dtype istype | istream( istype ) | { istype & ?|?( istype &, RationalImpl & ); } )
+        forall( istype & | istream( istype ) | { istype & ?|?( istype &, RationalImpl & ); } )
         istype & ?|?( istype & is, Rational(RationalImpl) & r ) {
                 is | r.numerator | r.denominator;
 …
         } // ?|?
         forall( dtype ostype | ostream( ostype ) | { ostype & ?|?( ostype &, RationalImpl ); } ) {
+        forall( ostype & | ostream( ostype ) | { ostype & ?|?( ostype &, RationalImpl ); } ) {
                 ostype & ?|?( ostype & os, Rational(RationalImpl) r ) {
                         return os | r.numerator | '/' | r.denominator;
 …
 } // distribution
 forall( otype RationalImpl | arithmetic( RationalImpl ) | { RationalImpl ?\?( RationalImpl, unsigned long ); } )
+forall( RationalImpl | arithmetic( RationalImpl ) | { RationalImpl ?\?( RationalImpl, unsigned long ); } )
 Rational(RationalImpl) ?\?( Rational(RationalImpl) x, long int y ) {
         if ( y < 0 ) {
 …
 // conversion
 forall( otype RationalImpl | arithmetic( RationalImpl ) | { double convert( RationalImpl ); } )
+forall( RationalImpl | arithmetic( RationalImpl ) | { double convert( RationalImpl ); } )
 double widen( Rational(RationalImpl) r ) {
         return convert( r.numerator ) / convert( r.denominator );
 } // widen
 forall( otype RationalImpl | arithmetic( RationalImpl ) | { double convert( RationalImpl ); RationalImpl convert( double ); } )
+forall( RationalImpl | arithmetic( RationalImpl ) | { double convert( RationalImpl ); RationalImpl convert( double ); } )
 Rational(RationalImpl) narrow( double f, RationalImpl md ) {
         // http://www.ics.uci.edu/~eppstein/numth/frap.c

libcfa/src/rational.hfa

-              r342af53
+              r8e4aa05
 #include "iostream.hfa"
 trait scalar( otype T ) {
+trait scalar( T ) {
 };
 trait arithmetic( otype T | scalar( T ) ) {
+trait arithmetic( T | scalar( T ) ) {
         int !?( T );
         int ?==?( T, T );
 …
 // implementation
 forall( otype RationalImpl | arithmetic( RationalImpl ) ) {
+forall( RationalImpl | arithmetic( RationalImpl ) ) {
         struct Rational {
                 RationalImpl numerator, denominator;                    // invariant: denominator > 0
 …
         // I/O
         forall( dtype istype | istream( istype ) | { istype & ?|?( istype &, RationalImpl & ); } )
+        forall( istype & | istream( istype ) | { istype & ?|?( istype &, RationalImpl & ); } )
         istype & ?|?( istype &, Rational(RationalImpl) & );
         forall( dtype ostype | ostream( ostype ) | { ostype & ?|?( ostype &, RationalImpl ); } ) {
+        forall( ostype & | ostream( ostype ) | { ostype & ?|?( ostype &, RationalImpl ); } ) {
                 ostype & ?|?( ostype &, Rational(RationalImpl) );
                 void ?|?( ostype &, Rational(RationalImpl) );
 …
 } // distribution
 forall( otype RationalImpl | arithmetic( RationalImpl ) |{RationalImpl ?\?( RationalImpl, unsigned long );} )
+forall( RationalImpl | arithmetic( RationalImpl ) |{RationalImpl ?\?( RationalImpl, unsigned long );} )
 Rational(RationalImpl) ?\?( Rational(RationalImpl) x, long int y );
 // conversion
 forall( otype RationalImpl | arithmetic( RationalImpl ) | { double convert( RationalImpl ); } )
+forall( RationalImpl | arithmetic( RationalImpl ) | { double convert( RationalImpl ); } )
 double widen( Rational(RationalImpl) r );
 forall( otype RationalImpl | arithmetic( RationalImpl ) | { double convert( RationalImpl );  RationalImpl convert( double );} )
+forall( RationalImpl | arithmetic( RationalImpl ) | { double convert( RationalImpl );  RationalImpl convert( double );} )
 Rational(RationalImpl) narrow( double f, RationalImpl md );

libcfa/src/stdlib.cfa

-              r342af53
+              r8e4aa05
 // Cforall allocation/deallocation and constructor/destructor, array types
 forall( dtype T | sized(T), ttype TT | { void ?{}( T &, TT ); } )
+forall( T & | sized(T), TT... | { void ?{}( T &, TT ); } )
 T * anew( size_t dim, TT p ) {
         T * arr = alloc( dim );
 …
 } // anew
 forall( dtype T | sized(T) | { void ^?{}( T & ); } )
+forall( T & | sized(T) | { void ^?{}( T & ); } )
 void adelete( T arr[] ) {
         if ( arr ) {                                                                            // ignore null
 …
 } // adelete
 forall( dtype T | sized(T) | { void ^?{}( T & ); }, ttype TT | { void adelete( TT ); } )
+forall( T & | sized(T) | { void ^?{}( T & ); }, TT... | { void adelete( TT ); } )
 void adelete( T arr[], TT rest ) {
         if ( arr ) {                                                                            // ignore null
 …
 //---------------------------------------
 forall( otype E | { int ?<?( E, E ); } ) {
+forall( E | { int ?<?( E, E ); } ) {
         E * bsearch( E key, const E * vals, size_t dim ) {
                 int cmp( const void * t1, const void * t2 ) {
 …
 forall( otype K, otype E | { int ?<?( K, K ); K getKey( const E & ); } ) {
+forall( K, E | { int ?<?( K, K ); K getKey( const E & ); } ) {
         E * bsearch( K key, const E * vals, size_t dim ) {
                 int cmp( const void * t1, const void * t2 ) {

libcfa/src/stdlib.hfa

-              r342af53
+              r8e4aa05
 // Created On       : Thu Jan 28 17:12:35 2016
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Sat Dec 12 13:52:34 2020
 // Update Count     : 536
+// Last Modified On : Thu Jan 21 22:02:13 2021
+// Update Count     : 574
 //
 …
         else return (T *)alignment( _Alignof(T), dim, sizeof(T) )
 static inline forall( dtype T | sized(T) ) {
+static inline forall( T & | sized(T) ) {
         // CFA safe equivalents, i.e., implicit size specification
 …
 . Replace the current forall-block that contains definitions of S_fill and S_realloc with the following:
                 forall( dtype T | sized(T) ) {
+                forall( T & | sized(T) ) {
                         union  U_fill           { char c; T * a; T t; };
                         struct S_fill           { char tag; U_fill(T) fill; };
 …
 typedef struct S_resize                 { inline void *;  }     T_resize;
 forall( dtype T ) {
+forall( T & ) {
         struct S_fill           { char tag; char c; size_t size; T * at; char t[50]; };
         struct S_realloc        { inline T *; };
 …
 static inline T_resize  ?`resize  ( void * a )  { return (T_resize){a}; }
 static inline forall( dtype T | sized(T) ) {
+static inline forall( T & | sized(T) ) {
         S_fill(T) ?`fill ( T t ) {
                 S_fill(T) ret = { 't' };
                 size_t size = sizeof(T);
+                if(size > sizeof(ret.t)) { printf("ERROR: const object of size greater than 50 bytes given for dynamic memory fill\n"); exit(1); }
+                if ( size > sizeof(ret.t) ) {
+                        abort( "ERROR: const object of size greater than 50 bytes given for dynamic memory fill\n" );
+                } // if
                 memcpy( &ret.t, &t, size );
                 return ret;
 …
         S_realloc(T)    ?`realloc ( T * a )                             { return (S_realloc(T)){a}; }
         T * $alloc_internal( void * Resize, T * Realloc, size_t Align, size_t Dim, S_fill(T) Fill) {
+        T * $alloc_internal( void * Resize, T * Realloc, size_t Align, size_t Dim, S_fill(T) Fill ) {
                 T * ptr = NULL;
                 size_t size = sizeof(T);
 …
                         ptr = (T*) (void *) resize( (void *)Resize, Align, Dim * size );
                 } else if ( Realloc ) {
                         if (Fill.tag != '0') copy_end = min(malloc_size( Realloc ), Dim * size);
                         ptr = (T*) (void *) realloc( (void *)Realloc, Align, Dim * size );
+                        if ( Fill.tag != '0' ) copy_end = min(malloc_size( Realloc ), Dim * size );
+                        ptr = (T *) (void *) realloc( (void *)Realloc, Align, Dim * size );
                 } else {
                         ptr = (T*) (void *) memalign( Align, Dim * size );
+                }
                 if(Fill.tag == 'c') {
+                        ptr = (T *) (void *) memalign( Align, Dim * size );
+                }
+                if ( Fill.tag == 'c' ) {
                         memset( (char *)ptr + copy_end, (int)Fill.c, Dim * size - copy_end );
                 } else if(Fill.tag == 't') {
+                } else if ( Fill.tag == 't' ) {
                         for ( int i = copy_end; i < Dim * size; i += size ) {
+                                #pragma GCC diagnostic push
+                                #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+                                assert( size <= sizeof(Fill.t) );
                                 memcpy( (char *)ptr + i, &Fill.t, size );
+                                #pragma GCC diagnostic pop
+                        }
                 } else if(Fill.tag == 'a') {
+                } else if ( Fill.tag == 'a' ) {
                         memcpy( (char *)ptr + copy_end, Fill.at, min(Dim * size - copy_end, Fill.size) );
+                } else if(Fill.tag == 'T') {
+                        for ( int i = copy_end; i < Dim * size; i += size ) {
+                                memcpy( (char *)ptr + i, Fill.at, size );
+                        }
+                } else if ( Fill.tag == 'T' ) {
+                        memcpy( (char *)ptr + copy_end, Fill.at, Dim * size );
+                }
 …
         } // $alloc_internal
         forall( ttype TT | { T * $alloc_internal( void *, T *, size_t, size_t, S_fill(T), TT ); } ) {
+        forall( TT... | { T * $alloc_internal( void *, T *, size_t, size_t, S_fill(T), TT ); } ) {
                 T * $alloc_internal( void *       , T * Realloc, size_t Align, size_t Dim, S_fill(T) Fill, T_resize Resize, TT rest) {
 …
 } // distribution T
 static inline forall( dtype T | sized(T) ) {
+static inline forall( T & | sized(T) ) {
         // CFA safe initialization/copy, i.e., implicit size specification, non-array types
         T * memset( T * dest, char fill ) {
 …
 // CFA deallocation for multiple objects
 static inline forall( dtype T )                                                 // FIX ME, problems with 0p in list
+static inline forall( T & )                                                     // FIX ME, problems with 0p in list
 void free( T * ptr ) {
         free( (void *)ptr );                                                            // C free
 } // free
 static inline forall( dtype T, ttype TT | { void free( TT ); } )
+static inline forall( T &, TT... | { void free( TT ); } )
 void free( T * ptr, TT rest ) {
         free( ptr );
 …
 // CFA allocation/deallocation and constructor/destructor, non-array types
 static inline forall( dtype T | sized(T), ttype TT | { void ?{}( T &, TT ); } )
+static inline forall( T & | sized(T), TT... | { void ?{}( T &, TT ); } )
 T * new( TT p ) {
         return &(*(T *)malloc()){ p };                                                  // run constructor
+        return &(*(T *)malloc()){ p };                                          // run constructor
 } // new
 static inline forall( dtype T | { void ^?{}( T & ); } )
+static inline forall( T & | { void ^?{}( T & ); } )
 void delete( T * ptr ) {
         // special case for 0-sized object => always call destructor
 …
         free( ptr );                                                                            // always call free
 } // delete
 static inline forall( dtype T, ttype TT | { void ^?{}( T & ); void delete( TT ); } )
+static inline forall( T &, TT... | { void ^?{}( T & ); void delete( TT ); } )
 void delete( T * ptr, TT rest ) {
         delete( ptr );
 …
 // CFA allocation/deallocation and constructor/destructor, array types
 forall( dtype T | sized(T), ttype TT | { void ?{}( T &, TT ); } ) T * anew( size_t dim, TT p );
 forall( dtype T | sized(T) | { void ^?{}( T & ); } ) void adelete( T arr[] );
 forall( dtype T | sized(T) | { void ^?{}( T & ); }, ttype TT | { void adelete( TT ); } ) void adelete( T arr[], TT rest );
+forall( T & | sized(T), TT... | { void ?{}( T &, TT ); } ) T * anew( size_t dim, TT p );
+forall( T & | sized(T) | { void ^?{}( T & ); } ) void adelete( T arr[] );
+forall( T & | sized(T) | { void ^?{}( T & ); }, TT... | { void adelete( TT ); } ) void adelete( T arr[], TT rest );
 //---------------------------------------
 …
 //---------------------------------------
 forall( otype E | { int ?<?( E, E ); } ) {
+forall( E | { int ?<?( E, E ); } ) {
         E * bsearch( E key, const E * vals, size_t dim );
         size_t bsearch( E key, const E * vals, size_t dim );
 …
 } // distribution
 forall( otype K, otype E | { int ?<?( K, K ); K getKey( const E & ); } ) {
+forall( K, E | { int ?<?( K, K ); K getKey( const E & ); } ) {
         E * bsearch( K key, const E * vals, size_t dim );
         size_t bsearch( K key, const E * vals, size_t dim );
 …
 } // distribution
 forall( otype E | { int ?<?( E, E ); } ) {
+forall( E | { int ?<?( E, E ); } ) {
         void qsort( E * vals, size_t dim );
 } // distribution

libcfa/src/time.cfa

-              r342af53
+              r8e4aa05
 forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
         ostype & ?|?( ostype & os, Duration dur ) with( dur ) {
                 (ostype &)(os | tn / TIMEGRAN);                                 // print seconds
 …
 } // strftime
 forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
         ostype & ?|?( ostype & os, Time time ) with( time ) {
                 char buf[32];                                                                   // at least 26

libcfa/src/vec/vec.hfa

-              r342af53
+              r8e4aa05
 #include <math.hfa>
 trait fromint(otype T) {
+trait fromint(T) {
     void ?{}(T&, int);
 };
 trait zeroinit(otype T) {
+trait zeroinit(T) {
     void ?{}(T&, zero_t);
 };
 trait zero_assign(otype T) {
+trait zero_assign(T) {
     T ?=?(T&, zero_t);
 };
 trait subtract(otype T) {
+trait subtract(T) {
     T ?-?(T, T);
 };
 trait negate(otype T) {
+trait negate(T) {
     T -?(T);
 };
 trait add(otype T) {
+trait add(T) {
     T ?+?(T, T);
 };
 trait multiply(otype T) {
+trait multiply(T) {
     T ?*?(T, T);
 };
 trait divide(otype T) {
+trait divide(T) {
     T ?/?(T, T);
 };
 trait lessthan(otype T) {
+trait lessthan(T) {
     int ?<?(T, T);
 };
 trait equality(otype T) {
+trait equality(T) {
     int ?==?(T, T);
 };
 trait sqrt(otype T) {
+trait sqrt(T) {
     T sqrt(T);
 };
 …
+}
 trait dottable(otype V, otype T) {
+trait dottable(V, T) {
     T dot(V, V);
 };
 …
 static inline {
 forall(otype T | sqrt(T), otype V | dottable(V, T))
+forall(T | sqrt(T), V | dottable(V, T))
 T length(V v) {
    return sqrt(dot(v, v));
+}
 forall(otype T, otype V | dottable(V, T))
+forall(T, V | dottable(V, T))
 T length_squared(V v) {
    return dot(v, v);
+}
 forall(otype T, otype V | { T length(V); } | subtract(V))
+forall(T, V | { T length(V); } | subtract(V))
 T distance(V v1, V v2) {
     return length(v1 - v2);
+}
 forall(otype T, otype V | { T length(V); V ?/?(V, T); })
+forall(T, V | { T length(V); V ?/?(V, T); })
 V normalize(V v) {
     return v / length(v);
 …
 // Project vector u onto vector v
 forall(otype T, otype V | dottable(V, T) | { V normalize(V); V ?*?(V, T); })
+forall(T, V | dottable(V, T) | { V normalize(V); V ?*?(V, T); })
 V project(V u, V v) {
     V v_norm = normalize(v);
 …
 // Reflect incident vector v with respect to surface with normal n
 forall(otype T | fromint(T), otype V | { V project(V, V); V ?*?(T, V); V ?-?(V,V); })
+forall(T | fromint(T), V | { V project(V, V); V ?*?(T, V); V ?-?(V,V); })
 V reflect(V v, V n) {
     return v - (T){2} * project(v, n);
 …
 // entering material (i.e., from air to water, eta = 1/1.33)
 // v and n must already be normalized
 forall(otype T | fromint(T) | subtract(T) | multiply(T) | add(T) | lessthan(T) | sqrt(T),
        otype V | dottable(V, T) | { V ?*?(T, V); V ?-?(V,V); void ?{}(V&, zero_t); })
+forall(T | fromint(T) | subtract(T) | multiply(T) | add(T) | lessthan(T) | sqrt(T),
+       V | dottable(V, T) | { V ?*?(T, V); V ?-?(V,V); void ?{}(V&, zero_t); })
 V refract(V v, V n, T eta) {
     T dotValue = dot(n, v);
 …
 // i is the incident vector
 // ng is the geometric normal of the surface
 forall(otype T | lessthan(T) | zeroinit(T), otype V | dottable(V, T) | negate(V))
+forall(T | lessthan(T) | zeroinit(T), V | dottable(V, T) | negate(V))
 V faceforward(V n, V i, V ng) {
     return dot(ng, i) < (T){0} ? n : -n;

libcfa/src/vec/vec2.hfa

-              r342af53
+              r8e4aa05
 #include "vec.hfa"
 forall (otype T) {
+forall (T) {
     struct vec2 {
         T x, y;
 …
+}
 forall (otype T) {
+forall (T) {
     static inline {
 …
+}
 forall(dtype ostype, otype T | writeable(T, ostype)) {
+forall(ostype &, T | writeable(T, ostype)) {
     ostype & ?|?(ostype & os, vec2(T) v) with (v) {
         return os | '<' | x | ',' | y | '>';

libcfa/src/vec/vec3.hfa

-              r342af53
+              r8e4aa05
 #include "vec.hfa"
 forall (otype T) {
+forall (T) {
     struct vec3 {
         T x, y, z;
 …
+}
 forall (otype T) {
+forall (T) {
     static inline {
 …
+}
 forall(dtype ostype, otype T | writeable(T, ostype)) {
+forall(ostype &, T | writeable(T, ostype)) {
     ostype & ?|?(ostype & os, vec3(T) v) with (v) {
         return os | '<' | x | ',' | y | ',' | z | '>';

libcfa/src/vec/vec4.hfa

-              r342af53
+              r8e4aa05
 #include "vec.hfa"
 forall (otype T) {
+forall (T) {
     struct vec4 {
         T x, y, z, w;
 …
+}
 forall (otype T) {
+forall (T) {
     static inline {
 …
+}
 forall(dtype ostype, otype T | writeable(T, ostype)) {
+forall(ostype &, T | writeable(T, ostype)) {
     ostype & ?|?(ostype & os, vec4(T) v) with (v) {
         return os | '<' | x | ',' | y | ',' | z | ',' | w | '>';

src/Parser/lex.ll

-              r342af53
+              r8e4aa05
  * Created On       : Sat Sep 22 08:58:10 2001
  * Last Modified By : Peter A. Buhr
  * Last Modified On : Tue Oct  6 18:15:41 2020
  * Update Count     : 743
+ * Last Modified On : Wed Feb 17 08:38:13 2021
+ * Update Count     : 752
  */
 …
 break                   { KEYWORD_RETURN(BREAK); }
 case                    { KEYWORD_RETURN(CASE); }
 catch                   { KEYWORD_RETURN(CATCH); }                              // CFA
 catchResume             { KEYWORD_RETURN(CATCHRESUME); }                // CFA
+catch                   { QKEYWORD_RETURN(CATCH); }                             // CFA
+catchResume             { QKEYWORD_RETURN(CATCHRESUME); }               // CFA
 char                    { KEYWORD_RETURN(CHAR); }
 choose                  { KEYWORD_RETURN(CHOOSE); }                             // CFA
 …
 fallthrough             { KEYWORD_RETURN(FALLTHROUGH); }                // CFA
 fallthru                { KEYWORD_RETURN(FALLTHRU); }                   // CFA
+finally                 { KEYWORD_RETURN(FINALLY); }                    // CFA
+finally                 { QKEYWORD_RETURN(FINALLY); }                   // CFA
+fixup                   { QKEYWORD_RETURN(FIXUP); }                             // CFA
 float                   { KEYWORD_RETURN(FLOAT); }
 __float80               { KEYWORD_RETURN(uuFLOAT80); }                  // GCC
 …
 or                              { QKEYWORD_RETURN(WOR); }                               // CFA
 otype                   { KEYWORD_RETURN(OTYPE); }                              // CFA
+recover                 { QKEYWORD_RETURN(RECOVER); }                   // CFA
 register                { KEYWORD_RETURN(REGISTER); }
+report                  { KEYWORD_RETURN(THROWRESUME); }                // CFA
 restrict                { KEYWORD_RETURN(RESTRICT); }                   // C99
 __restrict              { KEYWORD_RETURN(RESTRICT); }                   // GCC
 …
 __volatile              { KEYWORD_RETURN(VOLATILE); }                   // GCC
 __volatile__    { KEYWORD_RETURN(VOLATILE); }                   // GCC
 waitfor                 { KEYWORD_RETURN(WAITFOR); }
 when                    { KEYWORD_RETURN(WHEN); }
+waitfor                 { KEYWORD_RETURN(WAITFOR); }                    // CFA
+when                    { KEYWORD_RETURN(WHEN); }                               // CFA
 while                   { KEYWORD_RETURN(WHILE); }
 with                    { KEYWORD_RETURN(WITH); }                               // CFA

src/Parser/parser.yy

-              r342af53
+              r8e4aa05
 // Created On       : Sat Sep  1 20:22:55 2001
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Mon Jan 11 21:32:10 2021
 // Update Count     : 4633
+// Last Modified On : Wed Feb 17 09:03:07 2021
+// Update Count     : 4722
 //
 …
 %{
 #define YYDEBUG_LEXER_TEXT (yylval)                                             // lexer loads this up each time
+#define YYDEBUG_LEXER_TEXT( yylval )                                    // lexer loads this up each time
 #define YYDEBUG 1                                                                               // get the pretty debugging code to compile
 #define YYERROR_VERBOSE                                                                 // more information in syntax errors
 …
 extern TypedefTable typedefTable;
 stack< LinkageSpec::Spec > linkageStack;
+stack<LinkageSpec::Spec> linkageStack;
 bool appendStr( string & to, string & from ) {
 …
         ConstantExpr * constant = dynamic_cast<ConstantExpr *>(type->expr.get());
         if ( constant && (constant->get_constant()->get_value() == "0" || constant->get_constant()->get_value() == "1") ) {
         type = new ExpressionNode( new CastExpr( maybeMoveBuild< Expression >(type), new BasicType( Type::Qualifiers(), BasicType::SignedInt ) ) );
+                type = new ExpressionNode( new CastExpr( maybeMoveBuild<Expression>(type), new BasicType( Type::Qualifiers(), BasicType::SignedInt ) ) );
         } // if
         return new ForCtrl(
 …
 %token ATTRIBUTE EXTENSION                                                              // GCC
 %token IF ELSE SWITCH CASE DEFAULT DO WHILE FOR BREAK CONTINUE GOTO RETURN
 %token CHOOSE DISABLE ENABLE FALLTHRU FALLTHROUGH TRY CATCH CATCHRESUME FINALLY THROW THROWRESUME AT WITH WHEN WAITFOR // CFA
+%token CHOOSE DISABLE ENABLE FALLTHRU FALLTHROUGH TRY THROW THROWRESUME AT WITH WHEN WAITFOR // CFA
 %token ASM                                                                                              // C99, extension ISO/IEC 9899:1999 Section J.5.10(1)
 %token ALIGNAS ALIGNOF GENERIC STATICASSERT                             // C11
 // names and constants: lexer differentiates between identifier and typedef names
 %token<tok> IDENTIFIER                  QUOTED_IDENTIFIER               TYPEDEFname                             TYPEGENname
 %token<tok> TIMEOUT                             WOR
 %token<tok> INTEGERconstant             CHARACTERconstant               STRINGliteral
+%token<tok> IDENTIFIER          QUOTED_IDENTIFIER       TYPEDEFname             TYPEGENname
+%token<tok> TIMEOUT                     WOR                                     CATCH                   RECOVER                 CATCHRESUME             FIXUP           FINALLY         // CFA
+%token<tok> INTEGERconstant     CHARACTERconstant       STRINGliteral
 %token<tok> DIRECTIVE
 // Floating point constant is broken into three kinds of tokens because of the ambiguity with tuple indexing and
 …
 %type<decl> type_qualifier type_qualifier_name forall type_qualifier_list_opt type_qualifier_list
 %type<decl> type_specifier type_specifier_nobody
+%type<decl> type_specifier type_specifier_nobody enum_specifier_nobody
 %type<decl> variable_declarator variable_ptr variable_array variable_function
 %type<decl> variable_abstract_declarator variable_abstract_ptr variable_abstract_array variable_abstract_function
 %type<decl> attribute_list_opt attribute_list attribute_name_list attribute attribute_name
+%type<decl> attribute_list_opt attribute_list attribute_opt attribute attribute_name_list attribute_name
 // initializers
 …
 // Order of these lines matters (low-to-high precedence). THEN is left associative over WOR/TIMEOUT/ELSE, WOR is left
 // associative over TIMEOUT/ELSE, and TIMEOUT is left associative over ELSE.
+%precedence THEN        // rule precedence for IF/WAITFOR statement
+%precedence WOR         // token precedence for start of WOR in WAITFOR statement
+%precedence TIMEOUT     // token precedence for start of TIMEOUT in WAITFOR statement
+%precedence ELSE        // token precedence for start of else clause in IF/WAITFOR statement
+%precedence THEN                // rule precedence for IF/WAITFOR statement
+%precedence WOR                 // token precedence for start of WOR in WAITFOR statement
+%precedence TIMEOUT             // token precedence for start of TIMEOUT in WAITFOR statement
+%precedence CATCH               // token precedence for start of CATCH handler clause in TRY statement
+%precedence RECOVER             // token precedence for start of RECOVER handler clause in TRY statement
+%precedence CATCHRESUME // token precedence for start of CATCHRESUME handler clause in TRY statement
+%precedence FIXUP               // token precedence for start of FIXUP handler clause in TRY statement
+%precedence FINALLY             // token precedence for start of FINALLY clause in TRY statement
+%precedence ELSE                // token precedence for start of else clause in IF/WAITFOR statement
 // Handle shift/reduce conflict for generic type by shifting the '(' token. For example, this string is ambiguous:
 …
         TIMEOUT
         | WOR
+        | CATCH
+        | RECOVER
+        | CATCHRESUME
+        | FIXUP
+        | FINALLY
+        ;
 …
                 { $$ = $2; }
         | '(' compound_statement ')'                                            // GCC, lambda expression
                 { $$ = new ExpressionNode( new StmtExpr( dynamic_cast< CompoundStmt * >(maybeMoveBuild< Statement >($2) ) ) ); }
+                { $$ = new ExpressionNode( new StmtExpr( dynamic_cast<CompoundStmt *>(maybeMoveBuild<Statement>($2) ) ) ); }
         | type_name '.' identifier                                                      // CFA, nested type
                 { SemanticError( yylloc, "Qualified name is currently unimplemented." ); $$ = nullptr; }
 …
+                {
                         // create a GenericExpr wrapper with one association pair
                         $$ = new GenericExpr( nullptr, { { maybeMoveBuildType($1), maybeMoveBuild<Expression>($3) } } );
+                        $$ = new GenericExpr( nullptr, { { maybeMoveBuildType($1), maybeMoveBuild<Expression>( $3 ) } } );
+                }
         | DEFAULT ':' assignment_expression
                 { $$ = new GenericExpr( nullptr, { { maybeMoveBuild<Expression>($3) } } ); }
+                { $$ = new GenericExpr( nullptr, { { maybeMoveBuild<Expression>( $3 ) } } ); }
+        ;
 postfix_expression:
         primary_expression
+        | postfix_expression '[' assignment_expression ',' comma_expression ']'
+                // { $$ = new ExpressionNode( build_binary_val( OperKinds::Index, $1, new ExpressionNode( build_binary_val( OperKinds::Index, $3, $5 ) ) ) ); }
+                { SemanticError( yylloc, "New array subscript is currently unimplemented." ); $$ = nullptr; }
         | postfix_expression '[' assignment_expression ']'
                 // CFA, comma_expression disallowed in this context because it results in a common user error: subscripting a
 …
                         switch ( $1 ) {
                           case OperKinds::AddressOf:
                                 $$ = new ExpressionNode( new AddressExpr( maybeMoveBuild< Expression >( $2 ) ) );
+                                $$ = new ExpressionNode( new AddressExpr( maybeMoveBuild<Expression>( $2 ) ) );
                                 break;
                           case OperKinds::PointTo:
 …
                                 break;
                           case OperKinds::And:
                                 $$ = new ExpressionNode( new AddressExpr( new AddressExpr( maybeMoveBuild< Expression >( $2 ) ) ) );
+                                $$ = new ExpressionNode( new AddressExpr( new AddressExpr( maybeMoveBuild<Expression>( $2 ) ) ) );
                                 break;
                           default:
 …
                 { $$ = new ExpressionNode( build_unary_ptr( OperKinds::Decr, $2 ) ); }
         | SIZEOF unary_expression
                 { $$ = new ExpressionNode( new SizeofExpr( maybeMoveBuild< Expression >( $2 ) ) ); }
+                { $$ = new ExpressionNode( new SizeofExpr( maybeMoveBuild<Expression>( $2 ) ) ); }
         | SIZEOF '(' type_no_function ')'
                 { $$ = new ExpressionNode( new SizeofExpr( maybeMoveBuildType( $3 ) ) ); }
         | ALIGNOF unary_expression                                                      // GCC, variable alignment
                 { $$ = new ExpressionNode( new AlignofExpr( maybeMoveBuild< Expression >( $2 ) ) ); }
+                { $$ = new ExpressionNode( new AlignofExpr( maybeMoveBuild<Expression>( $2 ) ) ); }
         | ALIGNOF '(' type_no_function ')'                                      // GCC, type alignment
                 { $$ = new ExpressionNode( new AlignofExpr( maybeMoveBuildType( $3 ) ) ); }
 …
                 { $$ = new ExpressionNode( build_keyword_cast( $2, $5 ) ); }
         | '(' VIRTUAL ')' cast_expression                                       // CFA
                 { $$ = new ExpressionNode( new VirtualCastExpr( maybeMoveBuild< Expression >( $4 ), maybeMoveBuildType( nullptr ) ) ); }
+                { $$ = new ExpressionNode( new VirtualCastExpr( maybeMoveBuild<Expression>( $4 ), maybeMoveBuildType( nullptr ) ) ); }
         | '(' VIRTUAL type_no_function ')' cast_expression      // CFA
                 { $$ = new ExpressionNode( new VirtualCastExpr( maybeMoveBuild< Expression >( $5 ), maybeMoveBuildType( $3 ) ) ); }
+                { $$ = new ExpressionNode( new VirtualCastExpr( maybeMoveBuild<Expression>( $5 ), maybeMoveBuildType( $3 ) ) ); }
         | '(' RETURN type_no_function ')' cast_expression       // CFA
                 { SemanticError( yylloc, "Return cast is currently unimplemented." ); $$ = nullptr; }
 …
         assignment_expression
         | comma_expression ',' assignment_expression
                 { $$ = new ExpressionNode( new CommaExpr( maybeMoveBuild< Expression >( $1 ), maybeMoveBuild< Expression >( $3 ) ) ); }
+                { $$ = new ExpressionNode( new CommaExpr( maybeMoveBuild<Expression>( $1 ), maybeMoveBuild<Expression>( $3 ) ) ); }
+        ;
 …
         constant_expression                                                     { $$ = $1; }
         | constant_expression ELLIPSIS constant_expression      // GCC, subrange
                 { $$ = new ExpressionNode( new RangeExpr( maybeMoveBuild< Expression >( $1 ), maybeMoveBuild< Expression >( $3 ) ) ); }
+                { $$ = new ExpressionNode( new RangeExpr( maybeMoveBuild<Expression>( $1 ), maybeMoveBuild<Expression>( $3 ) ) ); }
         | subrange                                                                                      // CFA, subrange
+        ;
 …
                 { $$ = new StatementNode( build_computedgoto( $3 ) ); }
                 // A semantic check is required to ensure fallthru appears only in the body of a choose statement.
     | fall_through_name ';'                                                             // CFA
+        | fall_through_name ';'                                                         // CFA
                 { $$ = new StatementNode( build_branch( BranchStmt::FallThrough ) ); }
     | fall_through_name identifier_or_type_name ';'             // CFA
+        | fall_through_name identifier_or_type_name ';'         // CFA
                 { $$ = new StatementNode( build_branch( $2, BranchStmt::FallThrough ) ); }
         | fall_through_name DEFAULT ';'                                         // CFA
 …
 exception_statement:
         TRY compound_statement handler_clause
+        TRY compound_statement handler_clause                                   %prec THEN
                 { $$ = new StatementNode( build_try( $2, $3, 0 ) ); }
         | TRY compound_statement finally_clause
 …
 handler_key:
         CATCH                                                                           { $$ = CatchStmt::Terminate; }
+        | RECOVER                                                                       { $$ = CatchStmt::Terminate; }
         | CATCHRESUME                                                           { $$ = CatchStmt::Resume; }
+        | FIXUP                                                                         { $$ = CatchStmt::Resume; }
+        ;
 …
 asm_operand:                                                                                    // GCC
         string_literal '(' constant_expression ')'
                 { $$ = new ExpressionNode( new AsmExpr( nullptr, $1, maybeMoveBuild< Expression >( $3 ) ) ); }
+                { $$ = new ExpressionNode( new AsmExpr( nullptr, $1, maybeMoveBuild<Expression>( $3 ) ) ); }
         | '[' IDENTIFIER ']' string_literal '(' constant_expression ')'
                 { $$ = new ExpressionNode( new AsmExpr( $2, $4, maybeMoveBuild< Expression >( $6 ) ) ); }
+                { $$ = new ExpressionNode( new AsmExpr( $2, $4, maybeMoveBuild<Expression>( $6 ) ) ); }
+        ;
 …
         | sue_type_specifier_nobody
         | type_type_specifier
+        ;
+enum_specifier_nobody:                                                                  // type specifier - {...}
+                // Preclude SUE declarations in restricted scopes (see type_specifier_nobody)
+        basic_type_specifier
+        | sue_type_specifier_nobody
+        ;
 …
+        ;
-fred:
-        // empty
-                { yyy = false; }
+        ;
 aggregate_type:                                                                                 // struct, union
         aggregate_key attribute_list_opt
 …
           '{' field_declaration_list_opt '}' type_parameters_opt
                 { $$ = DeclarationNode::newAggregate( $1, nullptr, $7, $5, true )->addQualifiers( $2 ); }
         | aggregate_key attribute_list_opt identifier fred
+        | aggregate_key attribute_list_opt identifier
+                {
                         typedefTable.makeTypedef( *$3, forall || typedefTable.getEnclForall() ? TYPEGENname : TYPEDEFname ); // create typedef
 …
+                }
           '{' field_declaration_list_opt '}' type_parameters_opt
                 { $$ = DeclarationNode::newAggregate( $1, $3, $9, $7, true )->addQualifiers( $2 ); }
         | aggregate_key attribute_list_opt type_name fred
+                { $$ = DeclarationNode::newAggregate( $1, $3, $8, $6, true )->addQualifiers( $2 ); }
+        | aggregate_key attribute_list_opt type_name
+                {
                         // for type_name can be a qualified type name S.T, in which case only the last name in the chain needs a typedef (other names in the chain should already have one)
 …
+                }
           '{' field_declaration_list_opt '}' type_parameters_opt
                 { $$ = DeclarationNode::newAggregate( $1, $3->type->symbolic.name, $9, $7, true )->addQualifiers( $2 ); }
+                { $$ = DeclarationNode::newAggregate( $1, $3->type->symbolic.name, $8, $6, true )->addQualifiers( $2 ); }
         | aggregate_type_nobody
+        ;
 …
 aggregate_type_nobody:                                                                  // struct, union - {...}
         aggregate_key attribute_list_opt identifier fred
+        aggregate_key attribute_list_opt identifier
+                {
                         typedefTable.makeTypedef( *$3, forall || typedefTable.getEnclForall() ? TYPEGENname : TYPEDEFname );
 …
                         $$ = DeclarationNode::newAggregate( $1, $3, nullptr, nullptr, false )->addQualifiers( $2 );
+                }
         | aggregate_key attribute_list_opt type_name fred
+        | aggregate_key attribute_list_opt type_name
+                {
                         forall = false;                                                         // reset
 …
+        ;
+// Cannot use attribute_list_opt because of ambiguity with enum_specifier_nobody, which already parses attribute.
+// Hence, only a single attribute is allowed after the "ENUM".
 enum_type:                                                                                              // enum
         ENUM attribute_list_opt '{' enumerator_list comma_opt '}'
+        ENUM attribute_opt '{' enumerator_list comma_opt '}'
                 { $$ = DeclarationNode::newEnum( nullptr, $4, true )->addQualifiers( $2 ); }
         | ENUM attribute_list_opt identifier
+        | ENUM attribute_opt identifier
                 { typedefTable.makeTypedef( *$3 ); }
           '{' enumerator_list comma_opt '}'
                 { $$ = DeclarationNode::newEnum( $3, $6, true )->addQualifiers( $2 ); }
         | ENUM attribute_list_opt type_name
+        | ENUM attribute_opt typedef                                            // enum cannot be generic
           '{' enumerator_list comma_opt '}'
+                { $$ = DeclarationNode::newEnum( $3->type->symbolic.name, $5, true )->addQualifiers( $2 ); }
+                { $$ = DeclarationNode::newEnum( $3->name, $5, true )->addQualifiers( $2 ); }
+        | ENUM enum_specifier_nobody '{' enumerator_list comma_opt '}'
+                // { $$ = DeclarationNode::newEnum( nullptr, $4, true ); }
+                { SemanticError( yylloc, "Typed enumeration is currently unimplemented." ); $$ = nullptr; }
+        | ENUM enum_specifier_nobody declarator '{' enumerator_list comma_opt '}'
+                // {
+                //      typedefTable.makeTypedef( *$3->name );
+                //      $$ = DeclarationNode::newEnum( nullptr, $5, true );
+                // }
+                { SemanticError( yylloc, "Typed enumeration is currently unimplemented." ); $$ = nullptr; }
         | enum_type_nobody
+        ;
 enum_type_nobody:                                                                               // enum - {...}
         ENUM attribute_list_opt identifier
+        ENUM attribute_opt identifier
+                {
                         typedefTable.makeTypedef( *$3 );
                         $$ = DeclarationNode::newEnum( $3, 0, false )->addQualifiers( $2 );
+                }
         | ENUM attribute_list_opt type_name
+        | ENUM attribute_opt type_name                                          // enum cannot be generic
+                {
                         typedefTable.makeTypedef( *$3->type->symbolic.name );
 …
         // empty
                 { $$ = nullptr; }
+        | '=' constant_expression
+                { $$ = $2; }
+        // | '=' constant_expression
+        //      { $$ = $2; }
+        | '=' initializer
+                { $$ = $2->get_expression(); }                                  // FIX ME: enum only deals with constant_expression
+        ;
 …
                 { $$ = $3; }
         | '[' push constant_expression ELLIPSIS constant_expression pop ']' // GCC, multiple array elements
                 { $$ = new ExpressionNode( new RangeExpr( maybeMoveBuild< Expression >( $3 ), maybeMoveBuild< Expression >( $5 ) ) ); }
+                { $$ = new ExpressionNode( new RangeExpr( maybeMoveBuild<Expression>( $3 ), maybeMoveBuild<Expression>( $5 ) ) ); }
         | '.' '[' push field_name_list pop ']'                          // CFA, tuple field selector
                 { $$ = $4; }
 …
 type_parameter:                                                                                 // CFA
         type_class identifier_or_type_name
+                { typedefTable.addToScope( *$2, TYPEDEFname, "9" ); }
+                {
+                        typedefTable.addToScope( *$2, TYPEDEFname, "9" );
+                        if ( $1 == TypeDecl::Otype ) { SemanticError( yylloc, "otype keyword is deprecated, use T " ); }
+                        if ( $1 == TypeDecl::Dtype ) { SemanticError( yylloc, "dtype keyword is deprecated, use T &" ); }
+                        if ( $1 == TypeDecl::Ttype ) { SemanticError( yylloc, "ttype keyword is deprecated, use T ..." ); }
+                }
           type_initializer_opt assertion_list_opt
                 { $$ = DeclarationNode::newTypeParam( $1, $2 )->addTypeInitializer( $4 )->addAssertions( $5 ); }
 …
 subrange:
         constant_expression '~' constant_expression                     // CFA, integer subrange
                 { $$ = new ExpressionNode( new RangeExpr( maybeMoveBuild< Expression >( $1 ), maybeMoveBuild< Expression >( $3 ) ) ); }
+                { $$ = new ExpressionNode( new RangeExpr( maybeMoveBuild<Expression>( $1 ), maybeMoveBuild<Expression>( $3 ) ) ); }
+        ;
 …
         | attribute_list attribute
                 { $$ = $2->addQualifiers( $1 ); }
+        ;
+attribute_opt:
+        // empty
+                { $$ = nullptr; }
+        | attribute
+        ;
 …
         | '[' ']' multi_array_dimension
                 { $$ = DeclarationNode::newArray( 0, 0, false )->addArray( $3 ); }
+        | '[' push assignment_expression pop ',' comma_expression ']'
+                { $$ = DeclarationNode::newArray( $3, 0, false )->addArray( DeclarationNode::newArray( $6, 0, false ) ); }
+                // { SemanticError( yylloc, "New array dimension is currently unimplemented." ); $$ = nullptr; }
         | multi_array_dimension
+        ;

src/ResolvExpr/PolyCost.cc

-              r342af53
+              r8e4aa05
                 PassVisitor<PolyCost> coster( env, indexer );
                 type->accept( coster );
                 return coster.pass.result;
+                return (coster.pass.result > 0) ? 1 : 0;
+        }
 …
         ast::Pass<PolyCost_new> costing( symtab, env );
         type->accept( costing );
         return costing.core.result;
+        return (costing.core.result > 0) ? 1 : 0;
+}

src/ResolvExpr/SpecCost.cc

-              r342af53
+              r8e4aa05
                 // mark specialization of base type
                 void postvisit(ReferenceType*) { if ( count >= 0 ) ++count; }
+                void postvisit(StructInstType*) { if ( count >= 0 ) ++count; }
+                void postvisit(UnionInstType*) { if ( count >= 0 ) ++count; }
         private:
 …
                 void previsit(StructInstType* sty) {
                         count = minover( sty->parameters );
-                        visit_children = false;
+                }
 …
                 void previsit(UnionInstType* uty) {
                         count = minover( uty->parameters );
-                        visit_children = false;
+                }
 …
                 void postvisit( const ast::ArrayType * ) { if ( count >= 0 ) ++count; }
                 void postvisit( const ast::ReferenceType * ) { if ( count >= 0 ) ++count; }
+                void postvisit( const ast::StructInstType * ) { if ( count >= 0 ) ++count; }
+                void postvisit( const ast::UnionInstType * ) { if ( count >= 0 ) ++count; }
                 // Use the minimal specialization value over returns and params.
 …
                 void previsit( const ast::StructInstType * sty ) {
                         count = minimumPresent( sty->params, expr_result );
-                        visit_children = false;
+                }
 …
                 void previsit( const ast::UnionInstType * uty ) {
                         count = minimumPresent( uty->params, expr_result );
-                        visit_children = false;
+                }

src/main.cc

-              r342af53
+              r8e4aa05
 // Created On       : Fri May 15 23:12:02 2015
 // Last Modified By : Andrew Beach
 // Last Modified On : Mon Dec  7 15:29:00 2020
 // Update Count     : 639
+// Last Modified On : Fri Feb 19 14:59:00 2021
+// Update Count     : 643
 //
 …
 static void parse( FILE * input, LinkageSpec::Spec linkage, bool shouldExit = false );
 static void dump( list< Declaration * > & translationUnit, ostream & out = cout );
+static void dump( ast::TranslationUnit && transUnit, ostream & out = cout );
 static void backtrace( int start ) {                                    // skip first N stack frames
 …
                         PASS( "Resolve", ResolvExpr::resolve( transUnit ) );
                         if ( exprp ) {
+                                translationUnit = convert( move( transUnit ) );
+                                dump( translationUnit );
+                                dump( move( transUnit ) );
                                 return EXIT_SUCCESS;
                         } // if
 …
 static const char * description[] = {
         "diagnostic color: never, always, or auto.",            // -c
+        "diagnostic color: never, always, auto",                        // -c
         "wait for gdb to attach",                                                       // -g
         "print help message",                                                           // -h
+        "print translator help message",                                        // -h
         "generate libcfa.c",                                                            // -l
         "generate line marks",                                                          // -L
 …
         "do not generate line marks",                                           // -N
         "do not read prelude",                                                          // -n
         "generate prototypes for prelude functions",            // -p
+        "do not generate prelude prototypes => prelude not printed", // -p
         "only print deterministic output",                  // -d
         "Use the old-ast",                                                                      // -O
 …
         "print",                                                                                        // -P
         "<directory> prelude directory for debug/nodebug",      // no flag
         "<option-list> enable profiling information:\n          counters,heap,time,all,none", // -S
+        "<option-list> enable profiling information: counters, heap, time, all, none", // -S
         "building cfa standard lib",                                            // -t
         "",                                                                                                     // -w
 …
 } // dump
+static void dump( ast::TranslationUnit && transUnit, ostream & out ) {
+        std::list< Declaration * > translationUnit = convert( move( transUnit ) );
+        dump( translationUnit, out );
+}
 // Local Variables: //
 // tab-width: 4 //

tests/.expect/attributes.nast.x64.txt

-              r342af53
+              r8e4aa05
+}
 struct __attribute__ ((unused)) __anonymous0 {
+struct __anonymous0 {
 };
 static inline void _X12_constructorFv_S12__anonymous0_autogen___1(struct __anonymous0 *_X4_dstS12__anonymous0_1);
 …
     return _X4_retS12__anonymous0_1;
+}
+__attribute__ ((unused)) struct __anonymous0 _X5DummyS12__anonymous0_1;
 struct __attribute__ ((unused)) Agn1;
 struct __attribute__ ((unused)) Agn2 {

tests/.expect/attributes.nast.x86.txt

-              r342af53
+              r8e4aa05
+}
 struct __attribute__ ((unused)) __anonymous0 {
+struct __anonymous0 {
 };
 static inline void _X12_constructorFv_S12__anonymous0_autogen___1(struct __anonymous0 *_X4_dstS12__anonymous0_1);
 …
     return _X4_retS12__anonymous0_1;
+}
+__attribute__ ((unused)) struct __anonymous0 _X5DummyS12__anonymous0_1;
 struct __attribute__ ((unused)) Agn1;
 struct __attribute__ ((unused)) Agn2 {

tests/.expect/attributes.oast.x64.txt

-              r342af53
+              r8e4aa05
+}
 struct __attribute__ ((unused)) __anonymous0 {
+struct __anonymous0 {
 };
 static inline void _X12_constructorFv_S12__anonymous0_autogen___1(struct __anonymous0 *_X4_dstS12__anonymous0_1);
 …
     return _X4_retS12__anonymous0_1;
+}
+__attribute__ ((unused)) struct __anonymous0 _X5DummyS12__anonymous0_1;
 struct __attribute__ ((unused)) Agn1;
 struct __attribute__ ((unused)) Agn2 {

tests/.expect/attributes.oast.x86.txt

-              r342af53
+              r8e4aa05
+}
 struct __attribute__ ((unused)) __anonymous0 {
+struct __anonymous0 {
 };
 static inline void _X12_constructorFv_S12__anonymous0_autogen___1(struct __anonymous0 *_X4_dstS12__anonymous0_1);
 …
     return _X4_retS12__anonymous0_1;
+}
+__attribute__ ((unused)) struct __anonymous0 _X5DummyS12__anonymous0_1;
 struct __attribute__ ((unused)) Agn1;
 struct __attribute__ ((unused)) Agn2 {

tests/Makefile.am

-              r342af53
+              r8e4aa05
 ## Created On       : Sun May 31 09:08:15 2015
 ## Last Modified By : Peter A. Buhr
 ## Last Modified On : Fri Oct  9 23:13:07 2020
 ## Update Count     : 86
+## Last Modified On : Tue Mar  2 21:39:01 2021
+## Update Count     : 90
 ###############################################################################
 …
         -Wall \
         -Wno-unused-function \
+        -quiet @CFA_FLAGS@ \
+        -DIN_DIR="${abs_srcdir}/.in/"
+        -quiet @CFA_FLAGS@
 AM_CFAFLAGS = -XCFA --deterministic-out
 …
         long_tests.hfa \
         .in/io.data \
+        io/.in/io.data \
         avltree/avl.h \
         avltree/avl-private.h \
 …
 # don't use distcc to do the linking because distcc doesn't do linking
 % : %.cfa $(CFACCBIN)
         $(CFACOMPILETEST) -c -o $(abspath ${@}).o
+        $(CFACOMPILETEST) -c -o $(abspath ${@}).o -DIN_DIR="$(abspath $(dir ${<}))/.in/"
         $(CFACCLINK) ${@}.o -o $(abspath ${@})
         rm $(abspath ${@}).o
 …
 SYNTAX_ONLY_CODE = expression typedefRedef variableDeclarator switch numericConstants identFuncDeclarator forall \
         init1 limits nested-types stdincludes cast labelledExit array builtins/sync warnings/self-assignment
+        init1 limits nested-types stdincludes cast labelledExit array quasiKeyword include/includes builtins/sync warnings/self-assignment
 $(SYNTAX_ONLY_CODE): % : %.cfa $(CFACCBIN)
         $(CFACOMPILE_SYNTAX)

tests/alloc2.cfa

r342af53	r8e4aa05
16	16	bool passed = (malloc_size(ip) == size) && (malloc_usable_size(ip) >= size) && (malloc_alignment(ip) == align) && ((uintptr_t)ip % align == 0);
17	17	if (!passed) {
18		printf("failed test %3d: %4~~lu %4lu but got %4lu ( %3lu ) %4l~~u\n", tests_total, size, align, malloc_size(ip), malloc_usable_size(ip), malloc_alignment(ip));
	18	printf("failed test %3d: %4zu %4zu but got %4zu ( %3zu ) %4zu\n", tests_total, size, align, malloc_size(ip), malloc_usable_size(ip), malloc_alignment(ip));
19	19	tests_failed += 1;
20	20	}

tests/attributes.cfa

-              r342af53
+              r8e4aa05
 // Created On       : Mon Feb  6 16:07:02 2017
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Nov  6 17:51:12 2018
 // Update Count     : 17
+// Last Modified On : Mon Jan 25 21:26:41 2021
+// Update Count     : 20
 //
 …
 // aggregate_name
 struct __attribute__(( unused )) {};
+struct __attribute__(( unused )) {} Dummy;
 struct __attribute__(( unused )) Agn1;
 struct __attribute__(( unused )) Agn2 {};

tests/avltree/avl-private.cfa

-              r342af53
+              r8e4aa05
 // an AVL tree's height is easy to compute
 // just follow path with the larger balance
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int height(tree(K, V) * t){
   int helper(tree(K, V) * t, int ht){
 …
+}
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int calcBalance(tree(K, V) * t){
   int l = height(t->left);
 …
 // re-establish the link between parent and child
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void relinkToParent(tree(K, V) * t){
   tree(K, V) * parent = t->parent; // FIX ME!!
 …
 // rotate left from t
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * rotateLeft(tree(K, V) * t){
   tree(K, V) * newRoot = t->right;
 …
 // rotate right from t
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * rotateRight(tree(K, V) * t){
   tree(K, V) * newRoot = t->left;
 …
 // balances a node that has balance factor -2 or 2
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * fix(tree(K, V) * t){
   // ensure that t's balance factor is one of
 …
 // attempt to fix the tree, if necessary
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * tryFix(tree(K, V) * t){
   int b = calcBalance(t);
 …
 // sets parent field of c to be p
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void setParent(tree(K, V) * c, tree(K, V) * p){
   if (! empty(c)){

tests/avltree/avl-private.h

-              r342af53
+              r8e4aa05
 // attempt to fix the tree, if necessary
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * tryFix(tree(K, V) * t);
 // sets parent field of c to be p
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void setParent(tree(K, V) * c, tree(K, V) * p);
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int height(tree(K, V) * t);

tests/avltree/avl.h

-              r342af53
+              r8e4aa05
 // #include <lib.h>
 trait Comparable(otype T) {
+trait Comparable(T) {
   int ?<?(T, T);
 };
 forall(otype T | Comparable(T))
+forall(T | Comparable(T))
 int ?==?(T t1, T t2);
 forall(otype T | Comparable(T))
+forall(T | Comparable(T))
 int ?>?(T t1, T t2);
 …
 // temporary: need forward decl to get around typedef problem
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 struct tree;
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 struct tree {
   K key;
 …
 };
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void ?{}(tree(K, V) &t, K key, V value);
 forall(otype K, otype V)
+forall(K | Comparable(K), V)
 void ^?{}(tree(K, V) & t);
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * create(K key, V value);
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 V * find(tree(K, V) * t, K key);
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int empty(tree(K, V) * t);
 // returns the root of the tree
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int insert(tree(K, V) ** t, K key, V value);
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int remove(tree(K, V) ** t, K key);
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void copy(tree(K, V) * src, tree(K, V) ** ret);
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void for_each(tree(K, V) * t, void (*func)(V));

tests/avltree/avl0.cfa

-              r342af53
+              r8e4aa05
 #include "avl.h"
 forall(otype T | Comparable(T))
+forall(T | Comparable(T))
 int ?==?(T t1, T t2) {
   return !(t1 < t2) && !(t2 < t1);
+}
 forall(otype T | Comparable(T))
+forall(T | Comparable(T))
 int ?>?(T t1, T t2) {
   return t2 < t1;

tests/avltree/avl1.cfa

-              r342af53
+              r8e4aa05
 #include <stdlib.hfa>
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void ?{}(tree(K, V) &t, K key, V value){
   (t.key) { key };
 …
+}
 forall(otype K, otype V)
+forall(K| Comparable(K), V)
 void ^?{}(tree(K, V) & t){
   delete(t.left);
 …
+}
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * create(K key, V value) {
   // infinite loop trying to resolve ... t = malloc();

tests/avltree/avl2.cfa

-              r342af53
+              r8e4aa05
 #include "avl-private.h"
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 V * find(tree(K, V) * t, K key){
   if (empty(t)){
 …
+}
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int empty(tree(K, V) * t){
   return t == NULL;
 …
 // returns the root of the tree
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int insert(tree(K, V) ** t, K key, V value) {
   // handles a non-empty tree

tests/avltree/avl3.cfa

-              r342af53
+              r8e4aa05
 // swaps the data within two tree nodes
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void node_swap(tree(K, V) * t, tree(K, V) * t2){
         swap( t->key,  t2->key);
 …
 // go left as deep as possible from within the right subtree
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * find_successor(tree(K, V) * t){
         tree(K, V) * find_successor_helper(tree(K, V) * t){
 …
 // cleanup - don't want to deep delete, so set children to NULL first.
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void deleteSingleNode(tree(K, V) * t) {
         t->left = NULL;
 …
 // does the actual remove operation once we've found the node in question
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * remove_node(tree(K, V) * t){
         // is the node a leaf?
 …
 // finds the node that needs to be removed
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * remove_helper(tree(K, V) * t, K key, int * worked){
         if (empty(t)){
 …
+}
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int remove(tree(K, V) ** t, K key){
         int worked = 0;

tests/avltree/avl4.cfa

-              r342af53
+              r8e4aa05
 // Perform a shallow copy of src, return the
 // new tree in ret
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int copy(tree(K, V) * src, tree(K, V) ** ret){
   tree(K, V) * helper(tree(K, V) * t, int * worked){
 …
 // Apply func to every value element in t, using an in order traversal
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void for_each(tree(K, V) * t, int (*func)(V)) {
   if (t == NULL) {

tests/bugs/10.cfa

r342af53	r8e4aa05
2	2	// https://cforall.uwaterloo.ca/trac/ticket/10
3	3
4		forall(~~otype~~ T)
	4	forall(T)
5	5	struct result {
6	6	union {

tests/bugs/104.cfa

r342af53	r8e4aa05
4	4	[ float, float ] modf_( float x );
5	5
6		forall(~~otype~~ T \| { [T, T] modf_(T); })
	6	forall(T \| { [T, T] modf_(T); })
7	7	void modf(T);
8	8

tests/bugs/194.cfa

-              r342af53
+              r8e4aa05
 // https://cforall.uwaterloo.ca/trac/ticket/194
 forall( dtype T | sized(T) ) T * foo( void ) {
+forall( T & | sized(T) ) T * foo( void ) {
       printf( "foo1\n" );
         return (T *)0;
+}
 forall( dtype T | sized(T) ) T & foo( void ) {
+forall( T & | sized(T) ) T & foo( void ) {
         printf( "foo2\n" );
         return (T &)*(T *)0;

tests/bugs/196.cfa

-              r342af53
+              r8e4aa05
 // https://cforall.uwaterloo.ca/trac/ticket/196
 forall(dtype T)
+forall(T &)
 struct link;
 forall(dtype T)
+forall(T &)
 struct link {
         link(T) * next;
 …
 // -----
 forall(dtype T)
+forall(T &)
 struct foo;
 forall(dtype U)
+forall(U &)
 struct bar {
         foo(U) * data;
 };
 forall(dtype T)
+forall(T &)
 struct foo {};

tests/bugs/203-2.cfa

-              r342af53
+              r8e4aa05
 // Trac ticket: https://cforall.uwaterloo.ca/trac/ticket/203
 forall(dtype A)
+forall(A &)
 struct empty {
         // Nothing.
 };
 forall(dtype C)
+forall(C &)
 struct wrap_e {
         empty(C) field;

tests/bugs/203-7.cfa

-              r342af53
+              r8e4aa05
 // Trac ticket: https://cforall.uwaterloo.ca/trac/ticket/203
 forall(dtype A)
+forall(A &)
 struct empty {
         // Nothing.
 };
 forall(dtype C)
+forall(C &)
 struct wrap_e {
         empty(C) field;

tests/bugs/203-9.cfa

-              r342af53
+              r8e4aa05
 // Trac ticket: https://cforall.uwaterloo.ca/trac/ticket/203
 forall(dtype A)
+forall(A &)
 struct empty {
         // Nothing.
 };
 forall(dtype C)
+forall(C &)
 struct wrap_e {
         empty(C) field;

tests/bugs/7.cfa

-              r342af53
+              r8e4aa05
 // (Bug 1 unresolved as of this test.)
 forall(otype T)
+forall(T)
 struct stack_node;
 forall(otype T)
+forall(T)
 struct stack_node {
     stack_node(T) * next;
 …
 };
 forall(otype T)
+forall(T)
 struct stack {
     stack_node(T) * head;
 };
 trait stack_errors(otype T) {
+trait stack_errors(T) {
     T emptyStackHandler (stack(T) * this);
 };
 forall(otype T | stack_errors(T))
+forall(T | stack_errors(T))
 T pop (stack(T) * this) {
     return (T){};

tests/castError.cfa

r342af53	r8e4aa05
14	14	//
15	15
16		forall(~~otype~~ T) struct S { T p; };
	16	forall(T) struct S { T p; };
17	17	int f;
18	18	S(int) sint;

tests/concurrent/examples/boundedBufferEXT.cfa

r342af53	r8e4aa05
24	24	enum { BufferSize = 50 };
25	25
26		forall( ~~otype~~ T ) {
	26	forall( T ) {
27	27	monitor Buffer {
28	28	int front, back, count;

tests/concurrent/examples/boundedBufferINT.cfa

r342af53	r8e4aa05
24	24	enum { BufferSize = 50 };
25	25
26		forall( ~~otype~~ T ) {
	26	forall( T ) {
27	27	monitor Buffer {
28	28	condition full, empty;

tests/concurrent/examples/quickSort.generic.cfa

r342af53	r8e4aa05
21	21	#include <string.h> // strcmp
22	22
23		forall( ~~otype~~ T \| { int ?<?( T, T ); } ) {
	23	forall( T \| { int ?<?( T, T ); } ) {
24	24	thread Quicksort {
25	25	T * values; // communication variables

tests/concurrent/multi-monitor.cfa

r342af53	r8e4aa05
38	38	}
39	39
40		forall(~~dtype T~~ \| sized(T) \| { void ^?{}(T & mutex); })
	40	forall(T & \| sized(T) \| { void ^?{}(T & mutex); })
41	41	void delete_mutex(T * x) {
42	42	^(*x){};

tests/concurrent/thread.cfa

r342af53	r8e4aa05
1	1	#include <fstream.hfa>
2	2	#include <kernel.hfa>
	3	#include <locks.hfa>
3	4	#include <stdlib.hfa>
4	5	#include <thread.hfa>

tests/errors/completeType.cfa

-              r342af53
+              r8e4aa05
 void foo(int *) {}
 void bar(void *) {}
 forall(otype T) void baz(T *);
 forall(dtype T) void qux(T *);
 forall(dtype T | sized(T)) void quux(T *);
+forall(T) void baz(T *);
+forall(T &) void qux(T *);
+forall(T & | sized(T)) void quux(T *);
 struct A;       // incomplete
 …
 forall(otype T)
+forall(T)
 void baz(T * x) {
         // okay
 …
+}
 forall(dtype T)
+forall(T &)
 void qux(T * y) {
         // okay
 …
+}
 forall(dtype T | sized(T))
+forall(T & | sized(T))
 void quux(T * z) {
         // okay

tests/exceptions/defaults.cfa

r342af53	r8e4aa05
55	55
56	56	void unhandled_test(void) {
57		forall(~~dtype T, dtype V~~ \| is_exception(T, V))
	57	forall(T &, V & \| is_exception(T, V))
58	58	void defaultTerminationHandler(T &) {
59	59	throw (unhandled_exception){};

tests/exceptions/polymorphic.cfa

-              r342af53
+              r8e4aa05
 #include <exception.hfa>
 FORALL_TRIVIAL_EXCEPTION(proxy, (otype T), (T));
 FORALL_TRIVIAL_INSTANCE(proxy, (otype U), (U))
+FORALL_TRIVIAL_EXCEPTION(proxy, (T), (T));
+FORALL_TRIVIAL_INSTANCE(proxy, (U), (U))
 const char * msg(proxy(int) * this) { return "proxy(int)"; }
 …
+}
 FORALL_DATA_EXCEPTION(cell, (otype T), (T))(
+FORALL_DATA_EXCEPTION(cell, (T), (T))(
         T data;
 );
 FORALL_DATA_INSTANCE(cell, (otype T), (T))
+FORALL_DATA_INSTANCE(cell, (T), (T))
 const char * msg(cell(int) * this) { return "cell(int)"; }

tests/exceptions/virtual-poly.cfa

-              r342af53
+              r8e4aa05
 };
 forall(otype T)
+forall(T)
 struct mono_child_vtable {
         mono_base_vtable const * const parent;
 };
 forall(otype T)
+forall(T)
 struct mono_child {
         mono_child_vtable(T) const * virtual_table;
 …
+}
 forall(otype U)
+forall(U)
 struct poly_base_vtable {
         poly_base_vtable(U) const * const parent;
 };
 forall(otype U)
+forall(U)
 struct poly_base {
         poly_base_vtable(U) const * virtual_table;
 };
 forall(otype V)
+forall(V)
 struct poly_child_vtable {
         poly_base_vtable(V) const * const parent;
 };
 forall(otype V)
+forall(V)
 struct poly_child {
         poly_child_vtable(V) const * virtual_table;

tests/forall.cfa

-              r342af53
+              r8e4aa05
 void g1() {
         forall( otype T ) T f( T ) {};
+        forall( T ) T f( T ) {};
         void f( int ) {};
         void h( void (*p)(void) ) {};
 …
 void g2() {
         forall( otype T ) void f( T, T ) {}
         forall( otype T, otype U ) void f( T, U ) {}
+        forall( T ) void f( T, T ) {}
+        forall( T, U ) void f( T, U ) {}
         int x;
 …
+}
 typedef forall ( otype T ) int (* f)( int );
 forall( otype T )
+typedef forall ( T ) int (* f)( int );
+forall( T )
 void swap( T left, T right ) {
         T temp = left;
 …
+}
 trait sumable( otype T ) {
+trait sumable( T ) {
         void ?{}( T &, zero_t );                                                        // 0 literal constructor
         T ?+?( T, T );                                                                          // assortment of additions
 …
 }; // sumable
 forall( otype T | sumable( T ) )                                                // use trait
+forall( T | sumable( T ) )                                              // use trait
 T sum( size_t size, T a[] ) {
         T total = 0;                                                                            // initialize by 0 constructor
 …
 } // sum
 forall( otype T | { T ?+?( T, T ); T ?++( T & ); [T] ?+=?( T &,T ); } )
+forall( T | { T ?+?( T, T ); T ?++( T & ); [T] ?+=?( T &,T ); } )
 T twice( T t ) {
         return t + t;
+}
 forall( otype T | { int ?<?(T, T); } )
+forall( T | { int ?<?(T, T); } )
 T min( T t1, T t2 ) {
         return t1 < t2 ? t1 : t2;
 …
 // Multiple forall
 forall( otype T ) forall( otype S ) struct { int i; };
 forall( otype T ) struct { int i; } forall( otype S );
 struct { int i; } forall( otype T ) forall( otype S );
 forall( otype W ) struct { int i; } forall( otype T ) forall( otype S );
+forall( T ) forall( S ) struct { int i; };
+forall( T ) struct { int i; } forall( S );
+struct { int i; } forall( T ) forall( S );
+forall( W ) struct { int i; } forall( T ) forall( S );
 // Distribution
 struct P { int i; };
 forall( otype T ) struct Q { T i; };
 forall( otype T ) struct { int i; };
+forall( T ) struct Q { T i; };
+forall( T ) struct { int i; };
 struct KK { int i; };
 inline static {
         void RT1() {}
+}
 forall( otype T ) {
+forall( T ) {
         T RT2( T ) {
                 typedef int TD1;
                 struct S1 { T t; };
+        }
         forall( otype X ) {
+        forall( X ) {
                 typedef int TD2;
                 struct S2 {};
 …
+        }
         extern "C" {
                 forall( otype W ) {
+                forall( W ) {
                         W RT3( W ) {}
                         struct S3 {};
 …
+        }
         void RT4() {
                 forall( otype W ) struct S4 {};
+                forall( W ) struct S4 {};
                 typedef int TD3;
+        }
 …
 static inline {
         forall( otype T ) {
+        forall( T ) {
                 int RT6( T p );
+        }
         forall( otype T, otype U ) {
+        forall( T, U ) {
                 int RT7( T, U );
+        }
+}
 static forall( otype T ) {
+static forall( T ) {
         int RT8( T );
+}
 forall( otype T ) inline static {
+forall( T ) inline static {
         int RT9( T ) { T t; return 3; }
+}
 forall( otype T | { T ?+?( T, T ); } ) {
         forall( otype S | { T ?+?( T, S ); } ) {
                 forall( otype W ) T bar( T t, S s ) { return t + s; }
                 forall( otype W | { W ?+?( T, W ); } ) W baz( T t, S s, W w ) { return t + s + w; }
+forall( T | { T ?+?( T, T ); } ) {
+        forall( S | { T ?+?( T, S ); } ) {
+                forall( W ) T bar( T t, S s ) { return t + s; }
+                forall( W | { W ?+?( T, W ); } ) W baz( T t, S s, W w ) { return t + s + w; }
                 struct W { T t; } (int,int) ww;
                 struct P pp;
 …
+}
 forall( otype T | { T ?+?( T, T ); } ) forall( otype S | { T ?+?( T, S ); } )
+forall( T | { T ?+?( T, T ); } ) forall( S | { T ?+?( T, S ); } )
 struct XW { T t; };
 XW(int,int) xww;
 forall( otype T ) struct S { T t; } (int) x, y, z;
 forall( otype T ) struct { T t; } (int) a, b, c;
 forall( otype T ) static forall( otype S ) {
     forall( otype X ) struct U {
+forall( T ) struct S { T t; } (int) x, y, z;
+forall( T ) struct { T t; } (int) a, b, c;
+forall( T ) static forall( S ) {
+    forall( X ) struct U {
                 T x;
     };
+}
 forall( otype T ) {
+forall( T ) {
         extern "C" {
                 struct SS { T t; };

tests/function-operator.cfa

-              r342af53
+              r8e4aa05
 // STL-like Algorithms
 trait Assignable(dtype T, dtype U) { T ?=?(T &, U); };
 trait Copyable(dtype T) { void ?{}(T &, T); };
 trait Destructable(dtype T) { void ^?{}(T &); };
+trait Assignable(T &, U &) { T ?=?(T &, U); };
+trait Copyable(T &) { void ?{}(T &, T); };
+trait Destructable(T &) { void ^?{}(T &); };
 trait Iterator(dtype iter | sized(iter) | Copyable(iter) | Destructable(iter), otype T) {
+trait Iterator(iter & | sized(iter) | Copyable(iter) | Destructable(iter), T) {
         T & *?(iter);
         iter ++?(iter &);
 …
 };
 forall(otype Tin, dtype Input | Iterator(Input, Tin), otype Tout, dtype Output | Iterator(Output, Tout) | Assignable(Tout, Tin))
+forall(Tin, Input & | Iterator(Input, Tin), Tout, Output & | Iterator(Output, Tout) | Assignable(Tout, Tin))
 Output copy(Input first, Input last, Output result) {
         while (first != last) {
 …
 // test ?()(T *, ...) -- ?() with function call-by-pointer
 forall(otype Tin, dtype Input | Iterator(Input, Tin), otype Tout, dtype Output | Iterator(Output, Tout), otype FuncRet, dtype Func | { FuncRet ?()(Func *, Tin); } | Assignable(Tout, FuncRet))
+forall(Tin, Input & | Iterator(Input, Tin), Tout, Output & | Iterator(Output, Tout), FuncRet, Func & | { FuncRet ?()(Func *, Tin); } | Assignable(Tout, FuncRet))
 Output transform (Input first, Input last, Output result, Func * op) {
         while (first != last) {
 …
 // test ?()(T, ...) -- ?() with function call-by-value
 forall(dtype Iter, otype T | Iterator(Iter, T), otype Pred | { int ?()(Pred, T); })
+forall(Iter &, T | Iterator(Iter, T), Pred | { int ?()(Pred, T); })
 Iter find_if (Iter first, Iter last, Pred pred) {
         while (first != last) {
 …
 // test ?()(T, ...) -- ?() with function call-by-reference
 forall(otype Generator, otype GenRet | { GenRet ?()(Generator &); }, dtype Iter, otype T | Iterator(Iter, T) | Assignable(T, GenRet))
+forall(Generator, GenRet | { GenRet ?()(Generator &); }, Iter &, T | Iterator(Iter, T) | Assignable(T, GenRet))
 void generate(Iter first, Iter last, Generator & gen) {
         int i = 0;
 …
+}
 forall(otype T | { int ?==?(T, T); })
+forall(T | { int ?==?(T, T); })
 struct Equals {
         T val;
 };
 forall(otype T | { int ?==?(T, T); })
+forall(T | { int ?==?(T, T); })
 int ?()(Equals(T) eq, T x) {
         return eq.val == x;
+}
 forall(otype T | { T ?*?(T, T); })
+forall(T | { T ?*?(T, T); })
 struct Multiply {
         T val;
 };
 forall(otype T | { T ?*?(T, T); })
+forall(T | { T ?*?(T, T); })
 T ?()(Multiply(T) * mult, T x) {
         return mult->val * x;
 …
 // TODO: generalize to ttype return; doesn't work yet
 // like std::function
 forall(otype Return, ttype Args)
+forall(Return, Args...)
 struct function {
         Return (*f)(Args);

tests/genericUnion.cfa

-              r342af53
+              r8e4aa05
 #include <limits.hfa>
 forall(otype T)
+forall(T)
 union ByteView {
         T val;
 …
 };
 forall(otype T)
+forall(T)
 void print(ByteView(T) x) {
         for (int i = 0; i < sizeof(int); i++) {                         // want to change to sizeof(T)
 …
+}
 forall(otype T)
+forall(T)
 void f(ByteView(T) x, T val) {
         print(x);

tests/global-monomorph.cfa

-              r342af53
+              r8e4aa05
 // Create monomorphic instances of polymorphic types at global scope.
 forall(dtype T)
+forall(T &)
 void poly0(T &) {}
 forall(dtype T | sized(T))
+forall(T & | sized(T))
 void poly1(T &) {}
 forall(otype T)
+forall(T)
 void poly2(T &) {}

tests/identity.cfa

r342af53	r8e4aa05
16	16	#include <fstream.hfa>
17	17
18		forall( ~~otype~~ T )
	18	forall( T )
19	19	T identity( T t ) {
20	20	return t;

tests/init1.cfa

-              r342af53
+              r8e4aa05
+}
 forall (dtype T, dtype S)
+forall (T &, S &)
 T & anycvt( S & s ) {
     return s;               // mismatched referenced type
+}
 forall (dtype T, dtype S)
+forall (T &, S &)
 T * anycvt( S * s ) {
     return s;               // mismatched referenced type

tests/io/.expect/io1.oast.txt

-              r342af53
+              r8e4aa05
-6 28 0 7 1 2
-1 2 3
 opening delimiters
 x (1 x [2 x {3 x =4 x $5 x £6 x ¥7 x ¡8 x ¿9 x «10
 closing delimiters
 , x 2. x 3; x 4! x 5? x 6% x 7 ¢ x 8 » x 9) x 10] x 11} x
+, x 2. x 3; x 4! x 5? x 6% x 7¢ x 8» x 9) x 10] x 11} x
 opening/closing delimiters
 …
 x ( 1 ) x 2 , x 3 :x: 4
+spacing
+1 2 3
+expressions
+6 28 0 7 1 2

tests/io/io1.cfa

-              r342af53
+              r8e4aa05
 // Created On       : Wed Mar  2 16:56:02 2016
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Mon Mar  4 21:42:47 2019
 // Update Count     : 115
+// Last Modified On : Sun Feb 21 10:07:07 2021
+// Update Count     : 119
 //
 …
 int main() {
+        int x = 3, y = 5, z = 7;
+        sout | x * 3 | y + 1 | z << 2 | x == y | (x | y) | (x || y) | (x > z ? 1 : 2);
+        sout | 0 | 1 | 2 | 3;
+        sout | '0' | '1' | '2' | '3';
+        sout | 0 | "" | 1 | "" | 2 | "" | 3;
+        sout | nl;
+        sout | nlOff;                                                                           // auto nl off
-        sout | nlOff;
         sout | "opening delimiters" | nl;
         sout | "x (" | 1;
 …
         sout | nl | nl;
+        sout | nlOn;
+        sout | nlOn;                                                                            // auto nl on
         sout | "override opening/closing delimiters";
         sout | "x ( " | 1 | " ) x" | 2 | " , x" | 3 | " :x: " | 4;
         sout | nl;
+        sout | "spacing";
+        sout | 0 | 1 | 2 | 3;
+        sout | '0' | '1' | '2' | '3';
+        sout | 0 | "" | 1 | "" | 2 | "" | 3;
+        sout | nl;
+        sout | "expressions";
+        int x = 3, y = 5, z = 7;
+        sout | x * 3 | y + 1 | z << 2 | x == y | (x | y) | (x || y) | (x > z ? 1 : 2);
+}

tests/nested-types.cfa

r342af53	r8e4aa05
16	16	typedef int N;
17	17	struct A {
18		forall(~~otype~~ T)
	18	forall(T)
19	19	struct N {
20	20	T x;

tests/poly-d-cycle.cfa

-              r342af53
+              r8e4aa05
 // Check that a cycle of polymorphic dtype structures can be instancated.
 forall(dtype T)
+forall(T &)
 struct func_table;
 forall(dtype U)
+forall(U &)
 struct object {
         func_table(U) * virtual_table;
 };
 forall(dtype T)
+forall(T &)
 struct func_table {
         void (*object_func)(object(T) *);

tests/poly-o-cycle.cfa

-              r342af53
+              r8e4aa05
 // Check that a cycle of polymorphic otype structures can be instancated.
 forall(otype T)
+forall(T)
 struct func_table;
 forall(otype U)
+forall(U)
 struct object {
         func_table(U) * virtual_table;
 };
 forall(otype T)
+forall(T)
 struct func_table {
         void (*object_func)(object(T) *);

tests/polymorphism.cfa

-              r342af53
+              r8e4aa05
 #include <fstream.hfa>
 forall(otype T)
+forall(T)
 T f(T x, T y) {
         x = y;
 …
+}
 forall(otype T) T ident(T x) {
+forall(T) T ident(T x) {
         return x;
+}
 forall( otype T, otype U )
+forall( T, U )
 size_t struct_size( T i, U j ) {
         struct S { T i; U j; };
 …
+}
 forall( otype T, otype U )
+forall( T, U )
 size_t union_size( T i, U j ) {
         union B { T i; U j; };
 …
 // perform some simple operations on aggregates of T and U
 forall( otype T | { void print(T); int ?==?(T, T); }, otype U | { void print(U); U ?=?(U&, zero_t); } )
+forall( T | { void print(T); int ?==?(T, T); }, U | { void print(U); U ?=?(U&, zero_t); } )
 U foo(T i, U j) {
         struct S { T i; U j; };

tests/raii/ctor-autogen.cfa

-              r342af53
+              r8e4aa05
 // dtype-static generic type is otype
 forall(dtype T)
+forall(T &)
 struct DtypeStaticStruct {
   T * data;
 …
 };
 forall(dtype T)
+forall(T &)
 union DtypeStaticUnion {
   T * data;
 …
 // dynamic generic type is otype
 forall(otype T)
+forall(T)
 struct DynamicStruct {
         T x;
 };
 forall(otype T)
+forall(T)
 union DynamicUnion {
         T x;
 …
 forall(otype T)
+forall(T)
 T identity(T x) { return x; }

tests/simpleGenericTriple.cfa

-              r342af53
+              r8e4aa05
 //
 forall(otype T)
+forall(T)
 struct T3 {
         T f0, f1, f2;
 };
 forall(otype T | { T ?+?(T, T); })
+forall(T | { T ?+?(T, T); })
 T3(T) ?+?(T3(T) x, T3(T) y) {
         T3(T) z = { x.f0+y.f0, x.f1+y.f1, x.f2+y.f2 };

tests/smart-pointers.cfa

-              r342af53
+              r8e4aa05
 #include <memory.hfa>
 #include <stdlib.hfa>
+#include <assert.h>
 void counter_test(void) {
 …
+}
+void declare_test(void) {
+        counter_ptr(int) ptr_i0 = 3;
+        counter_ptr(char) ptr_c0 = 'a';
+        counter_ptr(float) ptr_f0 = 3.5f;
+        counter_ptr(double) ptr_d0 = 3.5;
+        unique_ptr(int) ptr_i1 = 3;
+        unique_ptr(char) ptr_c1 = 'a';
+        unique_ptr(float) ptr_f1 = 3.5f;
+        unique_ptr(double) ptr_d1 = 3.5;
+}
 int main(int argc, char * argv[]) {
         counter_test();
         unique_test();
         pointer_equality();
+        printf("done\n");
+}

tests/sum.cfa

-              r342af53
+              r8e4aa05
 #include <stdlib.hfa>
 trait sumable( otype T ) {
+trait sumable( T ) {
         void ?{}( T &, zero_t );                                                        // 0 literal constructor
         T ?+?( T, T );                                                                          // assortment of additions
 …
 }; // sumable
 forall( otype T | sumable( T ) )                                                // use trait
+forall( T | sumable( T ) )                                              // use trait
 T sum( size_t size, T a[] ) {
         T total = 0;                                                                            // initialize by 0 constructor
 …
                  | sum( size, (S *)a ) | ", check" | (S)s;
         forall( otype Impl | sumable( Impl ) )
+        forall( Impl | sumable( Impl ) )
         struct GS {
                 Impl * x, * y;
 …
                  sum( size, (S *)a ).[i, j], s.[i, j] );
         forall( otype Impl | sumable( Impl ) )
+        forall( Impl | sumable( Impl ) )
         struct GS {
                 Impl * x, * y;

tests/tuple/tuplePolymorphism.cfa

-              r342af53
+              r8e4aa05
 // ensure that f is a viable candidate for g, even though its parameter structure does not exactly match
 [A] f([A, B] x, B y) { printf("%g %c %g %lld %c %lld %lld %c %lld\n", x.0.[x,y,z], x.1.[x,y,z], y.[x,y,z]); return x.0; }
 forall(otype T, otype U | { T f(T, U, U); })
+forall(T, U | { T f(T, U, U); })
 void g(T x, U y) { f(x, y, y); }
 // add two triples
 forall(otype T | { T ?+?(T, T); })
+forall(T | { T ?+?(T, T); })
 [T, T, T] ?+?([T, T, T] x, [T, T, T] y) {
         return [x.0+y.0, x.1+y.1, x.2+y.2];
 …
+}
 forall(otype T)
+forall(T)
 [T, T] foo([T, T] y) {
         [T, T] x;

tests/tuple/tupleVariadic.cfa

-              r342af53
+              r8e4aa05
         printf("called func(void)\n");
+}
 forall(otype T, ttype Params | { void process(T); void func(Params); })
+forall(T, Params... | { void process(T); void func(Params); })
 void func(T arg1, Params p) {
         process(arg1);
 …
+}
 forall(otype T)
+forall(T)
 T * copy(T x) {
         // test calling new inside a polymorphic function
 …
+}
 forall(ttype T | { void foo(T); }) void bar(T x) {}
+forall(T... | { void foo(T); }) void bar(T x) {}
 void foo(int) {}

tests/zombies/ArrayN.c

r342af53	r8e4aa05
6	6	// }
7	7
8		forall(~~otype~~ index_t)
	8	forall(index_t)
9	9	index_t offset_to_index(unsigned offset, index_t size) {
10	10	return [offset / size.0, offset % size.1];

tests/zombies/Members.c

-              r342af53
+              r8e4aa05
 int ?=?( int*, int );
 float ?=?( float*, float );
 forall( dtype DT ) DT * ?=?( DT**, DT* );
 forall(otype T) lvalue T *?( T* );
+forall( DT & ) DT * ?=?( DT**, DT* );
+forall(T) lvalue T *?( T* );
 char *__builtin_memcpy();

tests/zombies/Rank2.c

-              r342af53
+              r8e4aa05
 int ?=?( int &, int );
 forall(dtype DT) DT * ?=?( DT *&, DT * );
+forall(DT &) DT * ?=?( DT *&, DT * );
 void a() {
         forall( otype T ) void f( T );
         void g( forall( otype U ) void p( U ) );
+        forall( T ) void f( T );
+        void g( forall( U ) void p( U ) );
         g( f );
+}
 …
 void g() {
         void h( int *null );
         forall( otype T ) T id( T );
+        forall( T ) T id( T );
 //      forall( dtype T ) T *0;
 //      int 0;

tests/zombies/abstype.c

-              r342af53
+              r8e4aa05
+}
 forall( otype T ) T *?( T * );
+forall( T ) T *?( T * );
 int ?++( int * );
 int ?=?( int *, int );
 forall( dtype DT ) DT * ?=?( DT **, DT * );
+forall( DT & ) DT * ?=?( DT **, DT * );
 otype U = int *;

tests/zombies/context.cfa

-              r342af53
+              r8e4aa05
 // trait declaration
 trait has_q( otype T ) {
+trait has_q( T ) {
         T q( T );
 };
 forall( otype z | has_q( z ) ) void f() {
         trait has_r( otype T, otype U ) {
+forall( z | has_q( z ) ) void f() {
+        trait has_r( T, U ) {
                 T r( T, T (T,U) );
         };

tests/zombies/gc_no_raii/bug-repro/blockers/explicit_cast.c

-              r342af53
+              r8e4aa05
 };
 forall(otype T)
+forall(T)
 struct gcpointer
+{
 …
 };
 forall(otype T)
+forall(T)
 static inline gcpointer(T) gcmalloc()
+{

tests/zombies/gc_no_raii/bug-repro/blockers/recursive_realloc.c

-              r342af53
+              r8e4aa05
 #include <stdlib.hfa>
 trait allocator_c(otype T, otype allocator_t)
+trait allocator_c(T, allocator_t)
+{
         void realloc(allocator_t* const, size_t);
 };
 forall(otype T)
+forall(T)
 struct heap_allocator
+{
 …
 };
 forall(otype T)
+forall(T)
 inline void realloc(heap_allocator(T) *const this, size_t size)
+{

tests/zombies/gc_no_raii/bug-repro/deref.c

-              r342af53
+              r8e4aa05
     forall(otype T)
+    forall(T)
     struct wrap
+    {
 …
     };
     forall(otype T)
+    forall(T)
     T *? (wrap(T) rhs)
+    {

tests/zombies/gc_no_raii/bug-repro/field.c

-              r342af53
+              r8e4aa05
 //------------------------------------------------------------------------------
 //Declaration
 trait allocator_c(otype T, otype allocator_t)
+trait allocator_c(T, allocator_t)
+{
         void ctor(allocator_t* const);
 …
 };
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 struct vector
+{

tests/zombies/gc_no_raii/bug-repro/malloc.c

r342af53	r8e4aa05
1		forall(~~otype~~ T)
	1	forall(T)
2	2	struct wrapper
3	3	{
…	…
5	5	};
6	6
7		forall(~~otype~~ T)
	7	forall(T)
8	8	void ctor(wrapper(T)* this)
9	9	{
…	…
11	11	}
12	12
13		forall(~~otype~~ T)
	13	forall(T)
14	14	wrapper(T) gcmalloc()
15	15	{
…	…
19	19	}
20	20
21		forall(~~otype~~ T)
	21	forall(T)
22	22	wrapper(T)* ?=? (wrapper(T)* lhs, wrapper(T)* rhs)
23	23	{

tests/zombies/gc_no_raii/bug-repro/oddtype.c

-              r342af53
+              r8e4aa05
 forall(dtype T)
+forall(T &)
 struct wrap {
         int i;
 };
 forall(otype T) void ?{}(wrap(T)* this) {}
 forall(otype T) void ?=?(wrap(T)* this) {}
 forall(otype T) void ^?{}(wrap(T)* this) {}
+forall(T) void ?{}(wrap(T)* this) {}
+forall(T) void ?=?(wrap(T)* this) {}
+forall(T) void ^?{}(wrap(T)* this) {}
 struct List_t {

tests/zombies/gc_no_raii/bug-repro/push_back.h

-              r342af53
+              r8e4aa05
 //------------------------------------------------------------------------------
 //Declaration
 trait allocator_c(otype T, otype allocator_t) {
+trait allocator_c(T, allocator_t) {
         void ctor(allocator_t* const);
         void dtor(allocator_t* const);
 …
 };
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 struct vector
+{
 …
 //------------------------------------------------------------------------------
 //Initialization
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void vector_ctor(vector(T, allocator_t) *const this);
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void dtor(vector(T, allocator_t) *const this);
 //------------------------------------------------------------------------------
 //Allocator
 forall(otype T)
+forall(T)
 struct heap_allocator
+{
 …
 };
 forall(otype T)
+forall(T)
 void ctor(heap_allocator(T) *const this);
 forall(otype T)
+forall(T)
 void dtor(heap_allocator(T) *const this);
 forall(otype T)
+forall(T)
 void realloc(heap_allocator(T) *const this, size_t size);
 forall(otype T)
+forall(T)
 inline T* data(heap_allocator(T) *const this)
+{
 …
 //------------------------------------------------------------------------------
 //Capacity
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 inline bool empty(vector(T, allocator_t) *const this)
+{
 …
+}
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 inline bool size(vector(T, allocator_t) *const this)
+{
 …
+}
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 inline void reserve(vector(T, allocator_t) *const this, size_t size)
+{
 …
 //------------------------------------------------------------------------------
 //Modifiers
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void push_back(vector(T, allocator_t) *const this, T value);

tests/zombies/gc_no_raii/bug-repro/realloc.c

-              r342af53
+              r8e4aa05
 void* realloc(void*, unsigned long int);
 forall(otype T)
+forall(T)
 struct wrap
+{
 …
 };
 forall(otype T)
+forall(T)
 static inline void realloc(wrap(T) *const this, unsigned long int size)
+{

tests/zombies/gc_no_raii/bug-repro/return.c

r342af53	r8e4aa05
1		forall(~~otype~~ T)
	1	forall(T)
2	2	struct wrapper
3	3	{
…	…
5	5	};
6	6
7		forall(~~otype~~ T)
	7	forall(T)
8	8	wrapper(T) create()
9	9	{
…	…
12	12	}
13	13
14		forall(~~otype~~ T)
	14	forall(T)
15	15	wrapper(T)* ?=?(wrapper(T)* lhs, wrapper(T)* rhs)
16	16	{

tests/zombies/gc_no_raii/bug-repro/return_template.c

-              r342af53
+              r8e4aa05
 forall(otype T)
+forall(T)
 struct wrap
+{
 …
 };
 forall(otype T) void ?{}(wrap(T)* this);
 forall(otype T) void ?{}(wrap(T)* this, wrap(T)* rhs);
 forall(otype T) void ^?{}(wrap(T)* this);
 forall(otype T) void ?=?(wrap(T)* this, wrap(T)* rhs);
+forall(T) void ?{}(wrap(T)* this);
+forall(T) void ?{}(wrap(T)* this, wrap(T)* rhs);
+forall(T) void ^?{}(wrap(T)* this);
+forall(T) void ?=?(wrap(T)* this, wrap(T)* rhs);
 forall(otype T)
+forall(T)
 wrap(T) test()
+{

tests/zombies/gc_no_raii/bug-repro/slow_malloc.c

r342af53	r8e4aa05
1	1	#include <stdlib.hfa>
2	2
3		forall(~~otype~~ T)
	3	forall(T)
4	4	struct heap_allocator
5	5	{

tests/zombies/gc_no_raii/bug-repro/zero.c

-              r342af53
+              r8e4aa05
 forall(otype T)
+forall(T)
 struct wrap
+{
 …
 };
 forall(otype T)
+forall(T)
 int ?==? (wrap(T) lhs, wrap(T) rhs)
+{
 …
 struct wrap(int) 0;
 /*/
 forall(otype T)
+forall(T)
 struct wrap(T) 0;
 //*/

tests/zombies/gc_no_raii/src/gc.h

r342af53	r8e4aa05
13	13	// }
14	14
15		forall(~~otype~~ T)
	15	forall(T)
16	16	static inline void gcmalloc(gcpointer(T)* ptr)
17	17	{

tests/zombies/gc_no_raii/src/gcpointers.c

-              r342af53
+              r8e4aa05
 #endif
 forall(otype T) void ?{}(gcpointer(T)* this) {
+forall(T) void ?{}(gcpointer(T)* this) {
         (&this->internal) {};
+}
 forall(otype T) void ?{}(gcpointer(T)* this, void* address) {
+forall(T) void ?{}(gcpointer(T)* this, void* address) {
         (&this->internal) { address };
+}
 forall(otype T) void ?{}(gcpointer(T)* this, gcpointer(T) other) {
+forall(T) void ?{}(gcpointer(T)* this, gcpointer(T) other) {
         (&this->internal) { other.internal };
+}
 forall(otype T) void ^?{}(gcpointer(T)* this) {
+forall(T) void ^?{}(gcpointer(T)* this) {
         ^?{}(&this->internal);
+}
 forall(otype T) gcpointer(T) ?=?(gcpointer(T)* this, gcpointer(T) rhs) {
+forall(T) gcpointer(T) ?=?(gcpointer(T)* this, gcpointer(T) rhs) {
         this->internal = rhs.internal;
         return *this;
 …
 // forall(otype T) T *?(gcpointer(T) this);
 forall(otype T) T* get(gcpointer(T)* this) {
+forall(T) T* get(gcpointer(T)* this) {
         return (T*)this->internal.ptr;
+}
 //
 // //Logical operators
 forall(otype T) int ?!=?(gcpointer(T) this, int zero) {
+forall(T) int ?!=?(gcpointer(T) this, int zero) {
         return this.internal.ptr != 0;
+}

tests/zombies/gc_no_raii/src/gcpointers.h

-              r342af53
+              r8e4aa05
 #include <stdint.h>
 forall(dtype T)
+forall(T &)
 struct gcpointer;
 …
 #endif
 forall(dtype T)
+forall(T &)
 struct gcpointer
+{
 …
 //
 forall(otype T) void ?{}(gcpointer(T)* this);
 forall(otype T) void ?{}(gcpointer(T)* this, void* address);
 forall(otype T) void ?{}(gcpointer(T)* this, gcpointer(T) other);
 forall(otype T) void ^?{}(gcpointer(T)* this);
 forall(otype T) gcpointer(T) ?=?(gcpointer(T)* this, gcpointer(T) rhs);
+forall(T) void ?{}(gcpointer(T)* this);
+forall(T) void ?{}(gcpointer(T)* this, void* address);
+forall(T) void ?{}(gcpointer(T)* this, gcpointer(T) other);
+forall(T) void ^?{}(gcpointer(T)* this);
+forall(T) gcpointer(T) ?=?(gcpointer(T)* this, gcpointer(T) rhs);
 // forall(otype T) T *?(gcpointer(T) this);
 forall(otype T) T* get(gcpointer(T)* this);
+forall(T) T* get(gcpointer(T)* this);
 //Logical operators
 forall(otype T) int ?!=?(gcpointer(T) this, int zero);
 forall(otype T) int ?!=?(gcpointer(T) this, gcpointer(T) rhs);
 forall(otype T) int ?==?(gcpointer(T) this, gcpointer(T) rhs);
+forall(T) int ?!=?(gcpointer(T) this, int zero);
+forall(T) int ?!=?(gcpointer(T) this, gcpointer(T) rhs);
+forall(T) int ?==?(gcpointer(T) this, gcpointer(T) rhs);

tests/zombies/gc_no_raii/src/tools.h

-              r342af53
+              r8e4aa05
 // }
 trait has_equal(otype T)
+trait has_equal(T)
+{
         signed int ?==?(T a, T b);
 };
 trait InputIterator_t(otype T, otype InputIterator)
+trait InputIterator_t(T, InputIterator)
+{
         signed int ?==?(InputIterator a, InputIterator b);
 …
 };
 forall(otype T | has_equal(T), otype InputIterator | InputIterator_t(T, InputIterator))
+forall(T | has_equal(T), InputIterator | InputIterator_t(T, InputIterator))
 static inline InputIterator find( InputIterator first, const InputIterator* const last, T val)
+{

tests/zombies/hashtable.cfa

-              r342af53
+              r8e4aa05
 trait has_hash( otype K ) {
+trait has_hash( K ) {
     size_t hash(K);
     int ?==?( K, K );
 };
 trait hkey( otype K, dtype tN | has_hash(K) ) {
+trait hkey( K, tN & | has_hash(K) ) {
     K key(tN &);
 };
 forall( otype K, dtype tN, dtype tE | $dlistable(tN, tE) | hkey(K, tN) ) {
+forall( K, tN &, tE & | $dlistable(tN, tE) | hkey(K, tN) ) {
     struct hashtable {
 …
+}
 forall( otype K, dtype tN, dtype tE | $dlistable(tN, tE) | hkey(K, tN) | { void defaultResumptionHandler(ht_fill_limit_crossed &); } ) {
+forall( K, tN &, tE & | $dlistable(tN, tE) | hkey(K, tN) | { void defaultResumptionHandler(ht_fill_limit_crossed &); } ) {
     void ?{}( hashtable(K, tN, tE) & this, size_t n_buckets, dlist(tN, tE) *buckets ) {
 …
+}
 forall( otype K, dtype tN, dtype tE | $dlistable(tN, tE) | hkey(K, tN) ) {
+forall( K, tN &, tE & | $dlistable(tN, tE) | hkey(K, tN) ) {
     float fill_frac( hashtable(K, tN, tE) & this ) with(this) {
 …
 trait heaped(dtype T) {
+trait heaped(T &) {
     T * alloc( size_t );
     void free( void * );
 …
+}
 forall( otype K, dtype tN, dtype tE | $dlistable(tN, tE) | hkey(K, tN) | heaped( dlist(tN, tE) ) ) {
+forall( K, tN &, tE & | $dlistable(tN, tE) | hkey(K, tN) | heaped( dlist(tN, tE) ) ) {
     struct hashtable_dynamic {

tests/zombies/hashtable2.cfa

-              r342af53
+              r8e4aa05
 trait pretendsToMatter( dtype TTT ) {
+trait pretendsToMatter( TTT & ) {
     void actsmart(TTT &);
 };
 forall( dtype TTTx )
+forall( TTTx & )
 void actsmart(TTTx &) {}
 …
 //   2. shows up in -CFA output as hashtable_rbs(), which is bad C; expecting hashtable_rbs*
 forall( otype Tt_unused | pretendsToMatter(Tt_unused) ) {
+forall( Tt_unused | pretendsToMatter(Tt_unused) ) {
     // hashtable of request by source
 …
+}
 forall( otype Tt_unused | pretendsToMatter(Tt_unused) | { void defaultResumptionHandler(ht_fill_limit_crossed &); } ) {
+forall( Tt_unused | pretendsToMatter(Tt_unused) | { void defaultResumptionHandler(ht_fill_limit_crossed &); } ) {
     void ?{}( hashtable_rbs(Tt_unused) & this, size_t n_buckets, dlist(request_in_ht_by_src, request) *buckets,
 …
 void defaultResumptionHandler( ht_auto_resize_pending & ex );
 forall( otype Tt_unused | pretendsToMatter(Tt_unused) ) {
+forall( Tt_unused | pretendsToMatter(Tt_unused) ) {
     float fill_frac( hashtable_rbs(Tt_unused) & this ) with(this) {
 …
 trait heaped(dtype T) {
+trait heaped(T &) {
     T * alloc( size_t );
     void free( void * );
 …
 void __dynamic_defaultResumptionHandler(ht_fill_limit_crossed &);
 forall( otype Tt_unused ) {
+forall( Tt_unused ) {
     struct hashtable_rbs_dynamic {
 …
 forall( otype Tt_unused | heaped( dlist(request_in_ht_by_src, request) ) ) {
+forall( Tt_unused | heaped( dlist(request_in_ht_by_src, request) ) ) {
     void ?{}( hashtable_rbs_dynamic(Tt_unused).resize_policy & this, size_t nbuckets_floor ) {
 …
+}
 forall( otype Tt_unused ) {
+forall( Tt_unused ) {
     void rehashToLarger_STEP( hashtable_rbs_dynamic(Tt_unused) & this, size_t new_n_buckets ) with (this) {
         rehashToLarger( this, new_n_buckets );

tests/zombies/huge.c

r342af53	r8e4aa05
14	14	//
15	15
16		int huge( int n, forall( ~~otype~~ T ) T (*f)( T ) ) {
	16	int huge( int n, forall( T ) T (*f)( T ) ) {
17	17	if ( n <= 0 )
18	18	return f( 0 );

tests/zombies/it_out.c

-              r342af53
+              r8e4aa05
 typedef unsigned long streamsize_type;
 trait ostream( dtype os_type ) {
+trait ostream( os_type & ) {
         os_type *write( os_type *, const char *, streamsize_type );
         int fail( os_type * );
 };
 trait writeable( otype T ) {
         forall( dtype os_type | ostream( os_type ) ) os_type * ?<<?( os_type *, T );
+trait writeable( T ) {
+        forall( os_type & | ostream( os_type ) ) os_type * ?<<?( os_type *, T );
 };
 forall( dtype os_type | ostream( os_type ) ) os_type * ?<<?( os_type *, char );
 forall( dtype os_type | ostream( os_type ) ) os_type * ?<<?( os_type *, int );
 forall( dtype os_type | ostream( os_type ) ) os_type * ?<<?( os_type *, const char * );
+forall( os_type & | ostream( os_type ) ) os_type * ?<<?( os_type *, char );
+forall( os_type & | ostream( os_type ) ) os_type * ?<<?( os_type *, int );
+forall( os_type & | ostream( os_type ) ) os_type * ?<<?( os_type *, const char * );
 trait istream( dtype is_type ) {
+trait istream( is_type & ) {
         is_type *read( is_type *, char *, streamsize_type );
         is_type *unread( is_type *, char );
 …
 };
 trait readable( otype T ) {
         forall( dtype is_type | istream( is_type ) ) is_type * ?<<?( is_type *, T );
+trait readable( T ) {
+        forall( is_type & | istream( is_type ) ) is_type * ?<<?( is_type *, T );
 };
 forall( dtype is_type | istream( is_type ) ) is_type * ?>>?( is_type *, char* );
 forall( dtype is_type | istream( is_type ) ) is_type * ?>>?( is_type *, int* );
+forall( is_type & | istream( is_type ) ) is_type * ?>>?( is_type *, char* );
+forall( is_type & | istream( is_type ) ) is_type * ?>>?( is_type *, int* );
 trait iterator( otype iterator_type, otype elt_type ) {
+trait iterator( iterator_type, elt_type ) {
         iterator_type ?++( iterator_type* );
         iterator_type ++?( iterator_type* );
 …
 };
 forall( otype elt_type | writeable( elt_type ),
                 otype iterator_type | iterator( iterator_type, elt_type ),
                 dtype os_type | ostream( os_type ) )
+forall( elt_type | writeable( elt_type ),
+                iterator_type | iterator( iterator_type, elt_type ),
+                os_type & | ostream( os_type ) )
 void write_all( iterator_type begin, iterator_type end, os_type *os );
 forall( otype elt_type | writeable( elt_type ),
                 otype iterator_type | iterator( iterator_type, elt_type ),
                 dtype os_type | ostream( os_type ) )
+forall( elt_type | writeable( elt_type ),
+                iterator_type | iterator( iterator_type, elt_type ),
+                os_type & | ostream( os_type ) )
 void write_all( elt_type begin, iterator_type end, os_type *os ) {
         os << begin;

tests/zombies/new.c

r342af53	r8e4aa05
14	14	//
15	15
16		forall( ~~otype~~ T )
	16	forall( T )
17	17	void f( T *t ) {
18	18	t--;

tests/zombies/occursError.cfa

-              r342af53
+              r8e4aa05
 forall( otype T ) void f( void (*)( T, T * ) );
 forall( otype U ) void g( U,  U * );
 forall( otype U ) void h( U *, U );
+forall( T ) void f( void (*)( T, T * ) );
+forall( U ) void g( U,  U * );
+forall( U ) void h( U *, U );
 void test() {

tests/zombies/prolog.c

-              r342af53
+              r8e4aa05
 void is_integer( int x ) {}
 trait ArithmeticType( otype T ) {
+trait ArithmeticType( T ) {
         void is_arithmetic( T );
 };
 trait IntegralType( otype T | ArithmeticType( T ) ) {
+trait IntegralType( T | ArithmeticType( T ) ) {
         void is_integer( T );
 };
 forall( otype T | IntegralType( T ) | { void printResult( T ); } )
+forall( T | IntegralType( T ) | { void printResult( T ); } )
 void hornclause( T param ) {
         printResult( param );

tests/zombies/quad.c

-              r342af53
+              r8e4aa05
 #include <fstream.hfa>
 forall( otype T | { T ?*?( T, T ); } )
+forall( T | { T ?*?( T, T ); } )
 T square( T t ) {
         return t * t;
+}
 forall( otype U | { U square( U ); } )
+forall( U | { U square( U ); } )
 U quad( U u ) {
         return square( square( u ) );

tests/zombies/scope.cfa

-              r342af53
+              r8e4aa05
 y p;
 trait has_u( otype z ) {
+trait has_u( z ) {
         z u(z);
 };
 forall( otype t | has_u( t ) )
+forall( t | has_u( t ) )
 y q( t the_t ) {
         t y = u( the_t );

tests/zombies/simplePoly.c

r342af53	r8e4aa05
14	14	//
15	15
16		forall( ~~otype T, otype~~ U \| { T f( T, U ); } )
	16	forall( T, U \| { T f( T, U ); } )
17	17	T q( T t, U u ) {
18	18	return f( t, u );

tests/zombies/simpler.c

r342af53	r8e4aa05
14	14	//
15	15
16		forall( ~~otype~~ T ) T id( T, T );
	16	forall( T ) T id( T, T );
17	17
18	18	int main() {

tests/zombies/specialize.c

r342af53	r8e4aa05
39	39	}
40	40
41		forall( ~~otype~~ T ) T f( T t )
	41	forall( T ) T f( T t )
42	42	{
43	43	printf( "in f; sizeof T is %d\n", sizeof( T ) );

tests/zombies/square.c

r342af53	r8e4aa05
16	16	#include <fstream.hfa>
17	17
18		forall( ~~otype~~ T \| { T ?*?( T, T ); } )
	18	forall( T \| { T ?*?( T, T ); } )
19	19	T square( T t ) {
20	20	return t * t;

tests/zombies/structMember.cfa

r342af53	r8e4aa05
66	66	S.T;
67	67	.S.T;
68		forall( ~~otype S, otype~~ T ) struct W {
	68	forall( S, T ) struct W {
69	69	struct X {};
70	70	};

tests/zombies/subrange.cfa

-              r342af53
+              r8e4aa05
 // A small context defining the notion of an ordered otype.  (The standard
 // library should probably contain a context for this purpose.)
 trait ordered(otype T) {
+trait ordered(T) {
     int ?<?(T, T), ?<=?(T, T);
 };
 …
 // A subrange otype resembling an Ada subotype with a base otype and a range
 // constraint.
 otype subrange(otype base_t | ordered(base_t), base_t low = 0, base_t high = 8) = base_t;
+otype subrange(base_t | ordered(base_t), base_t low = 0, base_t high = 8) = base_t;
 // Note that subrange() can be applied to floating-point and pointer otypes, not
 …
 // Convenient access to subrange bounds, for instance for iteration:
 forall (otype T, T low, T high)
+forall (T, T low, T high)
 T lbound( subrange(T, low, high) v) {
     return low;
+}
 forall (otype T, T low, T high)
+forall (T, T low, T high)
 T hbound( subrange(T, low, high) v) {
     return high;
 …
 // of exception handling here.  Inlining allows the compiler to eliminate
 // bounds checks.
 forall (otype T | ordered(T), T low, T high)
+forall (T | ordered(T), T low, T high)
 inline subrange(T, low, high) ?=?(subrange(T, low, high)* target, T source) {
     if (low <= source && source <= high) *((T*)target) = source;
 …
 // compares range bounds so that the compiler can optimize checks away when the
 // ranges are known to overlap.
 forall (otype T | ordered(T), T t_low, T t_high, T s_low, T s_high)
+forall (T | ordered(T), T t_low, T t_high, T s_low, T s_high)
 inline subrange(T, t_low, t_high) ?=?(subrange(T, t_low, t_high)* target,
                                       subrange(T, s_low, s_high) source) {

tests/zombies/twice.c

r342af53	r8e4aa05
16	16	#include <fstream.hfa>
17	17
18		forall( ~~otype~~ T \| { T ?+?( T, T ); } )
	18	forall( T \| { T ?+?( T, T ); } )
19	19	T twice( const T t ) {
20	20	return t + t;

tests/zombies/typeGenerator.cfa

-              r342af53
+              r8e4aa05
 context addable( otype T ) {
+context addable( T ) {
         T ?+?( T,T );
         T ?=?( T*, T);
 };
 otype List1( otype T | addable( T ) ) = struct { T data; List1( T ) *next; } *;
+otype List1( T | addable( T ) ) = struct { T data; List1( T ) *next; } *;
 typedef List1( int ) ListOfIntegers;
 //List1( int ) li;
 …
 [int] h( * List1( int ) p );                                                    // new declaration syntax
 struct( otype T ) S2 { T i; };                                                  // actual definition
+struct( T ) S2 { T i; };                                                        // actual definition
 struct( int ) S3 v1, *p;                                                                // expansion and instantiation
 struct( otype T )( int ) S24 { T i; } v2;                               // actual definition, expansion and instantiation
 struct( otype T )( int ) { T i; } v2;                                   // anonymous actual definition, expansion and instantiation
+struct( T )( int ) S24 { T i; } v2;                             // actual definition, expansion and instantiation
+struct( T )( int ) { T i; } v2;                                 // anonymous actual definition, expansion and instantiation
 struct( otype T | addable( T ) ) node { T data; struct( T ) node *next; };
 otype List( otype T ) = struct( T ) node *;
+struct( T | addable( T ) ) node { T data; struct( T ) node *next; };
+otype List( T ) = struct( T ) node *;
 List( int ) my_list;

tests/zombies/withStatement.cfa

-              r342af53
+              r8e4aa05
+}
 forall( otype T )
+forall( T )
 struct Box {
         T x;
 };
 forall( otype T )
+forall( T )
 void ?{}( Box(T) & this ) with( this ) { // with clause in polymorphic function
         x{};
 …
 void print( int i ) { sout | i; }
 forall( otype T | { void print( T ); })
+forall( T | { void print( T ); })
 void foo( T t ) {
         Box( T ) b = { t };

tests/zombies/wrapper/src/pointer.h

r342af53	r8e4aa05
8	8	// type safe malloc / free
9	9
10		forall(~~otype~~ T)
	10	forall(T)
11	11	T* new()
12	12	{
…	…
16	16	}
17	17
18		forall(~~otype~~ T)
	18	forall(T)
19	19	void delete(T* p)
20	20	{

tools/prettyprinter/Makefile.am

-              r342af53
+              r8e4aa05
 ## Created On       : Wed Jun 28 12:07:10 2017
 ## Last Modified By : Peter A. Buhr
 ## Last Modified On : Mon Apr 16 09:43:23 2018
 ## Update Count     : 20
+## Last Modified On : Thu Jan 28 08:48:22 2021
+## Update Count     : 23
 ###############################################################################
 …
 BUILT_SOURCES = parser.hh
 AM_YFLAGS = -d -t -v
+AM_YFLAGS = -d -t -v -Wno-yacc
 SRC = lex.ll \
 …
 pretty_CXXFLAGS = -Wno-deprecated -Wall -DYY_NO_INPUT -O2 -g -std=c++14
 MAINTAINERCLEANFILES = parser.output
+MOSTLYCLEANFILES = parser.output

tools/prettyprinter/ParserTypes.h

-              r342af53
+              r8e4aa05
 // Created On       : Sun Dec 16 15:00:49 2001
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Sat Jul 22 10:13:09 2017
 // Update Count     : 175
+// Last Modified On : Tue Jan 26 23:05:34 2021
+// Update Count     : 176
 //
 #pragma once
 int yylex();
+extern "C" int yylex();
 #include <string>

tools/prettyprinter/parser.yy

-              r342af53
+              r8e4aa05
 // Created On       : Sat Dec 15 13:44:21 2001
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Sun Apr 15 21:40:30 2018
 // Update Count     : 1052
+// Last Modified On : Tue Jan 26 22:50:03 2021
+// Update Count     : 1053
 //
 …
 #define YYDEBUG_LEXER_TEXT( yylval )                                    // lexer loads this up each time
 #define YYDEBUG 1                                                                               // get the pretty debugging code to compile
+#define YYERROR_VERBOSE                                                                 // more information in syntax errors
 #include <iostream>

Context Navigation

Legend: