Changeset d95969a

.gitignore

rb6a8b31	rd95969a
79	79	# generated by npm
80	80	package-lock.json
	81
	82	# generated by benchmark
	83	benchmark/Cargo.toml

benchmark/io/http/filecache.cfa

-              rb6a8b31
+              rd95969a
 #include <string.h>
+#include <fstream.hfa>
 #include <stdlib.hfa>
 …
                 conflicts += put_file( raw[i], fd );
+        }
         printf("Filled cache from path \"%s\" with %zu files\n", path, fcount);
+        sout | "Filled cache from path \"" | path | "\" with" | fcount | "files";
         if( conflicts > 0 ) {
                 printf("Found %d conflicts (seed: %u)\n", conflicts, options.file_cache.hash_seed);
+                sout | "Found" | conflicts | "conflicts (seed: " | options.file_cache.hash_seed | ")";
                 #if defined(REJECT_CONFLICTS)
                         abort("Conflicts found in the cache");
 …
         if(options.file_cache.list) {
                 printf("Listing files and exiting\n");
+                sout | "Listing files and exiting";
                 for(i; fcount) {
                         int s; char u;
                         [s, u] = human_size(raw[i].size);
                         printf("%4d%c - %s\n", s, u, raw[i].file);
+                        sout | s | u | "-" | raw[i].file;
                         free(raw[i].file);
+                }
 …
 [int *, int] filefds(int extra) {
+        if(!options.file_cache.path) {
+                int * data = alloc(extra);
+                return [data, 0];
+        }
         if(!file_cache.entries) {
                 abort("File cache not filled!\n");

benchmark/io/http/main.cfa

-              rb6a8b31
+              rd95969a
 #include <unistd.h>
 extern "C" {
+        #include <signal.h>
         #include <sys/socket.h>
         #include <netinet/in.h>
+}
+#include <fstream.hfa>
 #include <kernel.hfa>
+#include <iofwd.hfa>
 #include <stats.hfa>
 #include <time.hfa>
 …
 //=============================================================================================
+// Stats Printer
+//============================================================================================='
+thread StatsPrinter {};
+void ?{}( StatsPrinter & this ) {
+        ((thread&)this){ "Stats Printer Thread" };
+}
+void main(StatsPrinter & this) {
+        LOOP: for() {
+                waitfor( ^?{} : this) {
+                        break LOOP;
+                }
+                or else {}
+                sleep(10`s);
+                print_stats_now( *options.clopts.instance, CFA_STATS_READY_Q | CFA_STATS_IO );
+        }
+}
+//=============================================================================================
 // Main
 //============================================================================================='
 int main( int argc, char * argv[] ) {
+        __sighandler_t s = 1p;
+        signal(SIGPIPE, s);
         //===================
         // Parse args
         const char * path = parse_options(argc, argv);
+        parse_options(argc, argv);
         //===================
         // Open Files
+        printf("Filling cache from %s\n", path);
+        fill_cache( path );
+        if( options.file_cache.path ) {
+                sout | "Filling cache from" | options.file_cache.path;
+                fill_cache( options.file_cache.path );
+        }
         //===================
         // Open Socket
         printf("%ld : Listening on port %d\n", getpid(), options.socket.port);
+        sout | getpid() | ": Listening on port" | options.socket.port;
         int server_fd = socket(AF_INET, SOCK_STREAM, 0);
         if(server_fd < 0) {
 …
                         if(errno == EADDRINUSE) {
                                 if(waited == 0) {
                                         printf("Waiting for port\n");
+                                        sout | "Waiting for port";
                                 } else {
                                         printf("\r%d", waited);
                                         fflush(stdout);
+                                        sout | "\r" | waited | nonl;
+                                        flush( sout );
+                                }
                                 waited ++;
 …
+                }
                 if(options.file_cache.fixed_fds) {
+                if(options.file_cache.path && options.file_cache.fixed_fds) {
                         register_fixed_files(cl, fds, pipe_off);
+                }
 …
+                {
                         ServerProc procs[options.clopts.nprocs];
+                        StatsPrinter printer;
                         init_protocol();
 …
                                         unpark( workers[i] );
+                                }
                                 printf("%d workers started on %d processors\n", options.clopts.nworkers, options.clopts.nprocs);
+                                sout | options.clopts.nworkers | "workers started on" | options.clopts.nprocs | "processors";
+                                {
                                         char buffer[128];
                                         while(!feof(stdin)) {
                                                 fgets(buffer, 128, stdin);
+                                        while(int ret = cfa_read(0, buffer, 128, 0, -1`s, 0p, 0p); ret != 0) {
+                                                if(ret < 0) abort( "main read error: (%d) %s\n", (int)errno, strerror(errno) );
+                                        }
+                                        printf("Shutting Down\n");
+                                }
+                                        sout | "Shutdown received";
+                                }
+                                sout | "Notifying connections..." | nonl; flush( sout );
                                 for(i; options.clopts.nworkers) {
-                                        printf("Cancelling %p\n", (void*)workers[i].cancel.target);
                                         workers[i].done = true;
                                         cancel(workers[i].cancel);
+                                }
+                                printf("Shutting down socket\n");
+                                sout | "done";
+                                sout | "Shutting down socket..." | nonl; flush( sout );
                                 int ret = shutdown( server_fd, SHUT_RD );
+                                if( ret < 0 ) { abort( "shutdown error: (%d) %s\n", (int)errno, strerror(errno) ); }
+                                if( ret < 0 ) {
+                                        abort( "shutdown error: (%d) %s\n", (int)errno, strerror(errno) );
+                                }
+                                sout | "done";
                                 //===================
                                 // Close Socket
                                 printf("Closing Socket\n");
+                                sout | "Closing Socket..." | nonl; flush( sout );
                                 ret = close( server_fd );
                                 if(ret < 0) {
                                         abort( "close socket error: (%d) %s\n", (int)errno, strerror(errno) );
+                                }
+                                sout | "done";
+                                sout | "Stopping connection threads..." | nonl; flush( sout );
+                        }
+                        printf("Workers Closed\n");
+                        sout | "done";
+                        sout | "Stopping protocol threads..." | nonl; flush( sout );
                         deinit_protocol();
+                }
+                        sout | "done";
+                        sout | "Stopping processors..." | nonl; flush( sout );
+                }
+                sout | "done";
+                sout | "Closing splice fds..." | nonl; flush( sout );
                 for(i; pipe_cnt) {
                         ret = close( fds[pipe_off + i] );
 …
+                }
                 free(fds);
+        }
+                sout | "done";
+                sout | "Stopping processors..." | nonl; flush( sout );
+        }
+        sout | "done";
         //===================
         // Close Files
+        printf("Closing Files\n");
+        close_cache();
+}
+        if( options.file_cache.path ) {
+                sout | "Closing open files..." | nonl; flush( sout );
+                close_cache();
+                sout | "done";
+        }
+}

benchmark/io/http/options.cfa

-              rb6a8b31
+              rd95969a
+}
+#include <bitmanip.hfa>
+#include <fstream.hfa>
 #include <kernel.hfa>
 #include <parseargs.hfa>
+#include <stdlib.h>
 #include <string.h>
 …
         { // file_cache
+,     // path
 ,     // open_flags;
 u,   // hash_seed;
 …
 ,     // nprocs;
 ,     // nworkers;
 ,     // flags;
+                {},     // params;
                 false, // procstats
                 false, // viewhalts
 …
 };
 const char * parse_options( int argc, char * argv[] ) {
+void parse_options( int argc, char * argv[] ) {
         bool subthrd = false;
         bool eagrsub = false;
 …
         bool iokpoll = false;
         unsigned sublen = 16;
+        unsigned nentries = 16;
         static cfa_option opt[] = {
+                {'p', "port",           "Port the server will listen on", options.socket.port},
+                {'c', "cpus",           "Number of processors to use", options.clopts.nprocs},
+                {'L', "log",            "Enable logs", options.log, parse_settrue},
+                {'t', "threads",        "Number of worker threads to use", options.clopts.nworkers},
+                {'b', "accept-backlog", "Maximum number of pending accepts", options.socket.backlog},
+                {'r', "request_len",    "Maximum number of bytes in the http request, requests with more data will be answered with Http Code 414", options.socket.buflen},
+                {'S', "seed",           "seed to use for hashing", options.file_cache.hash_seed },
+                {'C', "cache-size",     "Size of the cache to use, if set to small, will uses closes power of 2", options.file_cache.size },
+                {'l', "list-files",     "List the files in the specified path and exit", options.file_cache.list, parse_settrue },
+                {'s', "submitthread",   "If set, cluster uses polling thread to submit I/O", subthrd, parse_settrue },
+                {'e', "eagersubmit",    "If set, cluster submits I/O eagerly but still aggregates submits", eagrsub, parse_settrue},
+                {'f', "fixed-fds",      "If set, files are open eagerly and pre-registered with the cluster", fixedfd, parse_settrue},
+                {'k', "kpollsubmit",    "If set, cluster uses IORING_SETUP_SQPOLL, implies -f", sqkpoll, parse_settrue },
+                {'i', "kpollcomplete",  "If set, cluster uses IORING_SETUP_IOPOLL", iokpoll, parse_settrue },
+                {'L', "submitlength",   "Max number of submitions that can be submitted together", sublen },
+                { 'p', "port",           "Port the server will listen on", options.socket.port},
+                { 'c', "cpus",           "Number of processors to use", options.clopts.nprocs},
+                { 't', "threads",        "Number of worker threads to use", options.clopts.nworkers},
+                {'\0', "log",            "Enable logs", options.log, parse_settrue},
+                {'\0', "accept-backlog", "Maximum number of pending accepts", options.socket.backlog},
+                {'\0', "request_len",    "Maximum number of bytes in the http request, requests with more data will be answered with Http Code 414", options.socket.buflen},
+                {'\0', "seed",           "seed to use for hashing", options.file_cache.hash_seed },
+                {'\0', "cache-size",     "Size of the cache to use, if set to small, will uses closes power of 2", options.file_cache.size },
+                {'\0', "list-files",     "List the files in the specified path and exit", options.file_cache.list, parse_settrue },
+                { 's', "submitthread",   "If set, cluster uses polling thread to submit I/O", subthrd, parse_settrue },
+                { 'e', "eagersubmit",    "If set, cluster submits I/O eagerly but still aggregates submits", eagrsub, parse_settrue},
+                { 'f', "fixed-fds",      "If set, files are open eagerly and pre-registered with the cluster", fixedfd, parse_settrue},
+                { 'k', "kpollsubmit",    "If set, cluster uses IORING_SETUP_SQPOLL, implies -f", sqkpoll, parse_settrue },
+                { 'i', "kpollcomplete",  "If set, cluster uses IORING_SETUP_IOPOLL", iokpoll, parse_settrue },
+                {'\0', "submitlength",   "Max number of submitions that can be submitted together", sublen },
+                {'\0', "numentries",     "Number of I/O entries", nentries },
         };
 …
         char **left;
         parse_args( argc, argv, opt, opt_cnt, "[OPTIONS]... [PATH]\ncforall http server", left );
+        if( !is_pow2(nentries) ) {
+                unsigned v = nentries;
+                v--;
+                v |= v >> 1;
+                v |= v >> 2;
+                v |= v >> 4;
+                v |= v >> 8;
+                v |= v >> 16;
+                v++;
+                serr | "Warning: num_entries not a power of 2" | '(' | nentries | ')' | "raising to " | v;
+                nentries = v;
+        }
+        options.clopts.params.num_entries = nentries;
         options.clopts.params.poller_submits = subthrd;
 …
         options.clopts.params.num_ready = sublen;
         if( left[0] == 0p ) { return "."; }
+        if( left[0] == 0p ) { return; }
         const char * path = left[0];
 …
         if( left[0] != 0p ) {
+                abort("Too many trailing arguments!\n");
+                serr | "Too many trailing arguments!" | '\'' | path | '\'';
+                while(left[0] != 0p) {
+                        serr | " - " | left[0];
+                        left++;
+                }
+                exit(EXIT_FAILURE);
+        }
         return path;
+        options.file_cache.path = path;
+}

benchmark/io/http/options.hfa

-              rb6a8b31
+              rd95969a
         struct {
+                const char * path;
                 int open_flags;
                 uint32_t hash_seed;
 …
 extern Options options;
 const char * parse_options( int argc, char * argv[] );
+void parse_options( int argc, char * argv[] );

benchmark/io/http/protocol.cfa

-              rb6a8b31
+              rd95969a
         #include <fcntl.h>
+}
+#include <fstream.hfa>
 #include <iofwd.hfa>
 …
 extern "C" {
       int snprintf ( char * s, size_t n, const char * format, ... );
         #include <linux/io_uring.h>
+        // #include <linux/io_uring.h>
+}
 #include <string.h>
 …
         "HTTP/1.1 400 Bad Request\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: 0 \n\n",
         "HTTP/1.1 404 Not Found\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: 0 \n\n",
+        "HTTP/1.1 405 Method Not Allowed\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: 0 \n\n",
+        "HTTP/1.1 408 Request Timeout\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: 0 \n\n",
         "HTTP/1.1 413 Payload Too Large\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: 0 \n\n",
         "HTTP/1.1 414 URI Too Long\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: 0 \n\n",
 …
 ,
 ,
+,
+,
 ,
 ,
 …
                 int ret = cfa_write(fd, it, len, 0, -1`s, 0p, 0p);
                 // int ret = write(fd, it, len);
+                if( ret < 0 ) { if( errno != EAGAIN && errno != EWOULDBLOCK) abort( "'answer error' error: (%d) %s\n", (int)errno, strerror(errno) ); }
+                if( ret < 0 ) {
+                        if( errno == ECONNRESET || errno == EPIPE ) return -ECONNRESET;
+                        if( errno == EAGAIN || errno == EWOULDBLOCK) return -EAGAIN;
+                        abort( "'answer error' error: (%d) %s\n", (int)errno, strerror(errno) );
+                }
                 // update it/len
 …
                 if(ret < 0 ) {
                         if( errno == EAGAIN || errno == EWOULDBLOCK) continue READ;
+                        // if( errno == EINVAL ) return [E400, true, 0, 0];
+                        if( errno == ECONNRESET ) return [E408, true, 0, 0];
+                        if( errno == EPIPE ) return [E408, true, 0, 0];
                         abort( "read error: (%d) %s\n", (int)errno, strerror(errno) );
+                }
 …
+        }
+        if( options.log ) printf("%.*s\n", rlen, buffer);
+        if( options.log ) {
+                write(sout, buffer, rlen);
+                sout | nl;
+        }
         it = buffer;
 …
+}
 void sendfile( int pipe[2], int fd, int ans_fd, size_t count ) {
+int sendfile( int pipe[2], int fd, int ans_fd, size_t count ) {
         unsigned sflags = SPLICE_F_MOVE; // | SPLICE_F_MORE;
         off_t offset = 0;
 …
                 if( ret < 0 ) {
                         if( errno != EAGAIN && errno != EWOULDBLOCK) continue SPLICE1;
+                        if( errno == ECONNRESET ) return -ECONNRESET;
+                        if( errno == EPIPE ) return -EPIPE;
                         abort( "splice [0] error: (%d) %s\n", (int)errno, strerror(errno) );
+                }
 …
                         if( ret < 0 ) {
                                 if( errno != EAGAIN && errno != EWOULDBLOCK) continue SPLICE2;
+                                if( errno == ECONNRESET ) return -ECONNRESET;
+                                if( errno == EPIPE ) return -EPIPE;
                                 abort( "splice [1] error: (%d) %s\n", (int)errno, strerror(errno) );
+                        }
 …
+        }
+        return count;
+}

benchmark/io/http/protocol.hfa

-              rb6a8b31
+              rd95969a
         E400,
         E404,
+        E405,
+        E408,
         E413,
         E414,
 …
 [HttpCode code, bool closed, * const char file, size_t len] http_read(int fd, []char buffer, size_t len, io_cancellation *);
 void sendfile( int pipe[2], int fd, int ans_fd, size_t count );
+int sendfile( int pipe[2], int fd, int ans_fd, size_t count );

benchmark/io/http/worker.cfa

-              rb6a8b31
+              rd95969a
 #include <unistd.h>
+#include <fstream.hfa>
 #include <iofwd.hfa>
 …
         CONNECTION:
         for() {
                 if( options.log ) printf("=== Accepting connection ===\n");
+                if( options.log ) sout | "=== Accepting connection ===";
                 int fd = cfa_accept4( this.[sockfd, addr, addrlen, flags], 0, -1`s, &this.cancel, 0p );
                 // int fd = accept4( this.[sockfd, addr, addrlen, flags] );
                 if(fd < 0) {
                         if( errno == ECONNABORTED ) break;
                         if( errno == EINVAL && this.done ) break;
+                        if( this.done && (errno == EINVAL || errno == EBADF) ) break;
                         abort( "accept error: (%d) %s\n", (int)errno, strerror(errno) );
+                }
                 if( options.log ) printf("=== New connection %d, waiting for requests ===\n", fd);
+                if( options.log ) sout | "=== New connection" | fd | "" | ", waiting for requests ===";
                 REQUEST:
                 for() {
 …
                         size_t len = options.socket.buflen;
                         char buffer[len];
                         if( options.log ) printf("=== Reading request ===\n");
+                        if( options.log ) sout | "=== Reading request ===";
                         [code, closed, file, name_size] = http_read(fd, buffer, len, &this.cancel);
                         // if we are done, break out of the loop
+                        if( closed ) {
+                                if( options.log ) printf("=== Connection closed ===\n");
+                                close(fd);
+                                continue CONNECTION;
+                        }
+                        if( closed ) break REQUEST;
                         // If this wasn't a valid request, return the error code
                         if( code != OK200 ) {
                                 printf("=== Invalid Request : %d ===\n", code_val(code));
+                                sout | "=== Invalid Request :" | code_val(code) | "===";
                                 answer_error(fd, code);
                                 continue REQUEST;
 …
                         if(0 == strncmp(file, "plaintext", min(name_size, sizeof("plaintext") ))) {
                                 if( options.log ) printf("=== Request for /plaintext ===\n");
+                                if( options.log ) sout | "=== Request for /plaintext ===";
                                 char text[] = "Hello, World!\n";
                                 // Send the header
+                                answer_plain(fd, text, sizeof(text));
+                                int ret = answer_plain(fd, text, sizeof(text));
+                                if( ret == -ECONNRESET ) break REQUEST;
                                 if( options.log ) printf("=== Answer sent ===\n");
+                                if( options.log ) sout | "=== Answer sent ===";
                                 continue REQUEST;
+                        }
                         if(0 == strncmp(file, "ping", min(name_size, sizeof("ping") ))) {
                                 if( options.log ) printf("=== Request for /ping ===\n");
+                                if( options.log ) sout | "=== Request for /ping ===";
                                 // Send the header
+                                answer_empty(fd);
+                                int ret = answer_empty(fd);
+                                if( ret == -ECONNRESET ) break REQUEST;
                                 if( options.log ) printf("=== Answer sent ===\n");
+                                if( options.log ) sout | "=== Answer sent ===";
                                 continue REQUEST;
+                        }
+                        if( options.log ) printf("=== Request for file %.*s ===\n", (int)name_size, file);
+                        if( options.log ) {
+                                sout | "=== Request for file " | nonl;
+                                write(sout, file, name_size);
+                                sout | " ===";
+                        }
+                        if( !options.file_cache.path ) {
+                                if( options.log ) {
+                                        sout | "=== File Not Found (" | nonl;
+                                        write(sout, file, name_size);
+                                        sout | ") ===";
+                                }
+                                answer_error(fd, E405);
+                                continue REQUEST;
+                        }
                         // Get the fd from the file cache
 …
                         // If we can't find the file, return 404
                         if( ans_fd < 0 ) {
+                                printf("=== File Not Found ===\n");
+                                if( options.log ) {
+                                        sout | "=== File Not Found (" | nonl;
+                                        write(sout, file, name_size);
+                                        sout | ") ===";
+                                }
                                 answer_error(fd, E404);
                                 continue REQUEST;
 …
                         // Send the header
+                        answer_header(fd, count);
+                        int ret = answer_header(fd, count);
+                        if( ret == -ECONNRESET ) break REQUEST;
                         // Send the desired file
+                        sendfile( this.pipe, fd, ans_fd, count);
+                        ret = sendfile( this.pipe, fd, ans_fd, count);
+                        if( ret == -ECONNRESET ) break REQUEST;
                         if( options.log ) printf("=== Answer sent ===\n");
+                        if( options.log ) sout | "=== Answer sent ===";
+                }
+                if( options.log ) sout | "=== Connection closed ===";
+                close(fd);
+                continue CONNECTION;
+        }
+}

doc/LaTeXmacros/common.tex

-              rb6a8b31
+              rd95969a
 %% Created On       : Sat Apr  9 10:06:17 2016
 %% Last Modified By : Peter A. Buhr
 %% Last Modified On : Mon Oct  5 09:34:46 2020
 %% Update Count     : 464
+%% Last Modified On : Sat Jan 23 09:06:39 2021
+%% Update Count     : 491
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 …
 \usepackage{listings}                                                                   % format program code
 \usepackage{lstlang}
+\usepackage{calc}                                                                               % latex arithmetic
 \makeatletter
 \newcommand{\LstBasicStyle}[1]{{\lst@basicstyle{#1}}}
 \newcommand{\LstKeywordStyle}[1]{{\lst@basicstyle{\lst@keywordstyle{#1}}}}
 …
 showlines=true,                                                 % show blank lines at end of code
 aboveskip=4pt,                                                  % spacing above/below code block
+belowskip=3pt,
+belowskip=-2pt,
+numberstyle=\footnotesize\sf,                   % numbering style
 % replace/adjust listing characters that look bad in sanserif
 literate={-}{\makebox[1ex][c]{\raisebox{0.4ex}{\rule{0.75ex}{0.1ex}}}}1 {^}{\raisebox{0.6ex}{$\scriptscriptstyle\land\,$}}1
 …
 language=CFA,
 escapechar=\$,                                                  % LaTeX escape in CFA code
+mathescape=false,                                               % LaTeX math escape in CFA code $...$
 moredelim=**[is][\color{red}]{@}{@},    % red highlighting @...@
 }% lstset

doc/bibliography/pl.bib

-              rb6a8b31
+              rd95969a
     title       = {Asynchronous Exception Propagation in Blocked Tasks},
     booktitle   = {4th International Workshop on Exception Handling (WEH.08)},
     organization= {16th International Symposium on the Foundations of Software Engineering (FSE 16)},
+    optorganization= {16th International Symposium on the Foundations of Software Engineering (FSE 16)},
     address     = {Atlanta, U.S.A},
     month       = nov,
 …
 @inproceedings{Edelson92,
     keywords    = {persistence, pointers},
+    keywords    = {persistence, smart pointers},
     contributer = {pabuhr@plg},
     author      = {Daniel R. Edelson},
 …
     year        = 1992,
     pages       = {1-19},
+}
+@incollection{smartpointers,
+    keywords    = {smart pointers},
+    contributer = {pabuhr@plg},
+    author      = {Andrei Alexandrescu},
+    title       = {Smart Pointers},
+    booktitle   = {Modern C++ Design: Generic Programming and Design Patterns Applied},
+    publisher   = {Addison-Wesley},
+    year        = 2001,
+    chapter     = 7,
+    optpages    = {?-?},
+}
 …
+}
+@misc{vistorpattern,
+    keywords    = {visitor pattern},
+    contributer = {pabuhr@plg},
+    key         = {visitor pattern},
+    title       = {Visitor pattern},
+    year        = 2020,
+    note        = {WikipediA},
+    howpublished= {\href{https://en.wikipedia.org/wiki/Visitor\_pattern}
+                  {https://\-en.wikipedia.org/\-wiki/\-Visitor\_pattern}},
+}
 % W

doc/theses/andrew_beach_MMath/.gitignore

rb6a8b31	rd95969a
3	3
4	4	# Final Files:
5		~~thesis~~.pdf
	5	*.pdf
6	6
7	7	# The Makefile here is not generated.

doc/theses/andrew_beach_MMath/Makefile

-              rb6a8b31
+              rd95969a
 ### Makefile for Andrew Beach's Masters Thesis
 DOC=thesis.pdf
+DOC=uw-ethesis.pdf
 BUILD=out
 TEXSRC=$(wildcard *.tex)
 …
 STYSRC=$(wildcard *.sty)
 CLSSRC=$(wildcard *.cls)
 TEXLIB= .:${BUILD}:
+TEXLIB= .:../../LaTeXmacros:${BUILD}:
 BIBLIB= .:../../bibliography
 …
         ${LATEX} ${BASE}
         ${BIBTEX} ${BUILD}/${BASE}
+        ${LATEX} ${BASE}
         ${GLOSSARY} ${BUILD}/${BASE}
         ${LATEX} ${BASE}

doc/theses/andrew_beach_MMath/existing.tex

-              rb6a8b31
+              rd95969a
+\chapter{\CFA{} Existing Features}
+\section{Overloading and extern}
+Cforall has overloading, allowing multiple definitions of the same name to
+be defined.
+This also adds name mangling so that the assembly symbols are unique for
+different overloads. For compatibility with names in C there is also a
+syntax to disable the name mangling. These unmangled names cannot be overloaded
+but act as the interface between C and \CFA code.
+The syntax for disabling mangling is:
+\begin{lstlisting}
+extern "C" {
+    ...
+}
+\end{lstlisting}
+To re-enable mangling once it is disabled the syntax is:
+\begin{lstlisting}
+extern "Cforall" {
+    ...
+}
+\end{lstlisting}
+Both should occur at the declaration level and affect all the declarations
+in \texttt{...}. Neither care about the state of mangling when they begin
+and will return to that state after the group is finished. So re-enabling
+is only used to nest areas of mangled and unmangled declarations.
+\section{References}
+\CFA adds references to C. These are auto-dereferencing pointers and use the
+same syntax as pointers except they use ampersand (\codeCFA{\&}) instead of
+the asterisk (\codeCFA{*}). They can also be constant or mutable, if they
+are mutable they may be assigned to by using the address-of operator
+(\codeCFA\&) which converts them into a pointer.
+\chapter{\texorpdfstring{\CFA Existing Features}{Cforall Existing Features}}
+\CFA (C-for-all)~\cite{Cforall} is an open-source project extending ISO C with
+modern safety and productivity features, while still ensuring backwards
+compatibility with C and its programmers.  \CFA is designed to have an
+orthogonal feature-set based closely on the C programming paradigm
+(non-object-oriented) and these features can be added incrementally to an
+existing C code-base allowing programmers to learn \CFA on an as-needed basis.
+Only those \CFA features pertinent to this thesis are discussed.  Many of the
+\CFA syntactic and semantic features used in the thesis should be fairly
+obvious to the reader.
+\section{\texorpdfstring{Overloading and \lstinline|extern|}{Overloading and extern}}
+\CFA has extensive overloading, allowing multiple definitions of the same name
+to be defined~\cite{Moss18}.
+\begin{cfa}
+char i; int i; double i;                        $\C[3.75in]{// variable overload}$
+int f(); double f();                            $\C{// return overload}$
+void g( int ); void g( double );        $\C{// parameter overload}\CRT$
+\end{cfa}
+This feature requires name mangling so the assembly symbols are unique for
+different overloads. For compatibility with names in C, there is also a syntax
+to disable name mangling. These unmangled names cannot be overloaded but act as
+the interface between C and \CFA code.  The syntax for disabling/enabling
+mangling is:
+\begin{cfa}
+// name mangling
+int i; // _X1ii_1
+@extern "C"@ {  // no name mangling
+        int j; // j
+        @extern "Cforall"@ {  // name mangling
+                int k; // _X1ki_1
+        }
+        // no name mangling
+}
+// name mangling
+\end{cfa}
+Both forms of @extern@ affect all the declarations within their nested lexical
+scope and transition back to the previous mangling state when the lexical scope
+ends.
+\section{Reference Type}
+\CFA adds a rebindable reference type to C, but more expressive than the \CC
+reference.  Multi-level references are allowed and act like auto-dereferenced
+pointers using the ampersand (@&@) instead of the pointer asterisk (@*@). \CFA
+references may also be mutable or non-mutable. If mutable, a reference variable
+may be assigned to using the address-of operator (@&@), which converts the
+reference to a pointer.
+\begin{cfa}
+int i, j;
+int @&@ ri = i, @&&@ rri = ri;
+rri = 3;  // auto-dereference assign to i
+@&@ri = @&@j; // rebindable
+ri = 5;   // assign to j
+\end{cfa}
 \section{Constructors and Destructors}
 Both constructors and destructors are operators, which means they are just
+functions with special names. The special names are used to define them and
+may be used to call the functions explicitly. The \CFA special names are
+constructed by taking the tokens in the operators and putting \texttt{?} where
+the arguments would go. So multiplication is \texttt{?*?} while dereference
+is \texttt{*?}. This also makes it easy to tell the difference between
+pre-fix operations (such as \texttt{++?}) and post-fix operations
+(\texttt{?++}).
+The special name for constructors is \texttt{?\{\}}, which comes from the
+initialization syntax in C. The special name for destructors is
+\texttt{\^{}?\{\}}. % I don't like the \^{} symbol but $^\wedge$ isn't better.
+Any time a type T goes out of scope the destructor matching
+\codeCFA{void ^?\{\}(T \&);} is called. In theory this is also true of
+primitive types such as \codeCFA{int}, but in practice those are no-ops and
+are usually omitted for optimization.
+functions with special operator names rather than type names in \CC. The
+special operator names may be used to call the functions explicitly (not
+allowed in \CC for constructors).
+In general, operator names in \CFA are constructed by bracketing an operator
+token with @?@, which indicates where the arguments go. For example, infixed
+multiplication is @?*?@ while prefix dereference is @*?@. This syntax makes it
+easy to tell the difference between prefix operations (such as @++?@) and
+post-fix operations (@?++@).
+The special name for a constructor is @?{}@, which comes from the
+initialization syntax in C. The special name for a destructor is @^?{}@, where
+the @^@ has no special meaning.
+% I don't like the \^{} symbol but $^\wedge$ isn't better.
+\begin{cfa}
+struct T { ... };
+void ?@{}@(@T &@ this, ...) { ... }  // constructor
+void ^?@{}@(@T &@ this, ...) { ... } // destructor
+{
+        T s = @{@ ... @}@;  // same constructor/initialization braces
+} // destructor call automatically generated
+\end{cfa}
+The first parameter is a reference parameter to the type for the
+constructor/destructor. Destructors may have multiple parameters.  The compiler
+implicitly matches an overloaded constructor @void ?{}(T &, ...);@ to an
+object declaration with associated initialization, and generates a construction
+call after the object is allocated. When an object goes out of scope, the
+matching overloaded destructor @void ^?{}(T &);@ is called.  Without explicit
+definition, \CFA creates a default and copy constructor, destructor and
+assignment (like \CC). It is possible to define constructors/destructors for
+basic and existing types.
 \section{Polymorphism}
+\CFA uses polymorphism to create functions and types that are defined over
+different types. \CFA polymorphic declarations serve the same role as \CPP
+templates or Java generics.
+Polymorphic declarations start with a forall clause that goes before the
+standard (monomorphic) declaration. These declarations have the same syntax
+except that you may use the names introduced by the forall clause in them.
+Forall clauses are written \codeCFA{forall( ... )} where \codeCFA{...} becomes
+the list of polymorphic variables (local type names) and assertions, which
+represent required operations on those types.
+\begin{lstlisting}
+forall(dtype T | { void do_once(T &); })
+void do_twice(T & value) {
+    do_once(value);
+    do_once(value);
+}
+\end{lstlisting}
+A polymorphic function can be used in the same way normal functions are.
+The polymorphic variables are filled in with concrete types and the
+assertions are checked. An assertion is checked by seeing if that name of that
+type (with all the variables replaced with the concrete types) is defined at
+the call site.
+As an example, even if no function named \codeCFA{do_once} is defined
+near the definition of \codeCFA{do_twice} the following code will work.
+\begin{lstlisting}
+\CFA uses parametric polymorphism to create functions and types that are
+defined over multiple types. \CFA polymorphic declarations serve the same role
+as \CC templates or Java generics. The ``parametric'' means the polymorphism is
+accomplished by passing argument operations to associate \emph{parameters} at
+the call site, and these parameters are used in the function to differentiate
+among the types the function operates on.
+Polymorphic declarations start with a universal @forall@ clause that goes
+before the standard (monomorphic) declaration. These declarations have the same
+syntax except they may use the universal type names introduced by the @forall@
+clause.  For example, the following is a polymorphic identity function that
+works on any type @T@:
+\begin{cfa}
+@forall( T )@ @T@ identity( @T@ val ) { return val; }
+int forty_two = identity( 42 ); // T bound to int, forty_two == 42
+\end{cfa}
+To allow a polymorphic function to be separately compiled, the type @T@ must be
+constrained by the operations used on @T@ in the function body. The @forall@
+clause is augmented with a list of polymorphic variables (local type names)
+and assertions (constraints), which represent the required operations on those
+types used in a function, \eg:
+\begin{cfa}
+forall( T @| { void do_once(T); }@) // assertion
+void do_twice(T value) {
+        do_once(value);
+        do_once(value);
+}
+void do_once(int i) { ... }  // provide assertion
+int i;
+do_twice(i); // implicitly pass assertion do_once to do_twice
+\end{cfa}
+Any object with a type fulfilling the assertion may be passed as an argument to
+a @do_twice@ call.
+A polymorphic function can be used in the same way as a normal function.  The
+polymorphic variables are filled in with concrete types and the assertions are
+checked. An assertion is checked by verifying each assertion operation (with
+all the variables replaced with the concrete types from the arguments) is
+defined at a call site.
+Note, a function named @do_once@ is not required in the scope of @do_twice@ to
+compile it, unlike \CC template expansion. Furthermore, call-site inferencing
+allows local replacement of the most specific parametric functions needed for a
+call.
+\begin{cfa}
+void do_once(double y) { ... } // global
 int quadruple(int x) {
+    void do_once(int & y) {
+        y = y * 2;
+    }
+    do_twice(x);
+    return x;
+}
+\end{lstlisting}
+This is not the recommended way to implement a quadruple function but it
+does work. The compiler will deduce that \codeCFA{do_twice}'s T is an
+integer from the argument. It will then look for a definition matching the
+assertion which is the \codeCFA{do_once} defined within the function. That
+function will be passed in as a function pointer to \codeCFA{do_twice} and
+called within it.
+To avoid typing out long lists of assertions again and again there are also
+traits which collect assertions into convenient packages that can then be used
+in assertion lists instead of all of their components.
+\begin{lstlisting}
+trait done_once(dtype T) {
+    void do_once(T &);
+}
+\end{lstlisting}
+After this the forall list in the previous example could instead be written
+with the trait instead of the assertion itself.
+\begin{lstlisting}
+forall(dtype T | done_once(T))
+\end{lstlisting}
+Traits can have arbitrary number of assertions in them and are usually used to
+create short hands for, and give descriptive names to, common groupings of
+assertions.
+Polymorphic structures and unions may also be defined by putting a forall
+clause before the declaration. The type variables work the same way except
+are now used in field declarations instead of parameters and local variables.
+\begin{lstlisting}
+        void do_once(int y) { y = y * 2; } // local
+        do_twice(x); // using local "do_once"
+        return x;
+}
+\end{cfa}
+Specifically, the compiler deduces that @do_twice@'s T is an integer from the
+argument @x@. It then looks for the most specific definition matching the
+assertion, which is the nested integral @do_once@ defined within the
+function. The matched assertion function is then passed as a function pointer
+to @do_twice@ and called within it.
+To avoid typing long lists of assertions, constraints can be collected into
+convenient packages called a @trait@, which can then be used in an assertion
+instead of the individual constraints.
+\begin{cfa}
+trait done_once(T) {
+        void do_once(T);
+}
+\end{cfa}
+and the @forall@ list in the previous example is replaced with the trait.
+\begin{cfa}
+forall(dtype T | @done_once(T)@)
+\end{cfa}
+In general, a trait can contain an arbitrary number of assertions, both
+functions and variables, and are usually used to create a shorthand for, and
+give descriptive names to, common groupings of assertions describing a certain
+functionality, like @sumable@, @listable@, \etc.
+Polymorphic structures and unions are defined by qualifying the aggregate type
+with @forall@. The type variables work the same except they are used in field
+declarations instead of parameters, returns, and local variable declarations.
+\begin{cfa}
 forall(dtype T)
 struct node {
+    node(T) * next;
+    T * data;
+}
+\end{lstlisting}
+The \codeCFA{node(T)} is a use of a polymorphic structure. Polymorphic types
+must be provided their polymorphic parameters.
+There are many other features of polymorphism that have not been given here but
+these are the ones used by the exception system.
+        node(T) * next;  // generic linked node
+        T * data;
+}
+\end{cfa}
+The generic type @node(T)@ is an example of a polymorphic-type usage.  Like \CC
+templates usage, a polymorphic-type usage must specify a type parameter.
+There are many other polymorphism features in \CFA but these are the ones used
+by the exception system.
 \section{Concurrency}
+\CFA has a number of concurrency features, \codeCFA{thread}s,
+\codeCFA{monitor}s and \codeCFA{mutex} parameters, \codeCFA{coroutine}s and
+\codeCFA{generator}s. The two features that interact with the exception system
+are \codeCFA{thread}s and \codeCFA{coroutine}s; they and their supporting
+constructs will be described here.
+\subsection{Coroutines}
+Coroutines are routines that do not have to finish execution to hand control
+back to their caller, instead they may suspend their execution at any time
+and resume it later.
+Coroutines are not true concurrency but share some similarities and many of
+the same underpinnings and so are included as part of the \CFA threading
+library.
+In \CFA coroutines are created using the \codeCFA{coroutine} keyword which
+works just like \codeCFA{struct} except that the created structure will be
+modified by the compiler to satisfy the \codeCFA{is_coroutine} trait.
+These structures act as the interface between callers and the coroutine,
+the fields are used to pass information in and out. Here is a simple example
+where the single field is used to pass the next number in a sequence out.
+\begin{lstlisting}
+\CFA has a number of concurrency features: @thread@, @monitor@, @mutex@
+parameters, @coroutine@ and @generator@. The two features that interact with
+the exception system are @thread@ and @coroutine@; they and their supporting
+constructs are described here.
+\subsection{Coroutine}
+A coroutine is a type with associated functions, where the functions are not
+required to finish execution when control is handed back to the caller. Instead
+they may suspend execution at any time and be resumed later at the point of
+last suspension. (Generators are stackless and coroutines are stackful.) These
+types are not concurrent but share some similarities along with common
+underpinnings, so they are combined with the \CFA threading library. Further
+discussion in this section only refers to the coroutine because generators are
+similar.
+In \CFA, a coroutine is created using the @coroutine@ keyword, which is an
+aggregate type like @struct,@ except the structure is implicitly modified by
+the compiler to satisfy the @is_coroutine@ trait; hence, a coroutine is
+restricted by the type system to types that provide this special trait.  The
+coroutine structure acts as the interface between callers and the coroutine,
+and its fields are used to pass information in and out of coroutine interface
+functions.
+Here is a simple example where a single field is used to pass (communicate) the
+next number in a sequence.
+\begin{cfa}
 coroutine CountUp {
+    unsigned int next;
+}
+\end{lstlisting}
+The routine part of the coroutine is a main function for the coroutine. It
+takes a reference to a coroutine object and returns nothing. In this function,
+and any functions called by this function, the suspend statement may be used
+to return execution to the coroutine's caller. When control returns to the
+function it continues from that same suspend statement instead of at the top
+of the function.
+\begin{lstlisting}
+void main(CountUp & this) {
+    unsigned int next = 0;
+    while (true) {
+        this.next = next;
+        suspend;
+        next = next + 1;
+    }
+}
+\end{lstlisting}
+Control is passed to the coroutine with the resume function. This includes the
+first time when the coroutine is starting up. The resume function takes a
+reference to the coroutine structure and returns the same reference. The
+return value is for easy access to communication variables. For example the
+next value from a count-up can be generated and collected in a single
+expression: \codeCFA{resume(count).next}.
+        unsigned int next; // communication variable
+}
+CountUp countup;
+\end{cfa}
+Each coroutine has a @main@ function, which takes a reference to a coroutine
+object and returns @void@.
+\begin{cfa}[numbers=left]
+void main(@CountUp & this@) { // argument matches trait is_coroutine
+        unsigned int up = 0;  // retained between calls
+        while (true) {
+                next = up; // make "up" available outside function
+                @suspend;@$\label{suspend}$
+                up += 1;
+        }
+}
+\end{cfa}
+In this function, or functions called by this function (helper functions), the
+@suspend@ statement is used to return execution to the coroutine's caller
+without terminating the coroutine.
+A coroutine is resumed by calling the @resume@ function, \eg @resume(countup)@.
+The first resume calls the @main@ function at the top. Thereafter, resume calls
+continue a coroutine in the last suspended function after the @suspend@
+statement, in this case @main@ line~\ref{suspend}.  The @resume@ function takes
+a reference to the coroutine structure and returns the same reference. The
+return value allows easy access to communication variables defined in the
+coroutine object. For example, the @next@ value for coroutine object @countup@
+is both generated and collected in the single expression:
+@resume(countup).next@.
 \subsection{Monitors and Mutex}
+True concurrency does not guarantee ordering. To get some of that ordering back
+\CFA uses monitors and mutex (mutual exclusion) parameters. A monitor is
+another special declaration that contains a lock and is compatible with mutex
+parameters.
+Function parameters can have the \codeCFA{mutex} qualifiers on reference
+arguments, for example \codeCFA{void example(a_monitor & mutex arg);}. When the
+function is called it will acquire the lock on all of the mutex parameters.
+This means that all functions that mutex on a type are part of a critical
+section and only one will ever run at a time.
+Concurrency does not guarantee ordering; without ordering results are
+non-deterministic. To claw back ordering, \CFA uses monitors and @mutex@
+(mutual exclusion) parameters. A monitor is another kind of aggregate, where
+the compiler implicitly inserts a lock and instances are compatible with
+@mutex@ parameters.
+A function that requires deterministic (ordered) execution, acquires mutual
+exclusion on a monitor object by qualifying an object reference parameter with
+@mutex@.
+\begin{cfa}
+void example(MonitorA & @mutex@ argA, MonitorB & @mutex@ argB);
+\end{cfa}
+When the function is called, it implicitly acquires the monitor lock for all of
+the mutex parameters without deadlock.  This semantics means all functions with
+the same mutex type(s) are part of a critical section for objects of that type
+and only one runs at a time.
 \subsection{Threads}
+While coroutines allow new things to be done with a single execution path
+threads actually introduce new paths of execution that continue independently.
+Now for threads to work together there must be some communication between them
+and that means the timing of certain operations does have to be known. There
+are various means of synchronization and mutual exclusion provided by \CFA but
+for exceptions only the basic two -- fork and join -- are needed.
+Threads are created like coroutines except the keyword is changed:
+\begin{lstlisting}
+Functions, generators, and coroutines are sequential so there is only a single
+(but potentially sophisticated) execution path in a program. Threads introduce
+multiple execution paths that continue independently.
+For threads to work safely with objects, mutual exclusion is required using
+monitors and mutex parameters. For threads to work safely with other threads,
+mutual exclusion is also required in the form of a communication rendezvous,
+which also supports internal synchronization as for mutex objects. For
+exceptions, only the two basic operations are important: thread fork and join.
+Threads are created like coroutines with an associated @main@ function:
+\begin{cfa}
 thread StringWorker {
     const char * input;
     int result;
+        const char * input;
+        int result;
 };
 void main(StringWorker & this) {
+    const char * localCopy = this.input;
+    // ... do some work, perhaps hashing the string ...
+    this.result = result;
+}
+\end{lstlisting}
+The main function will start executing after the fork operation and continue
+executing until it is finished. If another thread joins with this one it will
+wait until main has completed execution. In other words everything the thread
+does is between fork and join.
+From the outside this is the creation and destruction of the thread object.
+Fork happens after the constructor is run and join happens before the
+destructor runs. Join also happens during the \codeCFA{join} function which
+can be used to join a thread earlier. If it is used the destructor does not
+join as that has already been completed.
+        const char * localCopy = this.input;
+        // ... do some work, perhaps hashing the string ...
+        this.result = result;
+}
+{
+        StringWorker stringworker; // fork thread running in "main"
+} // implicitly join with thread $\(\Rightarrow\)$ wait for completion
+\end{cfa}
+The thread main is where a new thread starts execution after a fork operation
+and then the thread continues executing until it is finished. If another thread
+joins with an executing thread, it waits until the executing main completes
+execution. In other words, everything a thread does is between a fork and join.
+From the outside, this behaviour is accomplished through creation and
+destruction of a thread object.  Implicitly, fork happens after a thread
+object's constructor is run and join happens before the destructor runs. Join
+can also be specified explicitly using the @join@ function to wait for a
+thread's completion independently from its deallocation (\ie destructor
+call). If @join@ is called explicitly, the destructor does not implicitly join.

doc/theses/andrew_beach_MMath/features.tex

-              rb6a8b31
+              rd95969a
+\chapter{Features}
+This chapter covers the design and user interface of the \CFA exception
+handling mechanism.
+\section{Virtual Casts}
+Virtual casts and virtual types are not truly part of the exception system but
+they did not exist in \CFA and are useful in exceptions. So a minimal version
+of the virtual system was designed and implemented.
+Virtual types are organized in simple hierarchies. Each virtual type may have
+a parent and can have any number of children. A type's descendants are its
+children and its children's descendants. A type may not be its own descendant.
+Each virtual type has an associated virtual table type. A virtual table is a
+structure that has fields for all the virtual members of a type. A virtual
+type has all the virtual members of its parent and can add more. It may also
+update the values of the virtual members.
+Except for virtual casts, this is only used internally in the exception
+system. There is no general purpose interface for the other features. A
+virtual cast has the following syntax:
+\begin{lstlisting}
+\chapter{Exception Features}
+This chapter covers the design and user interface of the \CFA
+exception-handling mechanism.
+\section{Virtuals}
+Virtual types and casts are not required for a basic exception-system but are
+useful for advanced exception features. However, \CFA is not object-oriented so
+there is no obvious concept of virtuals.  Hence, to create advanced exception
+features for this work, I needed to design and implement a virtual-like
+system for \CFA.
+Object-oriented languages often organize exceptions into a simple hierarchy,
+\eg Java.
+\begin{center}
+\setlength{\unitlength}{4000sp}%
+\begin{picture}(1605,612)(2011,-1951)
+\put(2100,-1411){\vector(1, 0){225}}
+\put(3450,-1411){\vector(1, 0){225}}
+\put(3550,-1411){\line(0,-1){225}}
+\put(3550,-1636){\vector(1, 0){150}}
+\put(3550,-1636){\line(0,-1){225}}
+\put(3550,-1861){\vector(1, 0){150}}
+\put(2025,-1490){\makebox(0,0)[rb]{\LstBasicStyle{exception}}}
+\put(2400,-1460){\makebox(0,0)[lb]{\LstBasicStyle{arithmetic}}}
+\put(3750,-1460){\makebox(0,0)[lb]{\LstBasicStyle{underflow}}}
+\put(3750,-1690){\makebox(0,0)[lb]{\LstBasicStyle{overflow}}}
+\put(3750,-1920){\makebox(0,0)[lb]{\LstBasicStyle{zerodivide}}}
+\end{picture}%
+\end{center}
+The hierarchy provides the ability to handle an exception at different degrees
+of specificity (left to right).  Hence, it is possible to catch a more general
+exception-type in higher-level code where the implementation details are
+unknown, which reduces tight coupling to the lower-level implementation.
+Otherwise, low-level code changes require higher-level code changes, \eg,
+changing from raising @underflow@ to @overflow@ at the low level means changing
+the matching catch at the high level versus catching the general @arithmetic@
+exception. In detail, each virtual type may have a parent and can have any
+number of children. A type's descendants are its children and its children's
+descendants. A type may not be its own descendant.
+The exception hierarchy allows a handler (@catch@ clause) to match multiple
+exceptions, \eg a base-type handler catches both base and derived
+exception-types.
+\begin{cfa}
+try {
+        ...
+} catch(arithmetic &) {
+        ... // handle arithmetic, underflow, overflow, zerodivide
+}
+\end{cfa}
+Most exception mechanisms perform a linear search of the handlers and select
+the first matching handler, so the order of handlers is now important because
+matching is many to one.
+Each virtual type needs an associated virtual table. A virtual table is a
+structure with fields for all the virtual members of a type. A virtual type has
+all the virtual members of its parent and can add more. It may also update the
+values of the virtual members and often does.
+While much of the virtual infrastructure is created, it is currently only used
+internally for exception handling. The only user-level feature is the virtual
+cast, which is the same as the \CC \lstinline[language=C++]|dynamic_cast|.
+\begin{cfa}
 (virtual TYPE)EXPRESSION
+\end{lstlisting}
+This has the same precedence as a traditional C-cast and can be used in the
+same places. This will convert the result of EXPRESSION to the type TYPE. Both
+the type of EXPRESSION and TYPE must be pointers to virtual types.
+The cast is checked and will either return the original value or null, based
+on the result of the check. The check is does the object pointed at have a
+type that is a descendant of the target type. If it is the result is the
+pointer, otherwise the result is null.
+\section{Exceptions}
+\end{cfa}
+Note, the syntax and semantics matches a C-cast, rather than the unusual \CC
+syntax for special casts. Both the type of @EXPRESSION@ and @TYPE@ must be a
+pointer to a virtual type. The cast dynamically checks if the @EXPRESSION@ type
+is the same or a subtype of @TYPE@, and if true, returns a pointer to the
+@EXPRESSION@ object, otherwise it returns @0p@ (null pointer).
+\section{Exception}
 % Leaving until later, hopefully it can talk about actual syntax instead
 % of my many strange macros. Syntax aside I will also have to talk about the
 % features all exceptions support.
+\section{Termination}
+Termination exception throws are likely the most familiar kind, as they are
+used in several popular programming languages. A termination will throw an
+exception, search the stack for a handler, unwind the stack to where the
+handler is defined, execute the handler and then continue execution after
+the handler. They are used when execution cannot continue here.
+Termination has two pieces of syntax it uses. The first is the throw:
+\begin{lstlisting}
+Exceptions are defined by the trait system; there are a series of traits, and
+if a type satisfies them, then it can be used as an exception.  The following
+is the base trait all exceptions need to match.
+\begin{cfa}
+trait is_exception(exceptT &, virtualT &) {
+        virtualT const & @get_exception_vtable@(exceptT *);
+};
+\end{cfa}
+The function takes any pointer, including the null pointer, and returns a
+reference to the virtual-table object. Defining this function also establishes
+the virtual type and a virtual-table pair to the \CFA type-resolver and
+promises @exceptT@ is a virtual type and a child of the base exception-type.
+{\color{blue} PAB: I do not understand this paragraph.}
+One odd thing about @get_exception_vtable@ is that it should always be a
+constant function, returning the same value regardless of its argument.  A
+pointer or reference to the virtual table instance could be used instead,
+however using a function has some ease of implementation advantages and allows
+for easier disambiguation because the virtual type name (or the address of an
+instance that is in scope) can be used instead of the mangled virtual table
+name.  Also note the use of the word ``promise'' in the trait
+description. Currently, \CFA cannot check to see if either @exceptT@ or
+@virtualT@ match the layout requirements. This is considered part of
+@get_exception_vtable@'s correct implementation.
+\section{Raise}
+\CFA provides two kinds of exception raise: termination (see
+\VRef{s:Termination}) and resumption (see \VRef{s:Resumption}), which are
+specified with the following traits.
+\begin{cfa}
+trait is_termination_exception(
+                exceptT &, virtualT & | is_exception(exceptT, virtualT)) {
+        void @defaultTerminationHandler@(exceptT &);
+};
+\end{cfa}
+The function is required to allow a termination raise, but is only called if a
+termination raise does not find an appropriate handler.
+Allowing a resumption raise is similar.
+\begin{cfa}
+trait is_resumption_exception(
+                exceptT &, virtualT & | is_exception(exceptT, virtualT)) {
+        void @defaultResumptionHandler@(exceptT &);
+};
+\end{cfa}
+The function is required to allow a resumption raise, but is only called if a
+resumption raise does not find an appropriate handler.
+Finally there are three convenience macros for referring to these traits:
+@IS_EXCEPTION@, @IS_TERMINATION_EXCEPTION@ and @IS_RESUMPTION_EXCEPTION@.  Each
+takes the virtual type's name, and for polymorphic types only, the
+parenthesized list of polymorphic arguments. These macros do the name mangling
+to get the virtual-table name and provide the arguments to both sides
+{\color{blue}(PAB: What's a ``side''?)}
+\subsection{Termination}
+\label{s:Termination}
+Termination raise, called ``throw'', is familiar and used in most programming
+languages with exception handling. The semantics of termination is: search the
+stack for a matching handler, unwind the stack frames to the matching handler,
+execute the handler, and continue execution after the handler. Termination is
+used when execution \emph{cannot} return to the throw. To continue execution,
+the program must \emph{recover} in the handler from the failed (unwound)
+execution at the raise to safely proceed after the handler.
+A termination raise is started with the @throw@ statement:
+\begin{cfa}
 throw EXPRESSION;
+\end{lstlisting}
+The expression must evaluate to a reference to a termination exception. A
+termination exception is any exception with a
+\codeCFA{void defaultTerminationHandler(T &);} (the default handler) defined
+on it. The handler is taken from the call site with \CFA's trait system and
+passed into the exception system along with the exception itself.
+The exception passed into the system is then copied into managed memory.
+This is to ensure it remains in scope during unwinding. It is the user's
+responsibility to make sure the original exception is freed when it goes out
+of scope. Being allocated on the stack is sufficient for this.
+Then the exception system will search the stack starting from the throw and
+proceeding towards the base of the stack, from callee to caller. As it goes
+it will check any termination handlers it finds:
+\begin{lstlisting}
+try {
+    TRY_BLOCK
+} catch (EXCEPTION_TYPE * NAME) {
+    HANDLER
+}
+\end{lstlisting}
+This shows a try statement with a single termination handler. The statements
+in TRY\_BLOCK will be executed when control reaches this statement. While
+those statements are being executed if a termination exception is thrown and
+it is not handled by a try statement further up the stack the EHM will check
+all of the termination handlers attached to the try block, top to bottom.
+At each handler the EHM will check to see if the thrown exception is a
+descendant of EXCEPTION\_TYPE. If it is the pointer to the exception is
+bound to NAME and the statements in HANDLER are executed. If control reaches
+the end of the handler then it exits the block, the exception is freed and
+control continues after the try statement.
+The default handler is only used if no handler for the exception is found
+after the entire stack is searched. When that happens the default handler
+is called with a reference to the exception as its only argument. If the
+handler returns control continues from after the throw statement.
+\paragraph{Conditional Catches}
+Catch clauses may also be written as:
+\begin{lstlisting}
+catch (EXCEPTION_TYPE * NAME ; CONDITION)
+\end{lstlisting}
+This has the same behaviour as a regular catch clause except that if the
+exception matches the given type the condition is also run. If the result is
+true only then is this considered a matching handler. If the result is false
+then the handler does not match and the search continues with the next clause
+in the try block.
+The condition considers all names in scope at the beginning of the try block
+to be in scope along with the name introduced in the catch clause itself.
+\paragraph{Re-Throwing}
+You can also rethrow the most recent termination exception with
+\codeCFA{throw;}. % This is terrible and you should never do it.
+This can be done in a handler or any function that could be called from a
+handler.
+This will start another termination throw reusing the exception, meaning it
+does not copy the exception or allocate any more memory for it. However the
+default handler is still at the original throw and could refer to data that
+was on the unwound section of the stack. So instead a new default handler that
+does a program level abort is used.
+\section{Resumption}
+Resumption exceptions are less popular than termination but in many
+regards are simpler and easier to understand. A resumption throws an exception,
+searches for a handler on the stack, executes that handler on top of the stack
+and then continues execution from the throw. These are used when a problem
+needs to be fixed before execution continues.
+A resumption is thrown with a throw resume statement:
+\begin{lstlisting}
+\end{cfa}
+The expression must return a termination-exception reference, where the
+termination exception has a type with a @void defaultTerminationHandler(T &)@
+(default handler) defined. The handler is found at the call site using \CFA's
+trait system and passed into the exception system along with the exception
+itself.
+At runtime, a representation of the exception type and an instance of the
+exception type is copied into managed memory (heap) to ensure it remains in
+scope during unwinding. It is the user's responsibility to ensure the original
+exception object at the throw is freed when it goes out of scope. Being
+allocated on the stack is sufficient for this.
+Then the exception system searches the stack starting from the throw and
+proceeding towards the base of the stack, from callee to caller. At each stack
+frame, a check is made for termination handlers defined by the @catch@ clauses
+of a @try@ statement.
+\begin{cfa}
+try {
+        GUARDED_BLOCK
+} @catch (EXCEPTION_TYPE$\(_1\)$ * NAME)@ { // termination handler 1
+        HANDLER_BLOCK$\(_1\)$
+} @catch (EXCEPTION_TYPE$\(_2\)$ * NAME)@ { // termination handler 2
+        HANDLER_BLOCK$\(_2\)$
+}
+\end{cfa}
+The statements in the @GUARDED_BLOCK@ are executed. If those statements, or any
+functions invoked from those statements, throws an exception, and the exception
+is not handled by a try statement further up the stack, the termination
+handlers are searched for a matching exception type from top to bottom.
+Exception matching checks the representation of the thrown exception-type is
+the same or a descendant type of the exception types in the handler clauses. If
+there is a match, a pointer to the exception object created at the throw is
+bound to @NAME@ and the statements in the associated @HANDLER_BLOCK@ are
+executed. If control reaches the end of the handler, the exception is freed,
+and control continues after the try statement.
+The default handler visible at the throw statement is used if no matching
+termination handler is found after the entire stack is searched. At that point,
+the default handler is called with a reference to the exception object
+generated at the throw. If the default handler returns, the system default
+action is executed, which often terminates the program. This feature allows
+each exception type to define its own action, such as printing an informative
+error message, when an exception is not handled in the program.
+\subsection{Resumption}
+\label{s:Resumption}
+Resumption raise, called ``resume'', is as old as termination
+raise~\cite{Goodenough75} but is less popular. In many ways, resumption is
+simpler and easier to understand, as it is simply a dynamic call (as in
+Lisp). The semantics of resumption is: search the stack for a matching handler,
+execute the handler, and continue execution after the resume. Notice, the stack
+cannot be unwound because execution returns to the raise point. Resumption is
+used when execution \emph{can} return to the resume. To continue
+execution, the program must \emph{correct} in the handler for the failed
+execution at the raise so execution can safely continue after the resume.
+A resumption raise is started with the @throwResume@ statement:
+\begin{cfa}
 throwResume EXPRESSION;
+\end{lstlisting}
+The result of EXPRESSION must be a resumption exception type. A resumption
+exception type is any type that satisfies the assertion
+\codeCFA{void defaultResumptionHandler(T &);} (the default handler). When the
+statement is executed the expression is evaluated and the result is thrown.
+Handlers are declared using clauses in try statements:
+\begin{lstlisting}
+try {
+    TRY_BLOCK
+} catchResume (EXCEPTION_TYPE * NAME) {
+    HANDLER
+}
+\end{lstlisting}
+This is a simple example with the try block and a single resumption handler.
+Multiple resumption handlers can be put in a try statement and they can be
+mixed with termination handlers.
+When a resumption begins it will start searching the stack starting from
+the throw statement and working its way to the callers. In each try statement
+handlers will be tried top to bottom. Each handler is checked by seeing if
+the thrown exception is a descendant of EXCEPTION\_TYPE. If not the search
+continues. Otherwise NAME is bound to a pointer to the exception and the
+HANDLER statements are executed. After they are finished executing control
+continues from the throw statement.
+If no appropriate handler is found then the default handler is called. The
+throw statement acts as a regular function call passing the exception to
+the default handler and after the handler finishes executing control continues
+from the throw statement.
+The exception system also tracks the position of a search on the stack. If
+another resumption exception is thrown while a resumption handler is running
+it will first check handlers pushed to the stack by the handler and any
+functions it called, then it will continue from the try statement that the
+handler is a part of; except for the default handler where it continues from
+the throw the default handler was passed to.
+This makes the search pattern for resumption reflect the one for termination,
+which is what most users expect.
+% This might need a diagram. But it is an important part of the justification
+\end{cfa}
+The semantics of the @throwResume@ statement are like the @throw@, but the
+expression has a type with a @void defaultResumptionHandler(T &)@ (default
+handler) defined, where the handler is found at the call site by the type
+system.  At runtime, a representation of the exception type and an instance of
+the exception type is \emph{not} copied because the stack is maintained during
+the handler search.
+Then the exception system searches the stack starting from the resume and
+proceeding towards the base of the stack, from callee to caller. At each stack
+frame, a check is made for resumption handlers defined by the @catchResume@
+clauses of a @try@ statement.
+\begin{cfa}
+try {
+        GUARDED_BLOCK
+} @catchResume (EXCEPTION_TYPE$\(_1\)$ * NAME)@ { // resumption handler 1
+        HANDLER_BLOCK$\(_1\)$
+} @catchResume (EXCEPTION_TYPE$\(_2\)$ * NAME)@ { // resumption handler 2
+        HANDLER_BLOCK$\(_2\)$
+}
+\end{cfa}
+The statements in the @GUARDED_BLOCK@ are executed. If those statements, or any
+functions invoked from those statements, resumes an exception, and the
+exception is not handled by a try statement further up the stack, the
+resumption handlers are searched for a matching exception type from top to
+bottom. (Note, termination and resumption handlers may be intermixed in a @try@
+statement but the kind of raise (throw/resume) only matches with the
+corresponding kind of handler clause.)
+The exception search and matching for resumption is the same as for
+termination, including exception inheritance. The difference is when control
+reaches the end of the handler: the resumption handler returns after the resume
+rather than after the try statement. The resume point assumes the handler has
+corrected the problem so execution can safely continue.
+Like termination, if no resumption handler is found, the default handler
+visible at the resume statement is called, and the system default action is
+executed.
+For resumption, the exception system uses stack marking to partition the
+resumption search. If another resumption exception is raised in a resumption
+handler, the second exception search does not start at the point of the
+original raise. (Remember the stack is not unwound and the current handler is
+at the top of the stack.) The search for the second resumption starts at the
+current point on the stack because new try statements may have been pushed by
+the handler or functions called from the handler. If there is no match back to
+the point of the current handler, the search skips the stack frames already
+searched by the first resume and continues after the try statement. The default
+handler always continues from default handler associated with the point where
+the exception is created.
+% This might need a diagram. But it is an important part of the justification
 % of the design of the traversal order.
+It also avoids the recursive resumption problem. If the entire stack is
+searched loops of resumption can form. Consider a handler that handles an
+exception of type A by resuming an exception of type B and on the same stack,
+later in the search path, is a second handler that handles B by resuming A.
+Assuming no other handlers on the stack handle A or B then in either traversal
+system an A resumed from the top of the stack will be handled by the first
+handler. A B resumed from the top or from the first handler will be handled
+by the second handler. The only difference is when A is thrown from the second
+handler. The entire stack search will call the first handler again, creating a
+loop. Starting from the position in the stack though will break this loop.
+\paragraph{Conditional Catches}
+Resumption supports conditional catch clauses like termination does. They
+use the same syntax except the keyword is changed:
+\begin{lstlisting}
+catchResume (EXCEPTION_TYPE * NAME ; CONDITION)
+\end{lstlisting}
+It also has the same behaviour, after the exception type has been matched
+with the EXCEPTION\_TYPE the CONDITION is evaluated with NAME in scope. If
+the result is true then the handler is run, otherwise the search continues
+just as if there had been a type mismatch.
+\paragraph{Re-Throwing}
+You may also re-throw resumptions with a \codeCFA{throwResume;} statement.
+This can only be done from inside of a \codeCFA{catchResume} block.
+Outside of any side effects of any code already run in the handler this will
+have the same effect as if the exception had not been caught in the first
+place.
+\begin{verbatim}
+       throwResume2 ----------.
+            |                 |
+ generated from handler       |
+            |                 |
+         handler              |
+            |                 |
+        throwResume1 -----.   :
+            |             |   :
+           try            |   : search skip
+            |             |   :
+        catchResume  <----'   :
+            |                 |
+\end{verbatim}
+This resumption search-pattern reflects the one for termination, which matches
+with programmer expectations. However, it avoids the \emph{recursive
+resumption} problem. If parts of the stack are searched multiple times, loops
+can easily form resulting in infinite recursion.
+Consider the trivial case:
+\begin{cfa}
+try {
+        throwResume$\(_1\)$ (E &){};
+} catch( E * ) {
+        throwResume;
+}
+\end{cfa}
+Based on termination semantics, programmer expectation is for the re-resume to
+continue searching the stack frames after the try statement. However, the
+current try statement is still on the stack below the handler issuing the
+reresume (see \VRef{s:Reraise}). Hence, the try statement catches the re-raise
+again and does another re-raise \emph{ad infinitum}, which is confusing and
+difficult to debug. The \CFA resumption search-pattern skips the try statement
+so the reresume search continues after the try, matching programmer
+expectation.
+\section{Conditional Catch}
+Both termination and resumption handler-clauses may perform conditional matching:
+\begin{cfa}
+catch (EXCEPTION_TYPE * NAME ; @CONDITION@)
+\end{cfa}
+First, the same semantics is used to match the exception type. Second, if the
+exception matches, @CONDITION@ is executed. The condition expression may
+reference all names in scope at the beginning of the try block and @NAME@
+introduced in the handler clause.  If the condition is true, then the handler
+matches. Otherwise, the exception search continues at the next appropriate kind
+of handler clause in the try block.
+\begin{cfa}
+try {
+        f1 = open( ... );
+        f2 = open( ... );
+        ...
+} catch( IOFailure * f ; fd( f ) == f1 ) {
+        // only handle IO failure for f1
+}
+\end{cfa}
+Note, catching @IOFailure@, checking for @f1@ in the handler, and reraising the
+exception if not @f1@ is different because the reraise does not examine any of
+remaining handlers in the current try statement.
+\section{Reraise}
+\label{s:Reraise}
+Within the handler block or functions called from the handler block, it is
+possible to reraise the most recently caught exception with @throw@ or
+@throwResume@, respectively.
+\begin{cfa}
+catch( ... ) {
+        ... throw; // rethrow
+} catchResume( ... ) {
+        ... throwResume; // reresume
+}
+\end{cfa}
+The only difference between a raise and a reraise is that reraise does not
+create a new exception; instead it continues using the current exception, \ie
+no allocation and copy. However the default handler is still set to the one
+visible at the raise point, and hence, for termination could refer to data that
+is part of an unwound stack frame. To prevent this problem, a new default
+handler is generated that does a program-level abort.
 \section{Finally Clauses}
+A \codeCFA{finally} clause may be placed at the end of a try statement after
+all the handler clauses. In the simple case, with no handlers, it looks like
+this:
+\begin{lstlisting}
+try {
+    TRY_BLOCK
+A @finally@ clause may be placed at the end of a @try@ statement.
+\begin{cfa}
+try {
+        GUARDED_BLOCK
+} ...   // any number or kind of handler clauses
 } finally {
+    FINAL_STATEMENTS
+}
+\end{lstlisting}
+Any number of termination handlers and resumption handlers may precede the
+finally clause.
+The FINAL\_STATEMENTS, the finally block, are executed whenever the try
+statement is removed from the stack. This includes: the TRY\_BLOCK finishes
+executing, a termination exception finishes executing and the stack unwinds.
+Execution of the finally block should finish by letting control run off
+the end of the block. This is because after the finally block is complete
+control will continue to where ever it would if the finally clause was not
+present.
+Because of this local control flow out of the finally block is forbidden.
+The compiler rejects uses of \codeCFA{break}, \codeCFA{continue},
+\codeCFA{fallthru} and \codeCFA{return} that would cause control to leave
+the finally block. Other ways to leave the finally block - such as a long
+jump or termination - are much harder to check, at best requiring additional
+runtime overhead, and so are merely discouraged.
+        FINALLY_BLOCK
+}
+\end{cfa}
+The @FINALLY_BLOCK@ is executed when the try statement is unwound from the
+stack, \ie when the @GUARDED_BLOCK@ or any handler clause finishes. Hence, the
+finally block is always executed.
+Execution of the finally block should always finish, meaning control runs off
+the end of the block. This requirement ensures control always continues as if the
+finally clause is not present, \ie finally is for cleanup not changing control
+flow.  Because of this requirement, local control flow out of the finally block
+is forbidden.  The compiler precludes any @break@, @continue@, @fallthru@ or
+@return@ that causes control to leave the finally block. Other ways to leave
+the finally block, such as a long jump or termination are much harder to check,
+and at best requiring additional run-time overhead, and so are discouraged.
 \section{Cancellation}
+Cancellation can be thought of as a stack-level abort or as an uncatchable
+termination. It unwinds the entirety of the current stack and if possible
+passes an exception to a different stack as a message.
+There is no special statement for starting a cancellation, instead you call
+the standard library function \codeCFA{cancel\_stack} which takes an exception.
+Unlike in a throw this exception is not used in control flow but is just there
+to pass information about why the cancellation happened.
+The handler is decided entirely by which stack is being cancelled. There are
+three handlers that apply to three different groups of stacks:
+\begin{itemize}
+\item Main Stack:
+The main stack is the one on which the program main is called at the beginning
+of your program. It is also the only stack you have without the libcfathreads.
+Because of this there is no other stack ``above" (or possibly at all) for main
+to notify when a cancellation occurs. So after the stack is unwound we do a
+program level abort.
+\item Thread Stack:
+Thread stacks are those created with \codeCFA{thread} or otherwise satisfy the
+\codeCFA{is\_thread} trait.
+Threads only have two structural points of communication that must happen,
+start and join. As the thread must be running to perform a cancellation it
+will be after start and before join, so join is the one cancellation uses.
+After the stack is unwound the thread will halt as if it had completed normally
+and wait for another thread to join with it. The other thread, when it joins,
+checks for a cancellation. If so it will throw the resumption exception
+\codeCFA{ThreadCancelled}.
+There is a difference here in how explicit joins (with the \codeCFA{join}
+function) and implicit joins (from a destructor call) behave. Explicit joins will
+take the default handler (\codeCFA{defaultResumptionHandler}) from the context
+and use it like a regular throw does if the exception is not caught. The
+implicit join does a program abort instead.
+This is for safety. One of the big problems in exceptions is you cannot handle
+two terminations or cancellations on the same stack as either can destroy the
+context required for the other. This can happen with join but as the
+destructors will always be run when the stack is being unwound and one
+termination/cancellation is already active. Also since they are implicit they
+are easier to forget about.
+\item Coroutine Stack:
+Coroutine stacks are those created with \codeCFA{coroutine} or otherwise
+satisfy the \codeCFA{is\_coroutine} trait.
+A coroutine knows of two other coroutines, its starter and its last resumer.
+The last resumer is ``closer" so that is the one notified.
+After the stack is unwound control goes to the last resumer.
+Resume will resume throw a \codeCFA{CoroutineCancelled} exception, which is
+polymorphic over the coroutine type and has a pointer to the coroutine being
+cancelled and the cancelling exception. The resume function also has an
+assertion for the \codeCFA{defaultResumptionHandler} of the exception. So it
+will use the default handler like a regular throw.
+\end{itemize}
+Cancellation is a stack-level abort, which can be thought of as an
+uncatchable termination. It unwinds the entirety of the current stack, and if
+possible forwards the cancellation exception to a different stack.
+There is no special statement for starting a cancellation; instead the standard
+library function @cancel_stack@ is called passing an exception.  Unlike a
+raise, this exception is not used in matching, only to pass information about
+the cause of the cancellation.
+Handling of a cancellation depends on which stack is being cancelled.
+\begin{description}
+\item[Main Stack:]
+The main stack is the one used by the program main at the start of execution,
+and is the only stack in a sequential program.  Hence, when cancellation is
+forwarded to the main stack, there is no other forwarding stack, so after the
+stack is unwound, there is a program-level abort.
+\item[Thread Stack:]
+A thread stack is created for a @thread@ object or object that satisfies the
+@is_thread@ trait.  A thread only has two points of communication that must
+happen: start and join. As the thread must be running to perform a
+cancellation, it must occur after start and before join, so join is a
+cancellation point.  After the stack is unwound, the thread halts and waits for
+another thread to join with it. The joining thread checks for a cancellation,
+and if present, resumes exception @ThreadCancelled@.
+There is a subtle difference between the explicit join (@join@ function) and
+implicit join (from a destructor call). The explicit join takes the default
+handler (@defaultResumptionHandler@) from its calling context, which is used if
+the exception is not caught. The implicit join does a program abort instead.
+This semantics is for safety. One difficult problem for any exception system is
+defining semantics when an exception is raised during an exception search:
+which exception has priority, the original or new exception? No matter which
+exception is selected, it is possible for the selected one to disrupt or
+destroy the context required for the other. {\color{blue} PAB: I do not
+understand the following sentences.} This loss of information can happen with
+join but as the thread destructor is always run when the stack is being unwound
+and one termination/cancellation is already active. Also since they are
+implicit they are easier to forget about.
+\item[Coroutine Stack:] A coroutine stack is created for a @coroutine@ object
+or object that satisfies the @is_coroutine@ trait.  A coroutine only knows of
+two other coroutines, its starter and its last resumer.  The last resumer has
+the tightest coupling to the coroutine it activated.  Hence, cancellation of
+the active coroutine is forwarded to the last resumer after the stack is
+unwound, as the last resumer has the most precise knowledge about the current
+execution. When the resumer restarts, it resumes exception
+@CoroutineCancelled@, which is polymorphic over the coroutine type and has a
+pointer to the cancelled coroutine.
+The resume function also has an assertion for the @defaultResumptionHandler@
+of the exception. So it will use the default handler like a regular throw.
+\end{description}

doc/theses/andrew_beach_MMath/future.tex

-              rb6a8b31
+              rd95969a
 parts of the exception system that use the current version.
+For instance a full virtual system would probably allow for several
+improvements to the exception traits. Although they do currently work they
+could be made easier to use by making the virtual table type implicit in the
+trait (which would remove the need for those wrapper macros) or allowing
+for assertions that give the layout of a virtual table for safety.
+There are several improvements to the virtual system that would improve
+the exception traits. The biggest one is an assertion that checks that one
+virtual type is a child of another virtual type. This would capture many of
+the requirements much more precisely.
+The full virtual system might also include other improvements like associated
+types. This is a proposed feature that would allow traits to refer to types
+not listed in their header. This would allow the exception traits to not
+refer to the virtual table type explicitly which would remove the need for
+the interface macros.
 \section{Additional Throws}
 Several other kinds of throws, beyond the termination throw (\codeCFA{throw}),
 the resumption throw (\codeCFA{throwResume}) and the re-throws, were considered.
+Several other kinds of throws, beyond the termination throw (@throw@),
+the resumption throw (@throwResume@) and the re-throws, were considered.
 None were as useful as the core throws but they would likely be worth
 revising.
 …
 Also new techniques to skip previously searched parts of the stack will have
+to be developed.
+to be developed. The recursive resume problem still remains and ideally the
+same pattern of ignoring sections of the stack.
+\section{Support for More Platforms}
+Termination is not portable because it is implemented with inline assembly.
+Those sections will have to be rewritten to support different architectures.
+\section{Signal Exceptions}
+Exception Handling: Issues and a Proposed Notation suggests there are three
+types of exceptions: escape, notify and signal.
+Escape exceptions are our termination exceptions, notify exceptions are
+resumption exceptions and that leaves signal exception unimplemented.
+\section{Quality-of-Life Improvements}
+Finally come various improvements to the usability of \CFA. Most of these
+would just require time. Time that would not lead to interesting research so
+it has been left aside for now. A few examples are included here but there
+are more:
+Signal exceptions allow either behaviour, that is after the exception is
+handled control can either return to the throw or from where the handler is
+defined.
+The design should be re-examined and be updated for \CFA. A very direct
+translation would perhaps have a new throw and catch pair and a statement
+(or statements) could be used to decide if the handler returns to the throw
+or continues where it is, but there are other options.
+For instance resumption could be extended to cover this use by allowing
+local control flow out of it. This would require an unwind as part of the
+transition as there are stack frames that have to be removed.
+This would mean there is no notify like throw but because \CFA does not have
+exception signatures a termination can be thrown from any resumption handler
+already so there are already ways one could try to do this in existing \CFA.
+% Maybe talk about the escape; and escape CONTROL_STMT; statements or how
+% if we could choose if _Unwind_Resume proceeded to the clean-up stage this
+% would be much easier to implement.
+\section{Language Improvements}
+There is also a lot of work that is not a follow up to this work in terms of
+research, some have no interesting research to be done at all, but would
+improve \CFA as a programming language. The full list of these would
+naturally be quite extensive but here are a few examples that involve
+exceptions:
 \begin{itemize}
+\item The implementation of termination is not portable because it includes
+some assembly statements. These sections will have to be re-written so
+\CFA has full support on more machines.
 \item Allowing exception handler to bind the exception to a reference instead
 of a pointer. This should actually result in no change in behaviour so there
 is no reason not to allow it. It is however a small improvement; giving a bit
 of flexibility to the user in what style they want to use.
 \item Enabling local control flow (by \codeCFA{break}, \codeCFA{return} and
+\item Enabling local control flow (by @break@, @return@ and
 similar statements) out of a termination handler. The current set-up makes
 this very difficult but the catch function that runs the handler after it has
 …
 much easier. (To do the same for try blocks would probably wait for zero-cost
 exceptions, which would allow the try block to be inlined as well.)
-\item Enabling local control flow out of a resumption handler. This would be
-a weighty operation, causing a stack unwind like a termination, so there might
-be a different statement or a statement modifier to make sure the user does
-this purposefully.
-However this would require the more complex system as they cannot be inlined
-into the original function as they can be run at a different place on the
-stack. So instead the unwinding will have to carry with it information on
-which one of these points to continue at and possibly also the return value
-for the function if a \codeCFA{return} statement was used.
 \end{itemize}

doc/theses/andrew_beach_MMath/implement.tex

-              rb6a8b31
+              rd95969a
 All of this is accessed through a field inserted at the beginning of every
 virtual type. Currently it is called \codeC{virtual_table} but it is not
+virtual type. Currently it is called @virtual_table@ but it is not
 meant to be accessed by the user. This field is a pointer to the type's
 virtual table instance. It is assigned once during the object's construction
 …
 using that to calculate the mangled name of the parent's virtual table type.
 There are two special fields that are included like normal fields but have
 special initialization rules: the \codeC{size} field is the type's size and is
 initialized with a sizeof expression, the \codeC{align} field is the type's
+special initialization rules: the @size@ field is the type's size and is
+initialized with a sizeof expression, the @align@ field is the type's
 alignment and uses an alignof expression. The remaining fields are resolved
 to a name matching the field's name and type using the normal visibility
 …
 The declarations include the virtual type definition and forward declarations
 of the virtual table instance, constructor, message function and
 \codeCFA{get_exception_vtable}. The definition includes the storage and
+@get_exception_vtable@. The definition includes the storage and
 initialization of the virtual table instance and the bodies of the three
 functions.
 …
 from the per-instance information. The virtual table type and most of the
 functions are polymorphic so they are all part of the core. The virtual table
 instance and the \codeCFA{get_exception_vtable} function.
 Coroutines and threads need instances of \codeCFA{CoroutineCancelled} and
 \codeCFA{ThreadCancelled} respectively to use all of their functionality.
 When a new data type is declared with \codeCFA{coroutine} or \codeCFA{thread}
+instance and the @get_exception_vtable@ function.
+Coroutines and threads need instances of @CoroutineCancelled@ and
+@ThreadCancelled@ respectively to use all of their functionality.
+When a new data type is declared with @coroutine@ or @thread@
 the forward declaration for the instance is created as well. The definition
 of the virtual table is created at the definition of the main function.
 …
 function.
 The function is \codeC{__cfa__virtual_cast} and it is implemented in the
+The function is @__cfa__virtual_cast@ and it is implemented in the
 standard library. It takes a pointer to the target type's virtual table and
 the object pointer being cast. The function is very simple, getting the
 …
 For the generated code a forward declaration of the virtual works as follows.
 There is a forward declaration of \codeC{__cfa__virtual_cast} in every cfa
+There is a forward declaration of @__cfa__virtual_cast@ in every cfa
 file so it can just be used. The object argument is the expression being cast
 so that is just placed in the argument list.
 …
 often across functions.
 At a very basic level this can be done with \codeC{setjmp} \& \codeC{longjmp}
+At a very basic level this can be done with @setjmp@ \& @longjmp@
 which simply move the top of the stack, discarding everything on the stack
 above a certain point. However this ignores all the clean-up code that should
 …
 both of these problems.
 Libunwind, provided in \texttt{unwind.h} on most platforms, is a C library
+Libunwind, provided in @unwind.h@ on most platforms, is a C library
 that provides \CPP style stack unwinding. Its operation is divided into two
 phases. The search phase -- phase 1 -- is used to scan the stack and decide
 …
 GCC will generate an LSDA and attach its personality function with the
 \texttt{-fexceptions} flag. However this only handles the cleanup attribute.
+@-fexceptions@ flag. However this only handles the cleanup attribute.
 This attribute is used on a variable and specifies a function that should be
 run when the variable goes out of scope. The function is passed a pointer to
 …
 messages for special cases (some of which should never be used by the
 personality function) and error codes but unless otherwise noted the
 personality function should always return \codeC{_URC_CONTINUE_UNWIND}.
 The \codeC{version} argument is the verson of the implementation that is
+personality function should always return @_URC_CONTINUE_UNWIND@.
+The @version@ argument is the version of the implementation that is
 calling the personality function. At this point it appears to always be 1 and
 it will likely stay that way until a new version of the API is updated.
 The \codeC{action} argument is set of flags that tell the personality
+The @action@ argument is a set of flags that tell the personality
 function when it is being called and what it must do on this invocation.
 The flags are as follows:
 \begin{itemize}
 \item\codeC{_UA_SEARCH_PHASE}: This flag is set whenever the personality
+\item@_UA_SEARCH_PHASE@: This flag is set whenever the personality
 function is called during the search phase. The personality function should
 decide if unwinding will stop in this function or not. If it does then the
 personality function should return \codeC{_URC_HANDLER_FOUND}.
 \item\codeC{_UA_CLEANUP_PHASE}: This flag is set whenever the personality
+personality function should return @_URC_HANDLER_FOUND@.
+\item@_UA_CLEANUP_PHASE@: This flag is set whenever the personality
 function is called during the cleanup phase. If no other flags are set this
 means the entire frame will be unwound and all cleanup code should be run.
 \item\codeC{_UA_HANDLER_FRAME}: This flag is set during the cleanup phase
+\item@_UA_HANDLER_FRAME@: This flag is set during the cleanup phase
 on the function frame that found the handler. The personality function must
 prepare to return to normal code execution and return
 \codeC{_URC_INSTALL_CONTEXT}.
 \item\codeC{_UA_FORCE_UNWIND}: This flag is set if the personality function
+@_URC_INSTALL_CONTEXT@.
+\item@_UA_FORCE_UNWIND@: This flag is set if the personality function
 is called through a forced unwind call. Forced unwind only performs the
 cleanup phase and uses a different means to decide when to stop. See its
 …
 \end{itemize}
 The \codeC{exception_class} argument is a copy of the \codeC{exception}'s
 \codeC{exception_class} field.
 The \codeC{exception} argument is a pointer to the user provided storage
+The @exception_class@ argument is a copy of the @exception@'s
+@exception_class@ field.
+The @exception@ argument is a pointer to the user provided storage
 object. It has two public fields, the exception class which is actually just
 a number that identifies the exception handling mechanism that created it and
 …
 exception needs to
 The \codeC{context} argument is a pointer to an opaque type. This is passed
+The @context@ argument is a pointer to an opaque type. This is passed
 to the many helper functions that can be called inside the personality
 function.
 …
 functions traversing the stack new-to-old until a function finds a handler or
 the end of the stack is reached. In the latter case raise exception will
 return with \codeC{_URC_END_OF_STACK}.
+return with @_URC_END_OF_STACK@.
 Once a handler has been found raise exception continues onto the cleanup
 …
 If an error is encountered raise exception will return either
 \codeC{_URC_FATAL_PHASE1_ERROR} or \codeC{_URC_FATAL_PHASE2_ERROR} depending
+@_URC_FATAL_PHASE1_ERROR@ or @_URC_FATAL_PHASE2_ERROR@ depending
 on when the error occurred.
 …
 been unwound.
 Each time it is called the stop function should return \codeC{_URC_NO_REASON}
+Each time it is called the stop function should return @_URC_NO_REASON@
 or transfer control directly to other code outside of libunwind. The
 framework does not provide any assistance here.
 Its arguments are the same as the paired personality function.
 The actions \codeC{_UA_CLEANUP_PHASE} and \codeC{_UA_FORCE_UNWIND} are always
+The actions @_UA_CLEANUP_PHASE@ and @_UA_FORCE_UNWIND@ are always
 set when it is called. By the official standard that is all but both GCC and
 Clang add an extra action on the last call at the end of the stack:
 \codeC{_UA_END_OF_STACK}.
+@_UA_END_OF_STACK@.
 \section{Exception Context}
 …
 Each stack has its own exception context. In a purely sequential program, using
 only core Cforall, there is only one stack and the context is global. However
 if the library \texttt{libcfathread} is linked then there can be multiple
+if the library @libcfathread@ is linked then there can be multiple
 stacks so they will each need their own.
 To handle this code always gets the exception context from the function
 \codeC{this_exception_context}. The main exception handling code is in
 \texttt{libcfa} and that library also defines the function as a weak symbol
 so it acts as a default. Meanwhile in \texttt{libcfathread} the function is
+@this_exception_context@. The main exception handling code is in
+@libcfa@ and that library also defines the function as a weak symbol
+so it acts as a default. Meanwhile in @libcfathread@ the function is
 defined as a strong symbol that replaces it when the libraries are linked
 together.
 The version of the function defined in \texttt{libcfa} is very simple. It
+The version of the function defined in @libcfa@ is very simple. It
 returns a pointer to a global static variable. With only one stack this
 global instance is associated with the only stack.
 The version of the function defined in \texttt{libcfathread} has to handle
+The version of the function defined in @libcfathread@ has to handle
 more as there are multiple stacks. The exception context is included as
 part of the per-stack data stored as part of coroutines. In the cold data
 section, stored at the base of each stack, is the exception context for that
 stack. The \codeC{this_exception_context} uses the concurrency library to get
+stack. The @this_exception_context@ uses the concurrency library to get
 the current coroutine and through it the cold data section and the exception
 context.
 …
 to store the exception. Macros with pointer arithmetic and type casts are
 used to move between the components or go from the embedded
 \codeC{_Unwind_Exception} to the entire node.
+@_Unwind_Exception@ to the entire node.
 All of these nodes are strung together in a linked list. One linked list per
 …
 C which is what the \CFA compiler outputs so a work-around is used.
 This work around is a function called \codeC{__cfaehm_try_terminate} in the
+This work around is a function called @__cfaehm_try_terminate@ in the
 standard library. The contents of a try block and the termination handlers
 are converted into functions. These are then passed to the try terminate
 …
 These nested functions and all other functions besides
 \codeC{__cfaehm_try_terminate} in \CFA use the GCC personality function and
 the \texttt{-fexceptions} flag to generate the LSDA. This allows destructors
+@__cfaehm_try_terminate@ in \CFA use the GCC personality function and
+the @-fexceptions@ flag to generate the LSDA. This allows destructors
 to be implemented with the cleanup attribute.
 …
 The handler function does both the matching and catching. It tries each
 the condition of \codeCFA{catchResume} in order, top-to-bottom and until it
+the condition of @catchResume@ in order, top-to-bottom and until it
 finds a handler that matches. If no handler matches then the function returns
 false. Otherwise the matching handler is run, if it completes successfully
 the function returns true. Rethrows, through the \codeCFA{throwResume;}
+the function returns true. Rethrows, through the @throwResume;@
 statement, cause the function to return true.
+% Recursive Resumption Stuff:
+Blocking out part of the stack is accomplished by updating the front of the
+list as the search continues. Before the handler at a node is called the head
+of the list is updated to the next node of the current node. After the search
+is complete, successful or not, the head of the list is reset.
+This means the current handler and every handler that has already been
+checked are not on the list while a handler is run. If a resumption is thrown
+during the handling of another resumption the active handlers and all the
+other handlers checked up to this point will not be checked again.
+This structure also supports new handlers added while the resumption is being
+handled. These are added to the front of the list, pointing back along the
+stack -- the first one will point over all the checked handlers -- and the
+ordering is maintained.
 \subsection{Libunwind Compatibility}
 …
 Cancellation also uses libunwind to do its stack traversal and unwinding,
 however it uses a different primary function \codeC{_Unwind_ForcedUnwind}.
+however it uses a different primary function @_Unwind_ForcedUnwind@.
 Details of its interface can be found in the unwind section.

doc/theses/andrew_beach_MMath/unwinding.tex

-              rb6a8b31
+              rd95969a
 \chapter{Unwinding in \CFA}
+\chapter{\texorpdfstring{Unwinding in \CFA}{Unwinding in Cforall}}
 Stack unwinding is the process of removing things from the stack. Within
 …
 Even this is fairly simple if nothing needs to happen when the stack unwinds.
 Traditional C can unwind the stack by saving and restoring state (with
 \codeC{setjmp} \& \codeC{longjmp}). However many languages define actions that
+@setjmp@ \& @longjmp@). However many languages define actions that
 have to be taken when something is removed from the stack, such as running
 a variable's destructor or a \codeCFA{try} statement's \codeCFA{finally}
+a variable's destructor or a @try@ statement's @finally@
 clause. Handling this requires walking the stack going through each stack
 frame.
 …
 \CFA uses two primary functions in libunwind to create most of its
 exceptional control-flow: \codeC{_Unwind_RaiseException} and
 \codeC{_Unwind_ForcedUnwind}.
+exceptional control-flow: @_Unwind_RaiseException@ and
+@_Unwind_ForcedUnwind@.
 Their operation is divided into two phases: search and clean-up. The search
 phase -- phase 1 -- is used to scan the stack but not unwinding it. The
 …
 A personality function performs three tasks, although not all have to be
 present. The tasks performed are decided by the actions provided.
 \codeC{_Unwind_Action} is a bitmask of possible actions and an argument of
+@_Unwind_Action@ is a bitmask of possible actions and an argument of
 this type is passed into the personality function.
 \begin{itemize}
 \item\codeC{_UA_SEARCH_PHASE} is passed in search phase and tells the
+\item@_UA_SEARCH_PHASE@ is passed in search phase and tells the
 personality function to check for handlers. If there is a handler in this
 stack frame, as defined by the language, the personality function should
 return \codeC{_URC_HANDLER_FOUND}. Otherwise it should return
 \codeC{_URC_CONTINUE_UNWIND}.
 \item\codeC{_UA_CLEANUP_PHASE} is passed in during the clean-up phase and
+return @_URC_HANDLER_FOUND@. Otherwise it should return
+@_URC_CONTINUE_UNWIND@.
+\item@_UA_CLEANUP_PHASE@ is passed in during the clean-up phase and
 means part or all of the stack frame is removed. The personality function
 should do whatever clean-up the language defines
 (such as running destructors/finalizers) and then generally returns
 \codeC{_URC_CONTINUE_UNWIND}.
 \item\codeC{_UA_HANDLER_FRAME} means the personality function must install
+@_URC_CONTINUE_UNWIND@.
+\item@_UA_HANDLER_FRAME@ means the personality function must install
 a handler. It is also passed in during the clean-up phase and is in addition
 to the clean-up action. libunwind provides several helpers for the personality
 function here. Once it is done, the personality function must return
 \codeC{_URC_INSTALL_CONTEXT}.
+@_URC_INSTALL_CONTEXT@.
 \end{itemize}
 The personality function is given a number of other arguments. Some are for
 compatability and there is the \codeC{struct _Unwind_Context} pointer which
+compatibility and there is the @struct _Unwind_Context@ pointer which is
 passed to many helpers to get information about the current stack frame.
 …
 raise-exception but with some extras.
 The first it passes in an extra action to the personality function on each
 stack frame, \codeC{_UA_FORCE_UNWIND}, which means a handler cannot be
+stack frame, @_UA_FORCE_UNWIND@, which means a handler cannot be
 installed.
 …
 stack frames have been removed. By the standard API this is marked by setting
 the stack pointer inside the context passed to the stop function. However both
 GCC and Clang add an extra action for this case \codeC{_UA_END_OF_STACK}.
+GCC and Clang add an extra action for this case @_UA_END_OF_STACK@.
 Each time the stop function is called it can do one of two things.
 When it is not the end of the stack it can return \codeC{_URC_NO_REASON} to
+When it is not the end of the stack it can return @_URC_NO_REASON@ to
 continue unwinding.
 % Is there a reason that NO_REASON is used instead of CONTINUE_UNWIND?
 …
 are provided to do it.
 \section{\CFA Implementation}
+\section{\texorpdfstring{\CFA Implementation}{Cforall Implementation}}
 To use libunwind, \CFA provides several wrappers, its own storage,
 …
 The stop function is very simple. It checks the end of stack flag to see if
 it is finished unwinding. If so, it calls \codeC{exit} to end the process,
+it is finished unwinding. If so, it calls @exit@ to end the process,
 otherwise it returns with no-reason to continue unwinding.
 % Yeah, this is going to have to change.
 …
 location of the instruction pointer and stack layout, which varies with
 compiler and optimization levels. So for frames where there are only
 destructors, GCC's attribute cleanup with the \texttt{-fexception} flag is
+destructors, GCC's attribute cleanup with the @-fexceptions@ flag is
 sufficient to handle unwinding.
 The only functions that require more than that are those that contain
+\codeCFA{try} statements. A \codeCFA{try} statement has a \codeCFA{try}
 clause, some number of \codeCFA{catch} clauses and \codeCFA{catchResume}
 clauses and may have a \codeCFA{finally} clause. Of these only \codeCFA{try}
 statements with \codeCFA{catch} clauses need to be transformed and only they
 and the \codeCFA{try} clause are involved.
+@try@ statements. A @try@ statement has a @try@
+clause, some number of @catch@ clauses and @catchResume@
+clauses and may have a @finally@ clause. Of these only @try@
+statements with @catch@ clauses need to be transformed and only they
+and the @try@ clause are involved.
 The \codeCFA{try} statement is converted into a series of closures which can
+The @try@ statement is converted into a series of closures which can
 access other parts of the function according to scoping rules but can be
 passed around. The \codeCFA{try} clause is converted into the try functions,
 almost entirely unchanged. The \codeCFA{catch} clauses are converted into two
+passed around. The @try@ clause is converted into the try functions,
+almost entirely unchanged. The @catch@ clauses are converted into two
 functions; the match function and the catch function.
 …
 runs the handler's body.
 These three functions are passed to \codeC{try_terminate}. This is an
+These three functions are passed to @try_terminate@. This is an
 % Maybe I shouldn't quote that, it isn't its actual name.
 internal hand-written function that has its own personality function and
 …
 handler was found in this frame. If it was then the personality function
 installs the handler, which is setting the instruction pointer in
 \codeC{try_terminate} to an otherwise unused section that calls the catch
+@try_terminate@ to an otherwise unused section that calls the catch
 function, passing it the current exception and handler index.
 \codeC{try_terminate} returns as soon as the catch function returns.
+@try_terminate@ returns as soon as the catch function returns.
 At this point control has returned to normal control flow.

doc/theses/fangren_yu_COOP_F20/Report.tex

-              rb6a8b31
+              rd95969a
 \usepackage[usenames]{color}
 \input{common}                                          % common CFA document macros
 \usepackage[dvips,plainpages=false,pdfpagelabels,pdfpagemode=UseNone,colorlinks=true,pagebackref=true,linkcolor=blue,citecolor=blue,urlcolor=blue,pagebackref=true,breaklinks=true]{hyperref}
+\usepackage[dvips,plainpages=false,pdfpagelabels,pdfpagemode=UseNone,colorlinks=true,linkcolor=blue,citecolor=blue,urlcolor=blue,pagebackref=true,breaklinks=true]{hyperref}
 \usepackage{breakurl}
 \urlstyle{sf}
 …
 \renewcommand{\subsectionmark}[1]{\markboth{\thesubsection\quad #1}{\thesubsection\quad #1}}
 \pagenumbering{roman}
 \linenumbers                                            % comment out to turn off line numbering
+%\linenumbers                                            % comment out to turn off line numbering
 \maketitle
 \pdfbookmark[1]{Contents}{section}
+\tableofcontents
+\clearpage
 \thispagestyle{plain}
 \pagenumbering{arabic}
 \begin{abstract}
+\CFA is an evolutionary, non-object-oriented extension of the C programming language, featuring a parametric type-system, and is currently under active development. The reference compiler for the \CFA language, @cfa-cc@, has some of its major components dated back to the early 2000s, which are based on inefficient data structures and algorithms. This report introduces improvements targeting the expression resolution algorithm, suggested by a recent prototype experiment on a simplified model, which are implemented in @cfa-cc@ to support the full \CFA language. These optimizations speed up the compiler by a factor of 20 across the existing \CFA codebase, bringing the compilation time of a mid-sized \CFA source file down to the 10-second level. A few problem cases derived from realistic code examples are analyzed in detail, with proposed solutions. This work is a critical step in the \CFA project development to achieve its eventual goal of being used alongside C for large software systems.
 \end{abstract}
+\clearpage
+\section*{Acknowledgements}
+\begin{sloppypar}
+I would like to thank everyone in the \CFA team for their contribution towards this project. Programming language design and development is a tough subject and requires a lot of teamwork. Without the collaborative efforts from the team, this project could not have been a success. Specifically, I would like to thank Andrew Beach for introducing me to the \CFA codebase, Thierry Delisle for maintaining the test and build automation framework, Michael Brooks for providing example programs of various experimental language and type system features, and most importantly, Professor Martin Karsten for recommending me to the \CFA team, and my supervisor, Professor Peter Buhr for encouraging me to explore deeply into intricate compiler algorithms. Finally, I gratefully acknowledge the help from Aaron Moss, former graduate from the team and the author of the precedent thesis work, to participate in the \CFA team's virtual conferences and email correspondence, and provide many critical arguments and suggestions. 2020 had been an unusually challenging year for everyone and we managed to keep a steady pace.
+\end{sloppypar}
+\clearpage
+\tableofcontents
+\clearpage
 \section{Introduction}
+\section{Completed work}
+\CFA language, developed by the Programming Language Group at the University of Waterloo, has a long history, with the initial language design in 1992 by Glen Ditchfield~\cite{Ditchfield92} and the first proof-of-concept compiler built in 2003 by Richard Bilson~\cite{Bilson03}. Many new features have been added to the language over time, but the core of \CFA's type-system --- parametric functions introduced by the @forall@ clause (hence the name of the language) providing parametric overloading --- remains mostly unchanged.
+The current \CFA reference compiler, @cfa-cc@, is designed using the visitor pattern~\cite{vistorpattern} over an abstract syntax tree (AST), where multiple passes over the AST modify it for subsequent passes. @cfa-cc@ still includes many parts taken directly from the original Bilson implementation, which served as the starting point for this enhancement work to the type system. Unfortunately, the prior implementation did not provide the efficiency required for the language to be practical: a \CFA source file of approximately 1000 lines of code can take multiple minutes to compile. The cause of the problem is that the old compiler used inefficient data structures and algorithms for expression resolution, which involved significant copying and redundant work.
+This report presents a series of optimizations to the performance-critical parts of the resolver, with a major rework of the compiler data-structures using a functional-programming approach to reduce memory complexity. The improvements were suggested by running the compiler builds with a performance profiler against the \CFA standard-library source-code and a test suite to find the most underperforming components in the compiler algorithm.
+The \CFA team endorses a pragmatic philosophy that focuses on practical implications of language design and implementation rather than theoretical limits. In particular, the compiler is designed to be expressive with respect to code reuse while maintaining type safety, but compromise theoretical soundness in extreme corner cases. However, when these corner cases do appear in actual usage, they need to be thoroughly investigated. A case-by-case analysis is presented for several of these corner cases, some of which point to certain weaknesses in the language design with solutions proposed based on experimental results.
+\section{AST restructuring}
 \subsection{Memory model with sharing}
+A major rework of the abstract syntax tree (AST) data structure in the compiler is completed as the first step of the project. The majority of work were documented in the reference manual of the compiler~\cite{cfa-cc}. To summarize:
+\begin{itemize}
+\item
+AST nodes (and therefore subtrees) can be shared without copying when reused.
+\item
+Modifications apply the functional programming principle, making copies for local changes without affecting the original data shared by other owners. In-place mutations are permitted as a special case when sharing does not happen. The logic is implemented by reference counting.
+\item
+Memory allocation and freeing are performed automatically using smart pointers.
+\end{itemize}
+The resolver algorithm designed for overload resolution naturally introduces a significant amount of reused intermediate representations, especially in the following two places:
+\begin{itemize}
+\item
+Function overload candidates are computed by combining the argument candidates bottom-up, with many of them being a common term. For example, if $n$ overloads of a function @f@ all take an integer for the first parameter but different types for the second (@f( int, int )@, @f( int, double )@, etc.) the first term is reused $n$ times for each of the generated candidate expressions. This effect is particularly bad for deep expression trees.
+\item
+In the unification algorithm and candidate elimination step, actual types are obtained by substituting the type parameters by their bindings. Let $n$ be the complexity (\ie number of nodes in representation) of the original type, $m$ be the complexity of bound type for parameters, and $k$ be the number of occurrences of type parameters in the original type. If everything needs to be deep-copied, the substitution step takes $O(n+mk)$ time and memory, while using shared nodes it is reduced to $O(n)$ time and $O(k)$ memory.
+\end{itemize}
+One of the worst examples for the old compiler is a long chain of I/O operations
+\begin{cfa}
+sout | 1 | 2 | 3 | 4 | ...
+\end{cfa}
+The pipe operator is overloaded by \CFA I/O library for every primitive type in C language, as well as I/O manipulators defined by the library. In total there are around 50 overloads for the output stream operation. On resolving the $n$-th pipe operator in the sequence, the first term, which is the result of sub-expression containing $n-1$ pipe operators, is reused to resolve every overload. Therefore at least $O(n^2)$ copies of expression nodes are made during resolution, not even counting type unification cost; combined with two large factors from number of overloads of pipe operators, and that the ``output stream type'' in \CFA is a trait with 27 assertions (which adds to complexity of the pipe operator's type) this makes compiling a long output sequence extremely slow. In new AST representation only $O(n)$ copies are required and type of pipe operator is not copied at all.
+Reduction in space complexity is especially important, as preliminary profiling result on the old compiler build shows that over half of time spent in expression resolution are on memory allocations.
+A major rework of the AST data-structure in the compiler was completed as the first step of the project. The majority of this work is documented in my prior report documenting the compiler reference-manual~\cite{cfa-cc}. To summarize:
+\begin{itemize}
+\item
+AST nodes (and therefore subtrees) can be shared without copying.
+\item
+Modifications are performed using functional-programming principles, making copies for local changes without affecting the original data shared by other owners. In-place mutations are permitted as a special case when there is no sharing. The logic is implemented by reference counting.
+\item
+Memory allocation and freeing are performed automatically using smart pointers~\cite{smartpointers}.
+\end{itemize}
+The resolver algorithm, designed for overload resolution, uses a significant amount of reuse, and hence copying, of the intermediate representations, especially in the following two places:
+\begin{itemize}
+\item
+Function overload candidates are computed by combining the argument candidates bottom-up, with many being a common term. For example, if $n$ overloads of a function @f@ all take an integer for the first parameter but different types for the second, \eg @f( int, int )@, @f( int, double )@, etc., the first term is copied $n$ times for each of the generated candidate expressions. This copying is particularly bad for deep expression trees.
+\item
+In the unification algorithm and candidate elimination step, actual types are obtained by substituting the type parameters by their bindings. Let $n$ be the complexity (\ie number of nodes in representation) of the original type, $m$ be the complexity of the bound type for parameters, and $k$ be the number of occurrences of type parameters in the original type. If every substitution needs to be deep-copied, this copy step takes $O(n+mk)$ time and memory, while using shared nodes it is reduced to $O(n)$ time and $O(k)$ memory.
+\end{itemize}
+One of the worst examples for the old compiler is a long chain of I/O operations:
+\begin{cfa}
+sout | 1 | 2 | 3 | 4 | ...;   // print integer constants
+\end{cfa}
+The pipe operator is overloaded by the \CFA I/O library for every primitive type in the C language, as well as I/O manipulators defined by the library. In total, there are around 50 overloads for the output stream operation. On resolving the $n$-th pipe operator in the sequence, the first term, which is the result of sub-expression containing $n-1$ pipe operators, is reused to resolve every overload. Therefore at least $O(n^2)$ copies of expression nodes are made during resolution, not even counting type unification cost; combined with the two large factors from number of overloads of pipe operators, and that the ``output stream type'' in \CFA is a trait with 27 assertions (which adds to complexity of the pipe operator's type) this makes compiling a long output sequence extremely slow. In the new AST representation, only $O(n)$ copies are required and the type of the pipe operator is not copied at all.
+Reduction in space complexity is especially important, as preliminary profiling results on the old compiler build showed over half of the time spent in expression resolution is on memory allocations.
+Since the compiler codebase is large and the new memory model mostly benefits expression resolution, some of the old data structures are still kept, and a conversion pass happens before and after the general resolve phase. Rewriting every compiler module will take longer, and whether the new model is correct was unknown when this project started, therefore only the resolver is currently implemented with the new data structure.
 \subsection{Merged resolver calls}
 The pre-resolve phase of compilation, inadequately called ``validate'' in the compiler source code, does more than just simple syntax validation, as it also normalizes input program. Some of them, however, requires type information on expressions and therefore needs to call the resolver before the general resolve phase. There are three notable places where the resolver is invoked:
 \begin{itemize}
 \item
+Attempt to generate default constructor, copy constructor and destructor for user-defined @struct@ types
 \item
 Resolve @with@ statements (the same as in Python, which introduces fields of a structure directly in scope)
+The pre-resolve phase of compilation, inappropriately called ``validate'' in the compiler source code, has a number of passes that do more than simple syntax and semantic validation; some passes also normalize the input program. A few of these passes require type information for expressions, and therefore, need to call the resolver before the general resolve phase. There are three notable places where the resolver is invoked:
+\begin{itemize}
+\item
+Generate default constructor, copy constructor and destructor for user-defined @struct@ types.
+\item
+Resolve @with@ statements (the same as in Pascal~\cite{pascal}), which introduces fields of a structure directly into a scope.
 \item
 Resolve @typeof@ expressions (cf. @decltype@ in \CC); note that this step may depend on symbols introduced by @with@ statements.
 \end{itemize}
+Since the compiler codebase is large and the new memory model mostly only benefits expression resolution, the old data structure is still kept, and a conversion pass happens before and after resolve phase. Rewriting every compiler module will take a long time, and whether the new model is correct is still unknown when started, therefore only the resolver is implemented with the new data structure.
+Since the constructor calls were one of the most expensive to resolve (reason will be shown in the next section), pre-resolve phase were taking more time after resolver moves to the more efficient new implementation. To better facilitate the new resolver, every step that requires type information are reintegrated as part of resolver.
+A by-product of this work is that the reversed dependence of @with@ statement and @typeof@ can now be handled. Previously, the compiler is unable to handle cases such as
+Since the constructor calls are one of the most expensive to resolve (reason given in~\VRef{s:SpecialFunctionLookup}), this pre-resolve phase was taking a large amount of time even after the resolver was changed to the more efficient new implementation. The problem is that multiple resolutions repeat a significant amount of work. Therefore, to better facilitate the new resolver, every step that requires type information should be integrated as part of the general resolver phase.
+A by-product of this work is that reversed dependence between @with@ statement and @typeof@ can now be handled. Previously, the compiler was unable to handle cases such as:
 \begin{cfa}
 struct S { int x; };
 S foo();
 typeof( foo() ) s; // type is S
 with (s) {
+with (s) {
         x; // refers to s.x
+}
 \end{cfa}
 since type of @s@ is still unresolved when handling @with@ expressions. Instead, the new (and correct) approach is to evaluate @typeof@ expressions when the declaration is first seen, and it suffices because of the declaration-before-use rule.
+since the type of @s@ is unresolved when handling @with@ expressions because the @with@ pass follows the @typeof@ pass (interchanging passes only interchanges the problem). Instead, the new (and correct) approach is to evaluate @typeof@ expressions when the declaration is first seen during resolution, and it suffices because of the declaration-before-use rule.
 \subsection{Special function lookup}
+Reducing the number of functions looked up for overload resolution is an effective way to gain performance when there are many overloads but most of them are trivially wrong. In practice, most functions have few (if any) overloads but there are notable exceptions. Most importantly, constructor @?{}@, destructor @^?{}@, and assignment @?=?@ are generated for every user-defined type, and in a large source file there can be hundreds of them. Furthermore, many calls to them are generated for initializing variables and passing arguments. This fact makes them the most overloaded and most called functions.
+In an object-oriented programming language, object has methods declared with their types, so a call such as @obj.f()@ only needs to perform lookup in the method table corresponding to type of @obj@. \CFA on the other hand, does not have methods, and all types are open (\ie new operations can be defined on them), so a similar approach will not work in general. However, the ``big 3'' operators have a unique property enforced by the language rules, such that the first parameter must have a reference type. Since \CFA does not have class inheritance, reference type must always match exactly. Therefore, argument-dependent lookup can be implemented for these operators, by using a dedicated symbol table.
+The lookup key used for the special functions is the mangled type name of the first parameter, which acts as the @this@ parameter in an object-oriented language. To handle generic types, the type parameters are stripped off, and only the base type is matched. Note that a constructor (destructor, assignment operator) taking arbitrary @this@ argument, for example @forall( dtype T ) void ?{}( T & );@ is not allowed, and it guarantees that if the @this@ type is known, all possible overloads can be found by searching with the given type. In case that the @this@ argument itself is overloaded, it is resolved first and all possible result types are used for lookup.
+Note that for the generated expressions, the particular variable for @this@ argument is fully known, without overloads, so the majority of constructor call resolutions only need to check for one given object type. Explicit constructor calls and assignment statements sometimes may require lookup for multiple types. In the extremely rare case that type of @this@ argument is yet unbound, everything will have to be checked, just like without the argument-dependent lookup algorithm; fortunately, this case almost never happens in practice. An example is found in the library function @new@:
+\label{s:SpecialFunctionLookup}
+Reducing the number of functions looked up for overload resolution is an effective way to gain performance when there are many overloads but most of them are trivially wrong. In practice, most functions have few (if any) overloads but there are notable exceptions. Most importantly, constructor @?{}@, destructor @^?{}@, and assignment @?=?@ are generated for every user-defined type (@struct@ and @union@ in C), and in a large source file there can be hundreds of them. Furthermore, many calls are generated for initializing variables, passing arguments and copying values. This fact makes them the most overloaded and most called functions.
+In an object-oriented programming language, the object-method types are scoped within a class, so a call such as @obj.f()@ only needs to perform lookup in the method table corresponding to the type of @obj@. \CFA on the other hand, does not have methods, and all types are open, \ie new operations can be defined on them without inheritance; at best a \CFA type can be constrained by a translation unit. However, the ``big 3'' operators have a unique property enforced by the language rules: the first parameter must be a reference to its associated type, which acts as the @this@ parameter in an object-oriented language. Since \CFA does not have class inheritance, the reference type must always match exactly. Therefore, argument-dependent lookup can be implemented for these operators by using a dedicated, fast symbol-table.
+The lookup key for the special functions is the mangled type name of the first parameter. To handle generic types, the type parameters are stripped off, and only the base type is matched. Note a constructor (destructor, assignment operator) may not take an arbitrary @this@ argument, \eg @forall( dtype T ) void ?{}( T & )@, thus guaranteeing that if the @this@ type is known, all possible overloads can be found by searching with this given type. In the case where the @this@ argument itself is overloaded, it is resolved first and all possible result types are used for lookup.
+Note that for a generated expression, the particular variable for the @this@ argument is fully known, without overloads, so the majority of constructor-call resolutions only need to check for one given object type. Explicit constructor calls and assignment statements sometimes require lookup for multiple types. In the extremely rare case that the @this@-argument type is unbound, all necessary types are guaranteed to be checked, as for the previous lookup without the argument-dependent lookup; fortunately, this complex case almost never happens in practice. An example is found in the library function @new@:
 \begin{cfa}
 forall( dtype T | sized( T ), ttype TT | { void ?{}( T &, TT ); } )
 T * new( TT p ) { return &(*malloc()){ p }; }
 \end{cfa}
 as @malloc@ may return a pointer to any type, depending on context.
 Interestingly, this particular line of code actually caused another complicated issue, where the unusually massive work of checking every constructor in presence makes the case even worse. Section~\ref{s:TtypeResolutionInfiniteRecursion} presents a detailed analysis for the problem.
 The ``callable'' operator @?()@ (cf. @operator()@ in \CC) could also be included in the special operator list, as it is usually only on user-defined types, and the restriction that first argument must be a reference seems reasonable in this case.
+as @malloc@ may return a pointer to any type, depending on context.
+Interestingly, this particular declaration actually causes another complicated issue, making the complex checking of every constructor even worse. \VRef[Section]{s:TtypeResolutionInfiniteRecursion} presents a detailed analysis of this problem.
+The ``callable'' operator @?()@ (cf. @operator()@ in \CC) can also be included in this special operator list, as it is usually only on user-defined types, and the restriction that the first argument must be a reference seems reasonable in this case.
 \subsection{Improvement of function type representation}
+Since substituting type parameters with their bound types is one fundamental operation in many parts of resolver algorithm (particularly unification and environment binding), making as few copies of type nodes as possible helps reducing memory complexity. Even with the new memory management model, allocation is still a significant factor of resolver performance. Conceptually, operations on type nodes of AST should be performed in functional programming style, treating the data structure as immutable and only copy when necessary. The in-place mutation is a mere optimization that does not change logic of operations.
+The model was broken on function types by an inappropriate design. Function types require some special treatment due to the existence of assertions. In particular, it must be able to distinguish two different kinds of type parameter usage:
+Since substituting type parameters with their bound types is one fundamental operation in many parts of the resolver algorithm (particularly unification and environment binding), making as few copies of type nodes as possible helps reduce memory complexity. Even with the new memory management model, allocation is still a significant factor of resolver performance. Conceptually, operations on type nodes of the AST should be performed in functional-programming style, treating the data structure as immutable and only copying when necessary. The in-place mutation is a mere optimization that does not change the logic for operations.
+However, the model was broken for function types by an inappropriate design. Function types require special treatment due to the existence of assertions that constrain the types it supports. Specifically, it must be possible to distinguish two different kinds of type parameter usage:
 \begin{cfa}
 forall( dtype T ) void foo( T * t ) {
         forall( dtype U ) void bar( T * t, U * u ) { ... }
+}
 \end{cfa}
 Here, only @U@ is a free parameter in declaration of @bar@, as it appears in the function's own forall clause; while @T@ is not free.
 Moreover, the resolution algorithm also has to distinguish type bindings of multiple calls to the same function, for example with
+        forall( dtype U ) void bar( @T@ * t, @U@ * u ) { ... }
+}
+\end{cfa}
+Here, only @U@ is a free parameter in the nested declaration of function @bar@, as @T@ must be bound at the call site when resolving @bar@.
+Moreover, the resolution algorithm also has to distinguish type bindings of multiple calls to the same function, \eg:
 \begin{cfa}
 forall( dtype T ) int foo( T x );
 foo( foo( 1.0 ) );
 \end{cfa}
 The inner call has binding (T: double) while the outer call has binding (T: int). Therefore a unique representation of free parameters in each expression is required. This was previously done by creating a copy of the parameter declarations inside function type, and fixing references afterwards. However, fixing references is an inherently deep operation that does not work well with functional programming model, as it must be evaluated eagerly on the entire syntax tree representing the function type.
 The revised approach generates a unique ID value for each function call expression instance and represents an occurrence of free parameter type with a pair of generated ID and the original parameter declaration, so that references do not need to be fixed, and a shallow copy of function type is possible.
 Note that after the change, all declaration nodes in syntax tree representation maps one-to-one with the actual declarations in the program, and therefore are guaranteed to be unique. Such property can potentially enable more optimizations, and some related ideas are presented after Section~\ref{s:SharedSub-ExpressionCaseUniqueExpressions}.
+int i = foo( foo( 1.0 ) );
+\end{cfa}
+The inner call has binding (T: double) while the outer call has binding (T: int). Therefore a unique representation for the free parameters is required in each expression. This type binding was previously done by creating a copy of the parameter declarations inside the function type and fixing references afterwards. However, fixing references is an inherently deep operation that does not work well with the functional-programming style, as it forces eager evaluation on the entire syntax tree representing the function type.
+The revised approach generates a unique ID value for each function call expression instance and represents an occurrence of a free-parameter type with a pair of generated ID and original parameter declaration, so references are unique and a shallow copy of the function type is possible.
+Note that after the change, all declaration nodes in the syntax-tree representation now map one-to-one with the actual declarations in the program, and therefore are guaranteed to be unique. This property can potentially enable more optimizations, and some related ideas are presented at the end of \VRef{s:SharedSub-ExpressionCaseUniqueExpressions}.
 \subsection{Improvement of pruning steps}
 A minor improvement for candidate elimination is to skip the step on the function overloads themselves and only perform on results of function application. As function calls are usually by name, the name resolution rule dictates that every function candidate necessarily has a different type; indirect function calls are rare, and when they do appear, they usually will not have many possible interpretations, and those rarely matches exactly in argument type. Since function types have a much more complex representation than data types (with multiple parameters and assertions), checking equality on them also takes longer.
 A brief test of this approach shows that the number of function overloads considered in expression resolution increases by a negligible amount of less than 1 percent, while type comparisons in candidate elimination are cut by more than half. Improvement is consistent over all \CFA source files in the test suite.
+A minor improvement for candidate elimination is to skip the step on the function overloads and only check the results of function application. As function calls are usually by name (versus pointers to functions), the name resolution rule dictates that every function candidate necessarily has a different type; indirect function calls are rare, and when they do appear, there are even fewer cases with multiple interpretations, and these rarely match exactly in argument type. Since function types have a much more complex representation (with multiple parameters and assertions) than data types, checking equality on them also takes longer.
+A brief test of this approach shows that the number of function overloads considered in expression resolution increases by an amount of less than 1 percent, while type comparisons in candidate elimination are reduced by more than half. This improvement is consistent over all \CFA source files in the test suite.
 …
 \label{s:SharedSub-ExpressionCaseUniqueExpressions}
 Unique expression denotes an expression that must be evaluated only once, to prevent unwanted side effects. It is currently only a compiler artifact, generated on tuple member expression of the form
+Unique expression denotes an expression evaluated only once to prevent unwanted side effects. It is currently only a compiler artifact, generated for tuple-member expression of the form:
 \begin{cfa}
 struct S { int a; int b; };
 …
 s.[a, b]; // tuple member expression, type is [int, int]
 \end{cfa}
 If the aggregate expression contains function calls, it cannot be evaluated multiple times:
+If the aggregate expression is a function call, it cannot be evaluated multiple times:
 \begin{cfa}
 S makeS();
 makeS().[a, b]; // this should only make one S
+makeS().[a, b]; // this should only generate a unique S
 \end{cfa}
 Before code generation, the above expression is internally represented as
 …
 \end{cfa}
 at code generation, where @_unique_var@ and @_unique_var_evaluated@ are generated variables whose scope covers all appearances of the same expression.
+Note that although the unique expression is only used for tuple expansion now, it is a generally useful construction, and can be seen in other languages, such as Scala's @lazy val@~\cite{Scala}; therefore it could be worthwhile to introduce the unique expression to a broader context in \CFA and even make it directly available to programmers.
+In the compiler's visitor pattern, however, this creates a problem where multiple paths to a logically unique expression exist, so it may be modified more than once and become ill-formed; some specific intervention is required to ensure that unique expressions are only visited once. Furthermore, a unique expression appearing in more than one places will be copied on mutation so its representation is no longer unique. Some hacks are required to keep it in sync, and the methods are different when mutating the unique expression instance itself or its underlying expression.
+Example when mutating the underlying expression (visit-once guard)
+The conditional check ensures a single call to @makeS()@ even though there are logically multiple calls because of the tuple field expansion.
+Note that although the unique expression is only used for tuple expansion now, it is a generally useful construction, and is seen in other programming languages, such as Scala's @lazy val@~\cite{Scala}; therefore it may be worthwhile to introduce the unique expression to a broader context in \CFA and even make it directly available to programmers.
+In the compiler's visitor pattern, however, this creates a problem where multiple paths to a logically unique expression exist, so it may be modified more than once and become ill-formed; some specific intervention is required to ensure unique expressions are only visited once. Furthermore, a unique expression appearing in more than one place is copied on mutation so its representation is no longer unique.
+Currently, special cases are required to keep everything synchronized, and the methods are different when mutating the unique expression instance itself or its underlying expression:
+\begin{itemize}
+\item
+When mutating the underlying expression (visit-once guard)
 \begin{cfa}
 void InsertImplicitCalls::previsit( const ast::UniqueExpr * unqExpr ) {
         if ( visitedIds.count( unqExpr->id ) ) visit_children = false;
+        @if ( visitedIds.count( unqExpr->id ) ) visit_children = false;@
         else visitedIds.insert( unqExpr->id );
+}
 \end{cfa}
+Example when mutating the unique instance itself, which actually creates copies
+\item
+When mutating the unique instance itself, which actually creates copies
 \begin{cfa}
 auto mutExpr = mutate( unqExpr ); // internally calls copy when shared
+if ( ! unqMap.count( unqExpr->id ) ) {
+@if ( ! unqMap.count( unqExpr->id ) ) {@
         ...
 } else {
 …
+}
 \end{cfa}
+Such workaround seems difficult to be fit into a common visitor template. This suggests the memory model may need different kinds of nodes to accurately represent the syntax tree.
+Together with the fact that declaration nodes are always unique, it is possible that AST nodes can be classified by three different types:
+\begin{itemize}
+\item
+\textbf{Strictly unique} with only one owner (declarations);
+\item
+\textbf{Logically unique} with (possibly) many owners but should not be copied (unique expression example presented here);
+\item
+\textbf{Shared} by functional programming model, which assume immutable data structure and are copied on mutation.
+\end{itemize}
+Such workarounds are difficult to fit into the common visitor pattern, which suggests the memory model may need different kinds of nodes to accurately represent this feature in the AST.
+Given that declaration nodes are unique, it is possible for AST nodes to be divided into three different types:
+\begin{itemize}
+\item
+\textbf{Singleton} with only one owner (declarations);
+\item
+\textbf{No-copy} with multiple owners but cannot be copied (unique expression example presented here);
+\item
+\textbf{Copy} by functional-programming style, which assumes immutable data structures that are copied on mutation.
 \end{itemize}
 The boilerplate code can potentially handle these three cases differently.
 …
 \section{Analysis of resolver algorithm complexity}
 The focus of this chapter is to identify and analyze some realistic cases that cause resolver algorithm to have an exponential run time. As previous work has shown [3], the overload resolution problem in \CFA has worst-case exponential complexity; however, only few specific patterns can trigger the exponential complexity in practice. Implementing heuristic-based optimization for those selected cases is helpful to alleviate the problem.
+The focus of this section is to identify and analyze some realistic cases that cause the resolver algorithm to have an exponential runtime. As previous work has shown~\cite[\S~4.2.1]{Moss19}, the overload resolution problem in \CFA has worst-case exponential complexity; however, only a few specific patterns can trigger the exponential complexity in practice. Implementing heuristic-based optimization for those selected cases is helpful to alleviate the problem.
 …
 \label{s:UnboundReturnType}
 The interaction of return type overloading and polymorphic functions creates this problem of function calls with unbound return type, and is further complicated by the presence of assertions.
+The interaction of return-type overloading and polymorphic functions creates function calls with an unbound return type, and is further complicated by the presence of assertions.
 The prime example of a function with unbound return type is the type-safe version of C @malloc@:
 \begin{cfa}
+// size deduced from type, so no need to provide the size argument
+forall( dtype T | sized( T ) ) T * malloc( void );
+\end{cfa}
+Unbound return type can be problematic in resolver algorithm complexity because a single match of function call with unbound return type may create multiple candidates. In the worst case, consider a function declared to return any @otype@:
+forall( dtype T | sized( T ) )
+T * malloc( void ) { return (T *)malloc( sizeof(T) ); } // call C malloc
+int * i = malloc();  // type deduced from left-hand side $\Rightarrow$ no size argument or return cast
+\end{cfa}
+An unbound return-type is problematic in resolver complexity because a single match of a function call with an unbound return type may create multiple candidates. In the worst case, consider a function declared that returns any @otype@ (defined \VPageref{otype}):
 \begin{cfa}
 forall( otype T ) T anyObj( void );
 \end{cfa}
 As the resolver attempts to satisfy the otype constraint on @T@, a single call to @anyObj()@ without the result type known creates at least as many candidates as the number of complete types currently in scope; with generic types it becomes even worse, for example, assuming a declaration of generic pair is available at that point:
+As the resolver attempts to satisfy the otype constraint on @T@, a call to @anyObj()@ in an expression, without the result type known, creates at least as many candidates as the number of complete types currently in scope; with generic types it becomes even worse, \eg assuming a declaration of a generic @pair@ is available at that point:
 \begin{cfa}
 forall( otype T, otype U ) struct pair { T first; U second; };
 \end{cfa}
 Then an @anyObj()@ call can result in arbitrarily complex types, such as @pair( pair( int,int ), pair( int,int ) )@, and the depth can grow indefinitely until the specified parameter depth limit, thus creating exponentially many candidates. However, the expected types allowed by parent expressions are practically very few, so most of those interpretations are invalid; if the result type is never bound up to top level, by the semantic rules it is ambiguous if there are more than one valid bindings, and resolution can fail fast. It is therefore reasonable to delay resolving assertions on an unbound parameter in return type; however, with the current cost model, such behavior may further cause irregularities in candidate selection, such that the presence of assertions can change the preferred candidate, even when order of expression costs are supposed to stay the same. Detailed analysis of this issue will be presented later, in the correctness part.
+Then an @anyObj()@ call can result in arbitrarily complex types, such as @pair( pair( int, int ), pair( int, int ) )@, and the depth can grow indefinitely until a specified parameter-depth limit, thus creating exponentially many candidates. However, the expected types allowed by parent expressions are practically very few, so most of those interpretations are invalid; if the result type is never bound up to the top level, by the semantic rules it is ambiguous if there is more than one valid binding and resolution fails quickly. It is therefore reasonable to delay resolving assertions on an unbound parameter in a return type; however, with the current cost model, such behavior may further cause irregularities in candidate selection, such that the presence of assertions can change the preferred candidate, even when the order of expression costs is supposed to stay the same. A detailed analysis of this issue is presented in \VRef{s:AnalysisTypeSystemCorrectness}.
 …
 \label{s:TtypeResolutionInfiniteRecursion}
 @ttype@ (``tuple type'') is a relatively new addition to the language that attempts to provide type-safe variadic argument semantics. Unlike regular @dtype@ parameters, @ttype@ is only valid in function parameter list, and may only appear once as the type of last parameter. At the call site, a @ttype@ parameter is bound to the tuple type of all remaining function call arguments.
+@ttype@ (``tuple type'') is a relatively new addition to the language that attempts to provide type-safe variadic argument semantics. Unlike regular @dtype@ parameters, @ttype@ is only valid in a function parameter-list, and may only appear once as the last parameter type. At the call site, a @ttype@ parameter is bound to the tuple type of all remaining function-call arguments.
 There are two kinds of idiomatic @ttype@ usage: one is to provide flexible argument forwarding, similar to the variadic template in \CC (\lstinline[language=C++]|template<typename... args>|), as shown below in the implementation of @unique_ptr@
 …
         T * data;
 };
 forall( dtype T | sized( T ), ttype Args | { void ?{}( T &, Args ); })
 void ?{}( unique_ptr( T ) & this, Args args ) {
         this.data = new( args );
+}
 \end{cfa}
 the other is to implement structural recursion in the first-rest manner:
 \begin{cfa}
 forall( otype T, ttype Params | { void process( T ); void func( Params ); })
+forall( dtype T | sized( T ), @ttype Args@ | { void ?{}( T &, Args ); })
+void ?{}( unique_ptr( T ) & this, Args @args@ ) {
+        this.data = new( @args@ );  // forward constructor arguments to dynamic allocator
+}
+\end{cfa}
+The other usage is to implement structural recursion in the first-rest pattern:
+\begin{cfa}
+forall( otype T, @ttype Params@ | { void process( T ); void func( Params ); })
 void func( T arg1, Params p ) {
         process( arg1 );
+        func( p );
+}
+\end{cfa}
+For the second use case, it is important that the number of parameters in the recursive call go down, since the call site must deduce all assertion candidates, and that is only possible if by just looking at argument types (and not their values), the recursion is known to be completed in a finite number of steps.
+In recent experiments, however, some flaw in the type binding rules can lead to the first kind of @ttype@ use case produce an invalid candidate that the resolver enters an infinite loop.
+This bug was discovered in an attempt to raise assertion recursive depth limit and one of the library program takes exponentially longer time to compile. The cause of the problem is identified to be the following set of functions.
+File @memory.cfa@ contains
+\begin{cfa}
+#include "memory.hfa"
+#include "stdlib.hfa"
+\end{cfa}
+where file @memory.hfa@ contains the @unique_ptr@ declaration above, and two other similar functions with @ttype@ parameter:
+\begin{cfa}
+forall( dtype T | sized( T ), ttype Args | { void ?{}( T &, Args ); }) {
+        func( @p@ );  // recursive call until base case of one argument
+}
+\end{cfa}
+For the second use case, it is imperative the number of parameters in the recursive call goes down, since the call site must deduce all assertion candidates, and that is only possible if by observation of the argument types (and not their values), the recursion is known to be completed in a finite number of steps.
+In recent experiments, however, a flaw in the type-binding rules can lead to the first kind of @ttype@ use case producing an invalid candidate, causing the resolver to enter an infinite loop.
+This bug was discovered in an attempt to raise the assertion recursive-depth limit, when one of the library programs took exponentially longer to compile. The cause of the problem is the following set of functions:
+\begin{cfa}
+// unique_ptr  declaration from above
+forall( dtype T | sized( T ), ttype Args | { void ?{}( T &, Args ); } ) { // distribute forall clause
         void ?{}( counter_data( T ) & this, Args args );
         void ?{}( counter_ptr( T ) & this, Args args );
         void ?{}( unique_ptr( T ) & this, Args args );
+}
+\end{cfa}
+File @stdlib.hfa@ contains
+\begin{cfa}
 forall( dtype T | sized( T ), ttype TT | { void ?{}( T &, TT ); } )
+T * new( TT p ) { return &(*malloc()){ p }; }
+\end{cfa}
+In the expression @(*malloc()){p}@, the type of object being constructed is yet unknown, since the return type information is not immediately provided. That caused every constructor to be searched, and while normally a bound @ttype@ cannot be unified with any free parameter, it is possible with another free @ttype@. Therefore in addition to the correct option provided by assertion, 3 wrong options are examined, each of which again requires the same assertion, for an unknown base type T and @ttype@ arguments, and that becomes an infinite loop, until the specified recursion limit and resolution is forced to fail. Moreover, during the recursion steps, number of candidates grows exponentially, since there are always 3 options at each step.
+Unfortunately, @ttype@ to @ttype@ binding is necessary, to allow calling the function provided by assertion indirectly.
+\begin{cfa}
+forall( dtype T | sized( T ), ttype Args | { void ?{}( T &, Args ); })
+void ?{}( unique_ptr( T ) & this, Args args ) { this.data = (T * )new( args ); }
+\end{cfa}
+Here the constructor assertion is used for the @new( args )@ call.
+T * new( TT p ) { return @&(*malloc()){ p };@ }
+\end{cfa}
+In the expression @(*malloc()){p}@, the type of the object being constructed is unknown, since the return-type information is not immediately available. That causes every constructor to be searched, and while normally a bound @ttype@ cannot be unified with any free parameter, it is possible with another free @ttype@. Therefore, in addition to the correct option provided by the assertion, 3 wrong options are examined, each of which again requires the same assertion, for an unknown base-type @T@ and @ttype@ argument, which becomes an infinite loop until the specified recursion limit is reached and resolution fails. Moreover, during the recursion steps, the number of candidates grows exponentially, since there are always 3 options at each step.
+Unfortunately, @ttype@ to @ttype@ binding is necessary, to allow indirectly calling a function provided in an assertion.
+\begin{cfa}
+forall( dtype T | sized( T ), ttype Args | { @void ?{}( T &, Args );@ })
+void ?{}( unique_ptr( T ) & this, Args args ) { this.data = (T *)@new( args )@; } // constructor call
+\end{cfa}
+Here the constructor assertion is used by the @new( args )@ call to indirectly call the constructor on the allocated storage.
 Therefore, it is hard, perhaps impossible, to solve this problem by tweaking the type binding rules. An assertion caching algorithm can help improve this case by detecting cycles in recursion.
 Meanwhile, without the caching algorithm implemented, some changes in the \CFA source code are enough to eliminate this problem, at least in the current codebase. Note that the issue only happens with an overloaded variadic function, which rarely appears in practice, since the idiomatic use cases are for argument forwarding and self-recursion. The only overloaded @ttype@ function so far discovered in all of \CFA standard library code is the constructor, and by utilizing the argument-dependent lookup process described in Section~\ref{s:UnboundReturnType}, adding a cast before constructor call gets rid of the issue.
 \begin{cfa}
 T * new( TT p ) { return &(*(T * )malloc()){ p }; }
+Meanwhile, without a caching algorithm implemented, some changes in the \CFA source code are enough to eliminate this problem, at least in the current codebase. Note that the issue only happens with an overloaded variadic function, which rarely appears in practice, since the idiomatic use cases are for argument forwarding and self-recursion. The only overloaded @ttype@ function so far discovered in all of the \CFA standard library is the constructor, and by utilizing the argument-dependent lookup process described in \VRef{s:SpecialFunctionLookup}, adding a cast before the constructor call removes the issue.
+\begin{cfa}
+T * new( TT p ) { return &(*@(T * )@malloc()){ p }; }
 \end{cfa}
 …
 \subsection{Reused assertions in nested generic type}
 The following test of deeply nested dynamic generic type reveals that locally caching reused assertions is necessary, rather than just a resolver optimization, because recomputing assertions can result in bloated generated code size:
+The following test of a deeply nested, dynamic generic type reveals that locally caching reused assertions is necessary, rather than just a resolver optimization, because recomputing assertions can result in bloated generated code size:
 \begin{cfa}
 struct nil {};
 …
 int main() {
         #if   N==0
         nil x;
+        nil @x@;
         #elif N==1
         cons( size_t, nil ) x;
+        cons( size_t, nil ) @x@;
         #elif N==2
         cons( size_t, cons( size_t, nil ) ) x;
+        cons( size_t, cons( size_t, nil ) ) @x@;
         #elif N==3
         cons( size_t, cons( size_t, cons( size_t, nil ) ) ) x;
+        cons( size_t, cons( size_t, cons( size_t, nil ) ) ) @x@;
         // similarly for N=4,5,6
         #endif
+}
 \end{cfa}
 At the declaration of @x@, it is implicitly initialized by generated constructor call, whose signature is given by
+At the declaration of @x@, it is implicitly initialized by a generated constructor call, with signature:
 \begin{cfa}
 forall( otype L, otype R ) void ?{}( cons( L, R ) & );
 \end{cfa}
+Note that the @otype@ constraint contains 4 assertions:
+where the @otype@ constraint contains the 4 assertions:\label{otype}
 \begin{cfa}
 void ?{}( L & ); // default constructor
 …
 L & ?=?( L &, L & ); // assignment
 \end{cfa}
+Now since the right hand side of outermost cons is again a cons, recursive assertions are required. When the compiler cannot cache and reuse already resolved assertions, it becomes a problem, as each of those 4 pending assertions again asks for 4 more assertions one level below. Without any caching, number of resolved assertions grows exponentially, while that is obviously unnecessary since there are only $n+1$ different types involved. Even worse, this causes exponentially many wrapper functions generated later at the codegen step, and results in huge compiled binary.
 \begin{table}[h]
+\begin{table}[htb]
+\centering
 \caption{Compilation results of nested cons test}
+\label{t:NestedConsTest}
 \begin{tabular}{|r|r|r|}
 \hline
 …
 \end{table}
+As the local functions are implemented by emitting executable code on the stack~\cite{gcc-nested-func}, it eventually means that compiled code also has exponential run time. This problem has evident practical implications, as nested collection types are frequently used in real production code.
+Now, since the right-hand side of the outermost cons is again a cons, recursive assertions are required. \VRef[Table]{t:NestedConsTest} shows that when the compiler does not cache and reuse already resolved assertions, it becomes a problem, as each of these 4 pending assertions again asks for 4 more assertions one level below. Without caching, the number of resolved assertions grows exponentially, which is unnecessary since there are only $n+1$ different types involved. Even worse, this problem causes exponentially many wrapper functions to be generated at the backend, resulting in a huge binary. As the local functions are implemented by emitting executable code on the stack~\cite{gcc-nested-func}, it means that compiled code also has exponential run time. This problem has practical implications, as nested collection types are frequently used in real production code.
 \section{Analysis of type system correctness}
+\label{s:AnalysisTypeSystemCorrectness}
 In Moss' thesis~\cite[\S~4.1.2,~p.~45]{Moss19}, the author presents the following example:
 …
 From the set of candidates whose parameter and argument types have been unified and whose assertions have been satisfied, those whose sub-expression interpretations have the smallest total cost of conversion are selected ... The total cost of conversion for each of these candidates is then calculated based on the implicit conversions and polymorphism involved in adapting the types of the sub-expression interpretations to the formal parameter types.
 \end{quote}
+With this model, the algorithm picks @g1@ in resolving the @f( g( 42 ) )@ call, which seems to be undesirable.
+There are further evidence that shows the Bilson model is fundamentally incorrect, following the discussion of unbound return type in Section~\ref{s:UnboundReturnType}. By the conversion cost specification, a binding from a polymorphic type parameter to a concrete type incurs a polymorphic cost of 1. It remains unspecified \emph{when} the type parameters should become bound. When the parameterized types appear in the function parameters, they can be deduced from the argument type, and there is no ambiguity. In the unbound return case, however, the binding may happen at any stage in expression resolution, therefore it is impossible to define a unique local conversion cost. Note that type binding happens exactly once per parameter in resolving the entire expression, so the global binding cost is unambiguously 1.
+As per the current compiler implementation, it does have a notable inconsistency in handling such case. For any unbound parameter that does \emph{not} come with an associated assertion, it remains unbound to the parent expression; for those that does however, they are immediately bound in the assertion resolution step, and concrete result types are used in the parent expressions.
+With this model, the algorithm picks @g1@ in resolving the @f( g( 42 ) )@ call, which is undesirable.
+There is further evidence that shows the Bilson model is fundamentally incorrect, following the discussion of unbound return type in \VRef{s:UnboundReturnType}. By the conversion-cost specification, a binding from a polymorphic type-parameter to a concrete type incurs a polymorphic cost of 1. It remains unspecified \emph{when} the type parameters should become bound. When the parameterized types appear in function parameters, they can be deduced from the argument type, and there is no ambiguity. In the unbound return case, however, the binding may happen at any stage in expression resolution, therefore it is impossible to define a unique local conversion cost. Note that type binding happens exactly once per parameter in resolving the entire expression, so the global binding cost is unambiguously 1.
+In the current compiler implementation, there is a notable inconsistency in handling this case. For any unbound parameter that does \emph{not} come with an associated assertion, it remains unbound to the parent expression; for those that do, however, they are immediately bound in the assertion resolution step, and concrete result types are used in the parent expressions.
 Consider the following example:
 \begin{cfa}
 …
 void h( int * );
 \end{cfa}
 The expression @h( f() )@ eventually has a total cost of 1 from binding (T: int), but in the eager resolution model, the cost of 1 may occur either at call to @f@ or at call to @h@, and with the assertion resolution triggering a binding, the local cost of @f()@ is (0 poly, 0 spec) with no assertions, but (1 poly, -1 spec) with an assertion:
 \begin{cfa}
 forall( dtype T | { void g( T * ); } ) T * f( void );
+The expression @h( f() )@ eventually has a total cost of 1 from binding (T: int), but in the eager-resolution model, the cost of 1 may occur either at the call to @f@ or at the call to @h@, and with the assertion resolution triggering a binding, the local cost of @f()@ is (0 poly, 0 spec) with no assertions, but (1 poly, -1 spec) with an assertion:
+\begin{cfa}
+forall( dtype T | @{ void g( T * ); }@ ) T * f( void );
 void g( int * );
 void h( int * );
 \end{cfa}
 and that contradicts the principle that adding assertions should make expression cost lower. Furthermore, the time at which type binding and assertion resolution happens is an implementation detail of the compiler, but not a part of language definition. That means two compliant \CFA compilers, one performing immediate assertion resolution at each step, and one delaying assertion resolution on unbound types, can produce different expression costs and therefore different candidate selection, making the language rule itself partially undefined and therefore unsound. By the above reasoning, the updated cost model using global sum of costs should be accepted as the standard. It also allows the compiler to freely choose when to resolve assertions, as the sum of total costs is independent of that choice; more optimizations regarding assertion resolution can also be implemented.
+and that contradicts the principle that adding assertions should make expression cost lower. Furthermore, the time at which type binding and assertion resolution happens is an implementation detail of the compiler, not part of the language definition. That means two compliant \CFA compilers, one performing immediate assertion resolution at each step, and one delaying assertion resolution on unbound types, can produce different expression costs and therefore different candidate selection, making the language rule itself partially undefined, and therefore, unsound. By the above reasoning, the updated cost model using global sum of costs should be accepted as the standard. It also allows the compiler to freely choose when to resolve assertions, as the sum of total costs is independent of that choice; more optimizations regarding assertion resolution can also be implemented.
 \section{Timing results}
+For the timing results presented here, the \CFA compiler is built with gcc 9.3.0, and tested on a server machine running Ubuntu 20.04, 64GB RAM and 32-core 2.2 GHz CPU, results reported by the time command, and using only 8 cores in parallel such that the time is close to the case with 100% CPU utilization on a single thread.
+On the most recent build, the \CFA standard library (~1.3 MB of source code) compiles in 4 minutes 47 seconds total processor time (single thread equivalent), with the slowest file taking 13 seconds. The test suite (178 test cases, ~2.2MB of source code) completes within 25 minutes total processor time,\footnote{Including a few runtime tests; total time spent in compilation is approximately 21 minutes.} with the slowest file taking 23 seconds. In contrast, the library build on old compiler takes 85 minutes total, 5 minutes for the slowest file. Full test suite takes too long with old compiler build and is therefore not run, but the slowest test cases take approximately 5 minutes. Overall, the most recent build compared to old build in April 2020, before the project started, is consistently faster by a factor of 20.
+Additionally, 6 selected \CFA source files with distinct features from library and test suite are used to test compiler performance after each of the optimizations are implemented. Test files are from the most recent build and run through C preprocessor to eliminate the factor of header file changes. The selected tests are:
+\begin{itemize}
+\item
+@lib/fstream@ (112 KB)\footnote{File sizes are after preprocessing, with no line information (\lstinline|gcc -E -P|).}: implementation of I/O library
+For the timing results presented here, the \CFA compiler is built with gcc 9.3.0, and tested on a server machine running Ubuntu 20.04, 64GB RAM and 32-core 2.2 GHz CPU.
+Timing is reported by the @time@ command and an experiment is run using 8 cores, where each core is at 100\% CPU utilization.
+On the most recent build, the \CFA standard library ($\approx$1.3 MB of source code) compiles in 4 minutes 47 seconds total processor time (single thread equivalent), with the slowest file taking 13 seconds. The test suite (178 test cases, $\approx$2.2 MB of source code) completes within 25 minutes total processor time,
+% PAB: I do not understand this footnote.
+%\footnote{Including a few runtime tests; total time spent in compilation is approximately 21 minutes.}
+with the slowest file taking 23 seconds. In contrast, the library build with the old compiler takes 85 minutes total, 5 minutes for the slowest file. The full test-suite takes too long with the old compiler build and is therefore not run, but the slowest test cases take approximately 5 minutes. Overall, the most recent build compared to an old build is consistently faster by a factor of 20.
+Additionally, 6 selected \CFA source files with distinct features from the library and test suite are used to illustrate the compiler performance change after each of the implemented optimizations. Test files are from the most recent build and run through the C preprocessor to expand header files and perform macro expansions, but with no line-number information (@gcc -E -P@).
+\VRef[Table]{t:SelectedFileByCompilerBuild} shows the selected tests:
+\begin{itemize}
+\item
+@lib/fstream@ (112 KB)
 \item
 @lib/mutex@ (166 KB): implementation of concurrency primitive
 …
 @lib/stdlib@ (64 KB): type-safe wrapper to @void *@-based C standard library functions
 \item
 @test/ISO2@ (55 KB): application of I/O library
+@test/io2@ (55 KB): application of I/O library
 \item
 @test/thread@ (188 KB): application of threading library
 \end{itemize}
+The \CFA compiler builds are picked from git commit history that passed the test suite, and implement the optimizations incrementally:
+\begin{itemize}
+\item
+\#0 is the first working build of new AST data structure
+versus \CFA compiler builds picked from the git commit history that implement the optimizations incrementally:
+\begin{itemize}
+\item
+old resolver
+\item
+\#0 is the first working build of the new AST data structure
 \item
 \#1 implements special symbol table and argument-dependent lookup
 \item
+\#2 implements late assertion satisfaction
+\item
+\#3 implements revised function type representation
+\item
+\#4 skips pruning on expressions with function type (most recent build)
+\end{itemize}
+The old resolver with no memory sharing and none of the optimizations above is also tested.
+\begin{table}
+\#2 implements late assertion-satisfaction
+\item
+\#3 implements revised function-type representation
+\item
+\#4 skips pruning on expressions for function types (most recent build)
+\end{itemize}
+Reading left to right for a test shows the benefit of each optimization on the cost of compilation.
+\begin{table}[htb]
+\centering
 \caption{Compile time of selected files by compiler build, in seconds}
+\label{t:SelectedFileByCompilerBuild}
 \begin{tabular}{|l|r|r|r|r|r|r|}
 \hline
 …
 \end{table}
 \section{Conclusion}
 Over the course of 8 months of active research and development in \CFA type system and compiler algorithm, performance of the reference \CFA compiler, cfa-cc, has been greatly improved, allowing mid-sized \CFA programs to be compiled and built reasonably fast. As there are also ongoing efforts in the team on building a standard library, evaluating the runtime performance, and attempting to incorporate \CFA with existing software written in C, this project is especially meaningful for practical purposes.
 Analysis conducted in the project were based significantly on heuristics and practical evidence, as the theoretical bounds and average cases for the expression resolution problem differ. This approach was difficult at start to follow, with an unacceptably slow compiler, since running the program through debugger and validation tools (\eg @gdb@, @valgrind@) adds another order of magnitude to run time, which was already in minutes. However, near the end of the project, many significant improvements have already been made and new optimizations can be tested immediately. The positive feedback in development cycle benefits the \CFA team as a whole, more than just for the compiler optimizations.
 Some potential issues of the language that may happen frequently in practice have been identified. Due to the time constraint and complex nature of these problems, a handful of them remain unsolved, but some constructive proposals are made. Notably, introducing a local assertion cache in the resolver is a common solution for a few remaining problems, so that should be the focus of work soon.
 The \CFA team are planning on a public alpha release of the language as the compiler performance becomes promising, and other parts of the system, such as a standard library, are also being enhanced. Ideally, the remaining problems should be resolved before release, and the solutions will also be integral to drafting a formal specification.
+Over the course of 8 months of active research and development of the \CFA type system and compiler algorithms, performance of the reference \CFA compiler, cfa-cc, has been greatly improved. Now, mid-sized \CFA programs are compiled reasonably fast. Currently, there are ongoing efforts by the \CFA team to augment the standard library and evaluate its runtime performance, and incorporate \CFA with existing software written in C; therefore this project is especially meaningful for these practical purposes.
+Accomplishing this work was difficult. Analysis conducted in the project is based significantly on heuristics and practical evidence, as the theoretical bounds and average cases for the expression resolution problem differ. As well, the slowness of the initial compiler made attempts to understand why and where problems exist extremely difficult because both debugging and validation tools (\eg @gdb@, @valgrind@, @perf@) further slowed down compilation time. However, by the end of the project, I had found and fixed several significant problems and new optimizations are easier to introduce and test. The reduction in the development cycle benefits the \CFA team as a whole.
+Some potential issues of the language, which happen frequently in practice, have been identified. Due to the time constraint and complex nature of these problems, a handful of them remain unsolved, but some constructive proposals are made. Notably, introducing a local assertion cache in the resolver is a reasonable solution for a few remaining problems, so that should be the focus of future work.
+The \CFA team are planning on a public alpha release of the language as the compiler performance, given my recent improvements, is now useable. Other parts of the system, such as the standard library, have made significant gains due to the speed up in the development cycle. Ideally, the remaining problems should be resolved before release, and the solutions will also be integral to drafting a formal specification.
 \addcontentsline{toc}{section}{\refname}

doc/theses/fangren_yu_COOP_S20/Report.tex

rb6a8b31	rd95969a
17	17	\usepackage[usenames]{color}
18	18	\input{common} % common CFA document macros
19		\usepackage[dvips,plainpages=false,pdfpagelabels,pdfpagemode=UseNone,colorlinks=true,~~pagebackref=true,~~linkcolor=blue,citecolor=blue,urlcolor=blue,pagebackref=true,breaklinks=true]{hyperref}
	19	\usepackage[dvips,plainpages=false,pdfpagelabels,pdfpagemode=UseNone,colorlinks=true,linkcolor=blue,citecolor=blue,urlcolor=blue,pagebackref=true,breaklinks=true]{hyperref}
20	20	\usepackage{breakurl}
21	21	\urlstyle{sf}

driver/cfa.cc

-              rb6a8b31
+              rd95969a
 // Created On       : Tue Aug 20 13:44:49 2002
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Nov 17 14:27:28 2020
 // Update Count     : 440
+// Last Modified On : Sat Jan 16 07:30:19 2021
+// Update Count     : 442
 //
 …
                 args[nargs++] = "-no-integrated-cpp";
                 args[nargs++] = "-Wno-deprecated";
+                args[nargs++] = "-Wno-strict-aliasing";                 // casting from one type to another
                 #ifdef HAVE_CAST_FUNCTION_TYPE
                 args[nargs++] = "-Wno-cast-function-type";

libcfa/prelude/builtins.c

-              rb6a8b31
+              rd95969a
 // type that wraps a pointer and a destructor-like function - used in generating implicit destructor calls for struct members in user-defined functions
 // Note: needs to occur early, because it is used to generate destructor calls during code generation
 forall(dtype T)
+forall(T &)
 struct __Destructor {
         T * object;
 …
 // defined destructor in the case that non-generated code wants to use __Destructor
 forall(dtype T)
+forall(T &)
 static inline void ^?{}(__Destructor(T) & x) {
         if (x.object && x.dtor) {
 …
 // easy interface into __Destructor's destructor for easy codegen purposes
 extern "C" {
         forall(dtype T)
+        forall(T &)
         static inline void __destroy_Destructor(__Destructor(T) * dtor) {
                 ^(*dtor){};
 …
 void abort( const char fmt[], ... ) __attribute__ (( format(printf, 1, 2), __nothrow__, __leaf__, __noreturn__ ));
 forall(dtype T)
+forall(T &)
 static inline T & identity(T & i) {
         return i;
 …
 static inline void ^?{}($generator &) {}
 trait is_generator(dtype T) {
+trait is_generator(T &) {
       void main(T & this);
       $generator * get_generator(T & this);
 };
 forall(dtype T | is_generator(T))
+forall(T & | is_generator(T))
 static inline T & resume(T & gen) {
         main(gen);
 …
 static inline {
         forall( dtype DT | { DT & ?+=?( DT &, one_t ); } )
+        forall( DT & | { DT & ?+=?( DT &, one_t ); } )
         DT & ++?( DT & x ) { return x += 1; }
         forall( dtype DT | sized(DT) | { void ?{}( DT &, DT ); void ^?{}( DT & ); DT & ?+=?( DT &, one_t ); } )
+        forall( DT & | sized(DT) | { void ?{}( DT &, DT ); void ^?{}( DT & ); DT & ?+=?( DT &, one_t ); } )
         DT & ?++( DT & x ) { DT tmp = x; x += 1; return tmp; }
         forall( dtype DT | { DT & ?-=?( DT &, one_t ); } )
+        forall( DT & | { DT & ?-=?( DT &, one_t ); } )
         DT & --?( DT & x ) { return x -= 1; }
         forall( dtype DT | sized(DT) | { void ?{}( DT &, DT ); void ^?{}( DT & ); DT & ?-=?( DT &, one_t ); } )
+        forall( DT & | sized(DT) | { void ?{}( DT &, DT ); void ^?{}( DT & ); DT & ?-=?( DT &, one_t ); } )
         DT & ?--( DT & x ) { DT tmp = x; x -= 1; return tmp; }
         forall( dtype DT | { int ?!=?( const DT &, zero_t ); } )
+        forall( DT & | { int ?!=?( const DT &, zero_t ); } )
         int !?( const DT & x ) { return !( x != 0 ); }
 } // distribution
 // universal typed pointer constant
 static inline forall( dtype DT ) DT * intptr( uintptr_t addr ) { return (DT *)addr; }
+static inline forall( DT & ) DT * intptr( uintptr_t addr ) { return (DT *)addr; }
 static inline forall( ftype FT ) FT * intptr( uintptr_t addr ) { return (FT *)addr; }
 …
 #define __CFA_EXP_OVERFLOW__()
 static inline forall( otype OT | { void ?{}( OT & this, one_t ); OT ?*?( OT, OT ); } ) {
+static inline forall( OT | { void ?{}( OT & this, one_t ); OT ?*?( OT, OT ); } ) {
         OT ?\?( OT ep, unsigned int y ) { __CFA_EXP__(); }
         OT ?\?( OT ep, unsigned long int y ) { __CFA_EXP__(); }

libcfa/prelude/prelude-gen.cc

rb6a8b31	rd95969a
159	159	int main() {
160	160	cout << "# 2 \"prelude.cfa\" // needed for error messages from this file" << endl;
161		cout << "trait sized(~~dtype T~~) {};" << endl;
	161	cout << "trait sized(T &) {};" << endl;
162	162
163	163	cout << "//////////////////////////" << endl;
…	…
264	264	for (auto cvq : qualifiersPair) {
265	265	for (auto is_vol : { " ", "volatile" }) {
266		cout << "forall(~~dtype DT~~) void ?{}(" << cvq.first << type << " * " << is_vol << " &, " << cvq.second << "DT *);" << endl;
	266	cout << "forall(DT &) void ?{}(" << cvq.first << type << " * " << is_vol << " &, " << cvq.second << "DT *);" << endl;
267	267	}
268	268	}
…	…
279	279	for (auto cvq : qualifiersSingle) {
280	280	for (auto is_vol : { " ", "volatile" }) {
281		cout << "forall(~~dtype DT~~) void ?{}(" << cvq << " DT" << " * " << is_vol << " &);" << endl;
	281	cout << "forall(DT &) void ?{}(" << cvq << " DT" << " * " << is_vol << " &);" << endl;
282	282	}
283	283	for (auto is_vol : { " ", "volatile" }) {
284		cout << "forall(~~dtype DT~~) void ^?{}(" << cvq << " DT" << " * " << is_vol << " &);" << endl;
	284	cout << "forall(DT &) void ^?{}(" << cvq << " DT" << " * " << is_vol << " &);" << endl;
285	285	}
286	286	}
…	…
290	290	for (auto is_vol : { " ", "volatile" }) {
291	291	for (auto cvq : qualifiersSingle) {
292		cout << "forall(~~dtype DT~~) void ?{}( " << cvq << type << " * " << is_vol << " &, zero_t);" << endl;
	292	cout << "forall(DT &) void ?{}( " << cvq << type << " * " << is_vol << " &, zero_t);" << endl;
293	293	}
294	294	}
…	…
317	317	for (auto op : pointerOperators) {
318	318	auto forall = [&op]() {
319		cout << "forall(~~dtype DT~~" << op.sized << ") ";
	319	cout << "forall(DT &" << op.sized << ") ";
320	320	};
321	321	for (auto type : { "DT"/*, "void"*/ } ) {
…	…
408	408	for (auto is_vol : { " ", "volatile" }) {
409	409	for (auto cvq : qualifiersPair) {
410		cout << "forall(~~dtype DT~~) " << cvq.first << "void * ?=?( " << cvq.first << "void * " << is_vol << " &, " << cvq.second << "DT *);" << endl;
	410	cout << "forall(DT &) " << cvq.first << "void * ?=?( " << cvq.first << "void * " << is_vol << " &, " << cvq.second << "DT *);" << endl;
411	411	}
412	412	for (auto cvq : qualifiersSingle) {
413		cout << "forall(~~dtype DT~~) " << cvq << " DT * ?=?( " << cvq << " DT * " << is_vol << " &, zero_t);" << endl;
	413	cout << "forall(DT &) " << cvq << " DT * ?=?( " << cvq << " DT * " << is_vol << " &, zero_t);" << endl;
414	414	}
415	415	}

libcfa/prelude/prelude.old.cf

-              rb6a8b31
+              rd95969a
 // ------------------------------------------------------------
 trait sized(dtype T) {};
+trait sized(T &) {};
 // ------------------------------------------------------------
 …
 long double _Complex    ?--( long double _Complex & ),          ?--( volatile long double _Complex & );
 forall( dtype T | sized(T) ) T *                         ?++(                T *& );
 forall( dtype T | sized(T) ) const T *           ?++( const          T *& );
 forall( dtype T | sized(T) ) volatile T *                ?++(       volatile T *& );
 forall( dtype T | sized(T) ) const volatile T *  ?++( const volatile T *& );
 forall( dtype T | sized(T) ) T *                         ?--(                T *& );
 forall( dtype T | sized(T) ) const T *           ?--( const          T *& );
 forall( dtype T | sized(T) ) volatile T *                ?--(       volatile T *& );
 forall( dtype T | sized(T) ) const volatile T *  ?--( const volatile T *& );
 forall( dtype T | sized(T) ) T &                 ?[?](                T *,          ptrdiff_t );
 forall( dtype T | sized(T) ) const T &   ?[?]( const          T *,          ptrdiff_t );
 forall( dtype T | sized(T) ) volatile T &        ?[?](       volatile T *,          ptrdiff_t );
 forall( dtype T | sized(T) ) const volatile T & ?[?]( const volatile T *,           ptrdiff_t );
 forall( dtype T | sized(T) ) T &                 ?[?](          ptrdiff_t,                T * );
 forall( dtype T | sized(T) ) const T &   ?[?](          ptrdiff_t, const          T * );
 forall( dtype T | sized(T) ) volatile T &        ?[?](          ptrdiff_t,       volatile T * );
 forall( dtype T | sized(T) ) const volatile T & ?[?](           ptrdiff_t, const volatile T * );
+forall( T & | sized(T) ) T *                     ?++(                T *& );
+forall( T & | sized(T) ) const T *               ?++( const          T *& );
+forall( T & | sized(T) ) volatile T *            ?++(       volatile T *& );
+forall( T & | sized(T) ) const volatile T *      ?++( const volatile T *& );
+forall( T & | sized(T) ) T *                     ?--(                T *& );
+forall( T & | sized(T) ) const T *               ?--( const          T *& );
+forall( T & | sized(T) ) volatile T *            ?--(       volatile T *& );
+forall( T & | sized(T) ) const volatile T *      ?--( const volatile T *& );
+forall( T & | sized(T) ) T &             ?[?](                T *,          ptrdiff_t );
+forall( T & | sized(T) ) const T &       ?[?]( const          T *,          ptrdiff_t );
+forall( T & | sized(T) ) volatile T &    ?[?](       volatile T *,          ptrdiff_t );
+forall( T & | sized(T) ) const volatile T & ?[?]( const volatile T *,       ptrdiff_t );
+forall( T & | sized(T) ) T &             ?[?](          ptrdiff_t,                T * );
+forall( T & | sized(T) ) const T &       ?[?](          ptrdiff_t, const          T * );
+forall( T & | sized(T) ) volatile T &    ?[?](          ptrdiff_t,       volatile T * );
+forall( T & | sized(T) ) const volatile T & ?[?](               ptrdiff_t, const volatile T * );
 // ------------------------------------------------------------
 …
 long double _Complex    ++?( long double _Complex & ),          --?( long double _Complex & );
 forall( dtype T | sized(T) ) T *                         ++?(                T *& );
 forall( dtype T | sized(T) ) const T *           ++?( const          T *& );
 forall( dtype T | sized(T) ) volatile T *                ++?(       volatile T *& );
 forall( dtype T | sized(T) ) const volatile T *  ++?( const volatile T *& );
 forall( dtype T | sized(T) ) T *                         --?(                T *& );
 forall( dtype T | sized(T) ) const T *           --?( const          T *& );
 forall( dtype T | sized(T) ) volatile T *                --?(       volatile T *& );
 forall( dtype T | sized(T) ) const volatile T *  --?( const volatile T *& );
 forall( dtype T | sized(T) ) T &                 *?(                 T * );
 forall( dtype T | sized(T) ) const T &           *?( const           T * );
 forall( dtype T | sized(T) ) volatile T &        *?(       volatile  T * );
 forall( dtype T | sized(T) ) const volatile T & *?( const volatile  T * );
+forall( T & | sized(T) ) T *                     ++?(                T *& );
+forall( T & | sized(T) ) const T *               ++?( const          T *& );
+forall( T & | sized(T) ) volatile T *            ++?(       volatile T *& );
+forall( T & | sized(T) ) const volatile T *      ++?( const volatile T *& );
+forall( T & | sized(T) ) T *                     --?(                T *& );
+forall( T & | sized(T) ) const T *               --?( const          T *& );
+forall( T & | sized(T) ) volatile T *            --?(       volatile T *& );
+forall( T & | sized(T) ) const volatile T *      --?( const volatile T *& );
+forall( T & | sized(T) ) T &             *?(                 T * );
+forall( T & | sized(T) ) const T &               *?( const           T * );
+forall( T & | sized(T) ) volatile T &    *?(       volatile  T * );
+forall( T & | sized(T) ) const volatile T & *?( const volatile  T * );
 forall( ftype FT ) FT &          *?( FT * );
 …
                 !?( float _Complex ),           !?( double _Complex ),          !?( long double _Complex );
 forall( dtype DT ) int !?(                DT * );
 forall( dtype DT ) int !?( const          DT * );
 forall( dtype DT ) int !?(       volatile DT * );
 forall( dtype DT ) int !?( const volatile DT * );
+forall( DT & ) int !?(                DT * );
+forall( DT & ) int !?( const          DT * );
+forall( DT & ) int !?(       volatile DT * );
+forall( DT & ) int !?( const volatile DT * );
 forall( ftype FT ) int !?( FT * );
 …
 long double _Complex    ?+?( long double _Complex, long double _Complex ),      ?-?( long double _Complex, long double _Complex );
 forall( dtype T | sized(T) ) T *                ?+?(                T *,          ptrdiff_t );
 forall( dtype T | sized(T) ) T *                ?+?(          ptrdiff_t,                T * );
 forall( dtype T | sized(T) ) const T *          ?+?( const          T *,          ptrdiff_t );
 forall( dtype T | sized(T) ) const T *          ?+?(          ptrdiff_t, const          T * );
 forall( dtype T | sized(T) ) volatile T *       ?+?(       volatile T *,          ptrdiff_t );
 forall( dtype T | sized(T) ) volatile T *       ?+?(          ptrdiff_t,       volatile T * );
 forall( dtype T | sized(T) ) const volatile T * ?+?( const volatile T *,          ptrdiff_t );
 forall( dtype T | sized(T) ) const volatile T * ?+?(          ptrdiff_t, const volatile T * );
 forall( dtype T | sized(T) ) T *                ?-?(                T *,          ptrdiff_t );
 forall( dtype T | sized(T) ) const T *          ?-?( const          T *,          ptrdiff_t );
 forall( dtype T | sized(T) ) volatile T *       ?-?(       volatile T *,          ptrdiff_t );
 forall( dtype T | sized(T) ) const volatile T * ?-?( const volatile T *,          ptrdiff_t );
 forall( dtype T | sized(T) ) ptrdiff_t          ?-?( const volatile T *, const volatile T * );
+forall( T & | sized(T) ) T *            ?+?(                T *,          ptrdiff_t );
+forall( T & | sized(T) ) T *            ?+?(          ptrdiff_t,                T * );
+forall( T & | sized(T) ) const T *              ?+?( const          T *,          ptrdiff_t );
+forall( T & | sized(T) ) const T *              ?+?(          ptrdiff_t, const          T * );
+forall( T & | sized(T) ) volatile T *   ?+?(       volatile T *,          ptrdiff_t );
+forall( T & | sized(T) ) volatile T *   ?+?(          ptrdiff_t,       volatile T * );
+forall( T & | sized(T) ) const volatile T *     ?+?( const volatile T *,          ptrdiff_t );
+forall( T & | sized(T) ) const volatile T *     ?+?(          ptrdiff_t, const volatile T * );
+forall( T & | sized(T) ) T *            ?-?(                T *,          ptrdiff_t );
+forall( T & | sized(T) ) const T *              ?-?( const          T *,          ptrdiff_t );
+forall( T & | sized(T) ) volatile T *   ?-?(       volatile T *,          ptrdiff_t );
+forall( T & | sized(T) ) const volatile T *     ?-?( const volatile T *,          ptrdiff_t );
+forall( T & | sized(T) ) ptrdiff_t              ?-?( const volatile T *, const volatile T * );
 // ------------------------------------------------------------
 …
            ?>?( long double, long double ),                             ?>=?( long double, long double );
 forall( dtype DT ) signed int ?<?(                 DT *,                DT * );
 forall( dtype DT ) signed int ?<?(  const          DT *, const          DT * );
 forall( dtype DT ) signed int ?<?(        volatile DT *,       volatile DT * );
 forall( dtype DT ) signed int ?<?(  const volatile DT *, const volatile DT * );
 forall( dtype DT ) signed int ?>?(                 DT *,                DT * );
 forall( dtype DT ) signed int ?>?(  const          DT *, const          DT * );
 forall( dtype DT ) signed int ?>?(        volatile DT *,       volatile DT * );
 forall( dtype DT ) signed int ?>?(  const volatile DT *, const volatile DT * );
 forall( dtype DT ) signed int ?<=?(                 DT *,                DT * );
 forall( dtype DT ) signed int ?<=?(  const          DT *, const          DT * );
 forall( dtype DT ) signed int ?<=?(        volatile DT *,       volatile DT * );
 forall( dtype DT ) signed int ?<=?( const volatile DT *, const volatile DT * );
 forall( dtype DT ) signed int ?>=?(                 DT *,                DT * );
 forall( dtype DT ) signed int ?>=?(  const          DT *, const          DT * );
 forall( dtype DT ) signed int ?>=?(        volatile DT *,       volatile DT * );
 forall( dtype DT ) signed int ?>=?( const volatile DT *, const volatile DT * );
+forall( DT & ) signed int ?<?(                 DT *,                DT * );
+forall( DT & ) signed int ?<?(  const          DT *, const          DT * );
+forall( DT & ) signed int ?<?(        volatile DT *,       volatile DT * );
+forall( DT & ) signed int ?<?(  const volatile DT *, const volatile DT * );
+forall( DT & ) signed int ?>?(                 DT *,                DT * );
+forall( DT & ) signed int ?>?(  const          DT *, const          DT * );
+forall( DT & ) signed int ?>?(        volatile DT *,       volatile DT * );
+forall( DT & ) signed int ?>?(  const volatile DT *, const volatile DT * );
+forall( DT & ) signed int ?<=?(                 DT *,                DT * );
+forall( DT & ) signed int ?<=?(  const          DT *, const          DT * );
+forall( DT & ) signed int ?<=?(        volatile DT *,       volatile DT * );
+forall( DT & ) signed int ?<=?( const volatile DT *, const volatile DT * );
+forall( DT & ) signed int ?>=?(                 DT *,                DT * );
+forall( DT & ) signed int ?>=?(  const          DT *, const          DT * );
+forall( DT & ) signed int ?>=?(        volatile DT *,       volatile DT * );
+forall( DT & ) signed int ?>=?( const volatile DT *, const volatile DT * );
 // ------------------------------------------------------------
 …
 signed int ?==?( one_t, one_t ),                                                        ?!=?( one_t, one_t );
 forall( dtype DT ) signed int ?==?(                DT *,                DT * );
 forall( dtype DT ) signed int ?==?( const          DT *, const          DT * );
 forall( dtype DT ) signed int ?==?(       volatile DT *,       volatile DT * );
 forall( dtype DT ) signed int ?==?( const volatile DT *, const volatile DT * );
+forall( DT & ) signed int ?==?(            DT *,                DT * );
+forall( DT & ) signed int ?==?( const      DT *, const          DT * );
+forall( DT & ) signed int ?==?(       volatile DT *,       volatile DT * );
+forall( DT & ) signed int ?==?( const volatile DT *, const volatile DT * );
 forall( ftype FT ) signed int ?==?( FT *, FT * );
 forall( dtype DT ) signed int ?!=?(                DT *,                DT * );
 forall( dtype DT ) signed int ?!=?( const          DT *, const          DT * );
 forall( dtype DT ) signed int ?!=?(       volatile DT *,       volatile DT * );
 forall( dtype DT ) signed int ?!=?( const volatile DT *, const volatile DT * );
+forall( DT & ) signed int ?!=?(            DT *,                DT * );
+forall( DT & ) signed int ?!=?( const      DT *, const          DT * );
+forall( DT & ) signed int ?!=?(       volatile DT *,       volatile DT * );
+forall( DT & ) signed int ?!=?( const volatile DT *, const volatile DT * );
 forall( ftype FT ) signed int ?!=?( FT *, FT * );
 …
 forall( ftype FT ) FT *                 ?=?( FT *&, FT * );
 forall( ftype FT ) FT *                 ?=?( FT * volatile &, FT * );
 forall( dtype DT ) DT *                 ?=?(                 DT *          &,                   DT * );
 forall( dtype DT ) DT *                 ?=?(                 DT * volatile &,                   DT * );
 forall( dtype DT ) const DT *           ?=?( const           DT *          &,                   DT * );
 forall( dtype DT ) const DT *           ?=?( const           DT * volatile &,                   DT * );
 forall( dtype DT ) const DT *           ?=?( const           DT *          &, const             DT * );
 forall( dtype DT ) const DT *           ?=?( const           DT * volatile &, const             DT * );
 forall( dtype DT ) volatile DT *        ?=?(       volatile  DT *          &,                   DT * );
 forall( dtype DT ) volatile DT *        ?=?(       volatile  DT * volatile &,                   DT * );
 forall( dtype DT ) volatile DT *        ?=?(       volatile  DT *          &,       volatile    DT * );
 forall( dtype DT ) volatile DT *        ?=?(       volatile  DT * volatile &,       volatile    DT * );
 forall( dtype DT ) const volatile DT *  ?=?( const volatile  DT *          &,                   DT * );
 forall( dtype DT ) const volatile DT *  ?=?( const volatile  DT * volatile &,                   DT * );
 forall( dtype DT ) const volatile DT *  ?=?( const volatile  DT *          &, const             DT * );
 forall( dtype DT ) const volatile DT *  ?=?( const volatile  DT * volatile &, const             DT * );
 forall( dtype DT ) const volatile DT *  ?=?( const volatile  DT *          &,       volatile    DT * );
 forall( dtype DT ) const volatile DT *  ?=?( const volatile  DT * volatile &,       volatile    DT * );
 forall( dtype DT ) const volatile DT *  ?=?( const volatile  DT *          &, const volatile    DT * );
 forall( dtype DT ) const volatile DT *  ?=?( const volatile  DT * volatile &, const volatile    DT * );
 forall( dtype DT ) void *                ?=?(                void *          &,                 DT * );
 forall( dtype DT ) void *                ?=?(                void * volatile &,                 DT * );
 forall( dtype DT ) const void *          ?=?( const          void *          &,                 DT * );
 forall( dtype DT ) const void *          ?=?( const          void * volatile &,                 DT * );
 forall( dtype DT ) const void *          ?=?( const          void *          &, const           DT * );
 forall( dtype DT ) const void *          ?=?( const          void * volatile &, const           DT * );
 forall( dtype DT ) volatile void *       ?=?(       volatile void *          &,                 DT * );
 forall( dtype DT ) volatile void *       ?=?(       volatile void * volatile &,                 DT * );
 forall( dtype DT ) volatile void *       ?=?(       volatile void *          &,       volatile  DT * );
 forall( dtype DT ) volatile void *       ?=?(       volatile void * volatile &,       volatile  DT * );
 forall( dtype DT ) const volatile void * ?=?( const volatile void *          &,                 DT * );
 forall( dtype DT ) const volatile void * ?=?( const volatile void * volatile &,                 DT * );
 forall( dtype DT ) const volatile void * ?=?( const volatile void *          &, const           DT * );
 forall( dtype DT ) const volatile void * ?=?( const volatile void * volatile &, const           DT * );
 forall( dtype DT ) const volatile void * ?=?( const volatile void *          &,       volatile  DT * );
 forall( dtype DT ) const volatile void * ?=?( const volatile void * volatile &,       volatile  DT * );
 forall( dtype DT ) const volatile void * ?=?( const volatile void *          &, const volatile  DT * );
 forall( dtype DT ) const volatile void * ?=?( const volatile void * volatile &, const volatile  DT * );
+forall( ftype FT ) FT *                 ?=?( FT * volatile &, FT * );
+forall( DT & ) DT *                     ?=?(                 DT *          &,                   DT * );
+forall( DT & ) DT *                     ?=?(                 DT * volatile &,                   DT * );
+forall( DT & ) const DT *               ?=?( const           DT *          &,                   DT * );
+forall( DT & ) const DT *               ?=?( const           DT * volatile &,                   DT * );
+forall( DT & ) const DT *               ?=?( const           DT *          &, const             DT * );
+forall( DT & ) const DT *               ?=?( const           DT * volatile &, const             DT * );
+forall( DT & ) volatile DT *    ?=?(       volatile  DT *          &,                   DT * );
+forall( DT & ) volatile DT *    ?=?(       volatile  DT * volatile &,                   DT * );
+forall( DT & ) volatile DT *    ?=?(       volatile  DT *          &,       volatile    DT * );
+forall( DT & ) volatile DT *    ?=?(       volatile  DT * volatile &,       volatile    DT * );
+forall( DT & ) const volatile DT *      ?=?( const volatile  DT *          &,                   DT * );
+forall( DT & ) const volatile DT *  ?=?( const volatile  DT * volatile &,                       DT * );
+forall( DT & ) const volatile DT *  ?=?( const volatile  DT *      &, const             DT * );
+forall( DT & ) const volatile DT *  ?=?( const volatile  DT * volatile &, const         DT * );
+forall( DT & ) const volatile DT *  ?=?( const volatile  DT *      &,       volatile    DT * );
+forall( DT & ) const volatile DT *  ?=?( const volatile  DT * volatile &,           volatile    DT * );
+forall( DT & ) const volatile DT *  ?=?( const volatile  DT *      &, const volatile    DT * );
+forall( DT & ) const volatile DT *  ?=?( const volatile  DT * volatile &, const volatile        DT * );
+forall( DT & ) void *            ?=?(                void *          &,                 DT * );
+forall( DT & ) void *            ?=?(                void * volatile &,                 DT * );
+forall( DT & ) const void *              ?=?( const          void *          &,                 DT * );
+forall( DT & ) const void *              ?=?( const          void * volatile &,                 DT * );
+forall( DT & ) const void *              ?=?( const          void *          &, const           DT * );
+forall( DT & ) const void *              ?=?( const          void * volatile &, const           DT * );
+forall( DT & ) volatile void *   ?=?(       volatile void *          &,                 DT * );
+forall( DT & ) volatile void *   ?=?(       volatile void * volatile &,                 DT * );
+forall( DT & ) volatile void *   ?=?(       volatile void *          &,       volatile  DT * );
+forall( DT & ) volatile void *   ?=?(       volatile void * volatile &,       volatile  DT * );
+forall( DT & ) const volatile void * ?=?( const volatile void *      &,                 DT * );
+forall( DT & ) const volatile void * ?=?( const volatile void * volatile &,                     DT * );
+forall( DT & ) const volatile void * ?=?( const volatile void *      &, const           DT * );
+forall( DT & ) const volatile void * ?=?( const volatile void * volatile &, const               DT * );
+forall( DT & ) const volatile void * ?=?( const volatile void *      &,       volatile  DT * );
+forall( DT & ) const volatile void * ?=?( const volatile void * volatile &,           volatile  DT * );
+forall( DT & ) const volatile void * ?=?( const volatile void *      &, const volatile  DT * );
+forall( DT & ) const volatile void * ?=?( const volatile void * volatile &, const volatile      DT * );
 //forall( dtype DT ) DT *                       ?=?(                DT *          &, zero_t );
 //forall( dtype DT ) DT *                       ?=?(                DT * volatile &, zero_t );
 forall( dtype DT ) const DT *           ?=?( const          DT *          &, zero_t );
 forall( dtype DT ) const DT *           ?=?( const          DT * volatile &, zero_t );
+forall( DT & ) const DT *               ?=?( const          DT *          &, zero_t );
+forall( DT & ) const DT *               ?=?( const          DT * volatile &, zero_t );
 //forall( dtype DT ) volatile DT *      ?=?( volatile       DT *          &, zero_t );
 //forall( dtype DT ) volatile DT *      ?=?( volatile       DT * volatile &, zero_t );
 forall( dtype DT ) const volatile DT *  ?=?( const volatile DT *          &, zero_t );
 forall( dtype DT ) const volatile DT *  ?=?( const volatile DT * volatile &, zero_t );
+forall( DT & ) const volatile DT *      ?=?( const volatile DT *          &, zero_t );
+forall( DT & ) const volatile DT *      ?=?( const volatile DT * volatile &, zero_t );
 forall( ftype FT ) FT *                 ?=?( FT *          &, zero_t );
 forall( ftype FT ) FT *                 ?=?( FT * volatile &, zero_t );
 forall( dtype T | sized(T) ) T *                ?+=?(                T *          &, ptrdiff_t );
 forall( dtype T | sized(T) ) T *                ?+=?(                T * volatile &, ptrdiff_t );
 forall( dtype T | sized(T) ) const T *          ?+=?( const          T *          &, ptrdiff_t );
 forall( dtype T | sized(T) ) const T *          ?+=?( const          T * volatile &, ptrdiff_t );
 forall( dtype T | sized(T) ) volatile T *       ?+=?(       volatile T *          &, ptrdiff_t );
 forall( dtype T | sized(T) ) volatile T *       ?+=?(       volatile T * volatile &, ptrdiff_t );
 forall( dtype T | sized(T) ) const volatile T * ?+=?( const volatile T *          &, ptrdiff_t );
 forall( dtype T | sized(T) ) const volatile T * ?+=?( const volatile T * volatile &, ptrdiff_t );
 forall( dtype T | sized(T) ) T *                ?-=?(                T *          &, ptrdiff_t );
 forall( dtype T | sized(T) ) T *                ?-=?(                T * volatile &, ptrdiff_t );
 forall( dtype T | sized(T) ) const T *          ?-=?( const          T *          &, ptrdiff_t );
 forall( dtype T | sized(T) ) const T *          ?-=?( const          T * volatile &, ptrdiff_t );
 forall( dtype T | sized(T) ) volatile T *       ?-=?(       volatile T *          &, ptrdiff_t );
 forall( dtype T | sized(T) ) volatile T *       ?-=?(       volatile T * volatile &, ptrdiff_t );
 forall( dtype T | sized(T) ) const volatile T * ?-=?( const volatile T *          &, ptrdiff_t );
 forall( dtype T | sized(T) ) const volatile T * ?-=?( const volatile T * volatile &, ptrdiff_t );
+forall( T & | sized(T) ) T *            ?+=?(                T *          &, ptrdiff_t );
+forall( T & | sized(T) ) T *            ?+=?(                T * volatile &, ptrdiff_t );
+forall( T & | sized(T) ) const T *              ?+=?( const          T *          &, ptrdiff_t );
+forall( T & | sized(T) ) const T *              ?+=?( const          T * volatile &, ptrdiff_t );
+forall( T & | sized(T) ) volatile T *   ?+=?(       volatile T *          &, ptrdiff_t );
+forall( T & | sized(T) ) volatile T *   ?+=?(       volatile T * volatile &, ptrdiff_t );
+forall( T & | sized(T) ) const volatile T *     ?+=?( const volatile T *          &, ptrdiff_t );
+forall( T & | sized(T) ) const volatile T *     ?+=?( const volatile T * volatile &, ptrdiff_t );
+forall( T & | sized(T) ) T *            ?-=?(                T *          &, ptrdiff_t );
+forall( T & | sized(T) ) T *            ?-=?(                T * volatile &, ptrdiff_t );
+forall( T & | sized(T) ) const T *              ?-=?( const          T *          &, ptrdiff_t );
+forall( T & | sized(T) ) const T *              ?-=?( const          T * volatile &, ptrdiff_t );
+forall( T & | sized(T) ) volatile T *   ?-=?(       volatile T *          &, ptrdiff_t );
+forall( T & | sized(T) ) volatile T *   ?-=?(       volatile T * volatile &, ptrdiff_t );
+forall( T & | sized(T) ) const volatile T *     ?-=?( const volatile T *          &, ptrdiff_t );
+forall( T & | sized(T) ) const volatile T *     ?-=?( const volatile T * volatile &, ptrdiff_t );
 _Bool                   ?=?( _Bool &, _Bool ),                                  ?=?( volatile _Bool &, _Bool );
 …
 forall( ftype FT ) void ?{}( FT * volatile &, FT * );
 forall( dtype DT ) void ?{}(                 DT *          &,                   DT * );
 forall( dtype DT ) void ?{}( const           DT *          &,                   DT * );
 forall( dtype DT ) void ?{}( const           DT *          &, const             DT * );
 forall( dtype DT ) void ?{}(       volatile  DT *          &,                   DT * );
 forall( dtype DT ) void ?{}(       volatile  DT *          &,       volatile    DT * );
 forall( dtype DT ) void ?{}( const volatile  DT *          &,                   DT * );
 forall( dtype DT ) void ?{}( const volatile  DT *          &, const             DT * );
 forall( dtype DT ) void ?{}( const volatile  DT *          &,       volatile    DT * );
 forall( dtype DT ) void ?{}( const volatile  DT *          &, const volatile    DT * );
 forall( dtype DT ) void ?{}(                 void *          &,                 DT * );
 forall( dtype DT ) void ?{}( const           void *          &,                 DT * );
 forall( dtype DT ) void ?{}( const           void *          &, const           DT * );
 forall( dtype DT ) void ?{}(        volatile void *          &,                 DT * );
 forall( dtype DT ) void ?{}(        volatile void *          &,       volatile  DT * );
 forall( dtype DT ) void ?{}( const volatile void *           &,                 DT * );
 forall( dtype DT ) void ?{}( const volatile void *           &, const           DT * );
 forall( dtype DT ) void ?{}( const volatile void *           &,       volatile  DT * );
 forall( dtype DT ) void ?{}( const volatile void *           &, const volatile  DT * );
+forall( DT & ) void ?{}(                     DT *          &,                   DT * );
+forall( DT & ) void ?{}( const       DT *          &,                   DT * );
+forall( DT & ) void ?{}( const       DT *          &, const             DT * );
+forall( DT & ) void ?{}(           volatile  DT *          &,                   DT * );
+forall( DT & ) void ?{}(           volatile  DT *          &,       volatile    DT * );
+forall( DT & ) void ?{}( const volatile  DT *      &,                   DT * );
+forall( DT & ) void ?{}( const volatile  DT *      &, const             DT * );
+forall( DT & ) void ?{}( const volatile  DT *      &,       volatile    DT * );
+forall( DT & ) void ?{}( const volatile  DT *      &, const volatile    DT * );
+forall( DT & ) void ?{}(                     void *          &,                 DT * );
+forall( DT & ) void ?{}( const       void *          &,                 DT * );
+forall( DT & ) void ?{}( const       void *          &, const           DT * );
+forall( DT & ) void ?{}(            volatile void *          &,                 DT * );
+forall( DT & ) void ?{}(            volatile void *          &,       volatile  DT * );
+forall( DT & ) void ?{}( const volatile void *       &,                 DT * );
+forall( DT & ) void ?{}( const volatile void *       &, const           DT * );
+forall( DT & ) void ?{}( const volatile void *       &,       volatile  DT * );
+forall( DT & ) void ?{}( const volatile void *       &, const volatile  DT * );
 //forall( dtype DT ) void ?{}(              DT *          &, zero_t );
 //forall( dtype DT ) void ?{}(              DT * volatile &, zero_t );
 forall( dtype DT ) void ?{}( const          DT *          &, zero_t );
+forall( DT & ) void ?{}( const      DT *          &, zero_t );
 //forall( dtype DT ) void ?{}( volatile     DT *          &, zero_t );
 //forall( dtype DT ) void ?{}( volatile     DT * volatile &, zero_t );
 forall( dtype DT ) void ?{}( const volatile DT *          &, zero_t );
+forall( DT & ) void ?{}( const volatile DT *      &, zero_t );
 forall( ftype FT ) void ?{}( FT *          &, zero_t );
 …
 forall( ftype FT ) void ?{}( FT *          & );
 forall( dtype DT ) void ?{}(                 DT *          &);
 forall( dtype DT ) void ?{}( const           DT *          &);
 forall( dtype DT ) void ?{}(       volatile  DT *          &);
 forall( dtype DT ) void ?{}( const volatile  DT *          &);
+forall( DT & ) void     ?{}(                 DT *          &);
+forall( DT & ) void     ?{}( const           DT *          &);
+forall( DT & ) void     ?{}(       volatile  DT *          &);
+forall( DT & ) void ?{}( const volatile  DT *      &);
 void    ?{}(                void *          &);
 …
 forall( ftype FT ) void ^?{}( FT *         & );
 forall( dtype DT ) void ^?{}(                DT *          &);
 forall( dtype DT ) void ^?{}( const          DT *          &);
 forall( dtype DT ) void ^?{}(      volatile  DT *          &);
 forall( dtype DT ) void ^?{}( const volatile  DT *         &);
+forall( DT & ) void     ^?{}(                DT *          &);
+forall( DT & ) void     ^?{}( const          DT *          &);
+forall( DT & ) void     ^?{}(      volatile  DT *          &);
+forall( DT & ) void ^?{}( const volatile  DT *     &);
 void ^?{}(                  void *          &);

libcfa/prelude/sync-builtins.cf

-              rb6a8b31
+              rd95969a
 _Bool __sync_bool_compare_and_swap(volatile unsigned __int128 *, unsigned __int128, unsigned __int128,...);
 #endif
 forall(dtype T) _Bool __sync_bool_compare_and_swap(T * volatile *, T *, T*, ...);
+forall(T &) _Bool __sync_bool_compare_and_swap(T * volatile *, T *, T*, ...);
 char __sync_val_compare_and_swap(volatile char *, char, char,...);
 …
 unsigned __int128 __sync_val_compare_and_swap(volatile unsigned __int128 *, unsigned __int128, unsigned __int128,...);
 #endif
 forall(dtype T) T * __sync_val_compare_and_swap(T * volatile *, T *, T*,...);
+forall(T &) T * __sync_val_compare_and_swap(T * volatile *, T *, T*,...);
 char __sync_lock_test_and_set(volatile char *, char,...);
 …
 void __atomic_exchange(volatile unsigned __int128 *, volatile unsigned __int128 *, volatile unsigned __int128 *, int);
 #endif
 forall(dtype T) T * __atomic_exchange_n(T * volatile *, T *, int);
 forall(dtype T) void __atomic_exchange(T * volatile *, T * volatile *, T * volatile *, int);
+forall(T &) T * __atomic_exchange_n(T * volatile *, T *, int);
+forall(T &) void __atomic_exchange(T * volatile *, T * volatile *, T * volatile *, int);
 _Bool __atomic_load_n(const volatile _Bool *, int);
 …
 void __atomic_load(const volatile unsigned __int128 *, volatile unsigned __int128 *, int);
 #endif
 forall(dtype T) T * __atomic_load_n(T * const volatile *, int);
 forall(dtype T) void __atomic_load(T * const volatile *, T **, int);
+forall(T &) T * __atomic_load_n(T * const volatile *, int);
+forall(T &) void __atomic_load(T * const volatile *, T **, int);
 _Bool __atomic_compare_exchange_n(volatile char *, char *, char, _Bool, int, int);
 …
 _Bool __atomic_compare_exchange   (volatile unsigned __int128 *, unsigned __int128 *, unsigned __int128 *, _Bool, int, int);
 #endif
 forall(dtype T) _Bool __atomic_compare_exchange_n (T * volatile *, T **, T*, _Bool, int, int);
 forall(dtype T) _Bool __atomic_compare_exchange   (T * volatile *, T **, T**, _Bool, int, int);
+forall(T &) _Bool __atomic_compare_exchange_n (T * volatile *, T **, T*, _Bool, int, int);
+forall(T &) _Bool __atomic_compare_exchange   (T * volatile *, T **, T**, _Bool, int, int);
 void __atomic_store_n(volatile _Bool *, _Bool, int);
 …
 void __atomic_store(volatile unsigned __int128 *, unsigned __int128 *, int);
 #endif
 forall(dtype T) void __atomic_store_n(T * volatile *, T *, int);
 forall(dtype T) void __atomic_store(T * volatile *, T **, int);
+forall(T &) void __atomic_store_n(T * volatile *, T *, int);
+forall(T &) void __atomic_store(T * volatile *, T **, int);
 char __atomic_add_fetch  (volatile char *, char, int);

libcfa/src/Makefile.am

rb6a8b31	rd95969a
76	76	stdlib.hfa \
77	77	time.hfa \
	78	bits/weakso_locks.hfa \
78	79	containers/maybe.hfa \
79	80	containers/pair.hfa \

libcfa/src/bitmanip.hfa

-              rb6a8b31
+              rd95969a
         unsigned long long int floor2( unsigned long long int n, unsigned long long int align ) { verify( is_pow2( align ) ); return n & -align; }
         // forall( otype T | { T ?&?( T, T ); T -?( T ); } )
+        // forall( T | { T ?&?( T, T ); T -?( T ); } )
         // T floor2( T n, T align ) { verify( is_pow2( align ) ); return n & -align; }
 …
         unsigned long long int ceiling2( unsigned long long int n, unsigned long long int align ) { verify( is_pow2( align ) ); return -floor2( -n, align ); }
         // forall( otype T | { T floor2( T, T ); T -?( T ); } )
+        // forall( T | { T floor2( T, T ); T -?( T ); } )
         // T ceiling2( T n, T align ) { verify( is_pow2( align ) ); return -floor2( -n, align ); }
 } // distribution

libcfa/src/bits/algorithm.hfa

-              rb6a8b31
+              rd95969a
 #ifdef SAFE_SORT
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort2( T * arr );
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort3( T * arr );
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort4( T * arr );
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort5( T * arr );
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort6( T * arr );
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sortN( T * arr, size_t dim );
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort2( T * arr );
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort3( T * arr );
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort4( T * arr );
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort5( T * arr );
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort6( T * arr );
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sortN( T * arr, size_t dim );
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } )
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } )
 static inline void __libcfa_small_sort( T * arr, size_t dim ) {
         switch( dim ) {
 …
 #define SWAP(x,y) { T a = min(arr[x], arr[y]); T b = max(arr[x], arr[y]); arr[x] = a; arr[y] = b;}
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } )
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } )
 static inline void __libcfa_small_sort2( T * arr ) {
         SWAP(0, 1);
+}
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } )
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } )
 static inline void __libcfa_small_sort3( T * arr ) {
         SWAP(1, 2);
 …
+}
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } )
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } )
 static inline void __libcfa_small_sort4( T * arr ) {
         SWAP(0, 1);
 …
+}
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } )
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } )
 static inline void __libcfa_small_sort5( T * arr ) {
         SWAP(0, 1);
 …
+}
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } )
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } )
 static inline void __libcfa_small_sort6( T * arr ) {
         SWAP(1, 2);
 …
+}
 forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } )
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } )
 static inline void __libcfa_small_sortN( T * arr, size_t dim ) {
         int i, j;
 …
 static inline void __libcfa_small_sortN( void* * arr, size_t dim );
 forall( dtype T )
+forall( T & )
 static inline void __libcfa_small_sort( T* * arr, size_t dim ) {
         switch( dim ) {

libcfa/src/bits/collection.hfa

-              rb6a8b31
+              rd95969a
+//
+// Cforall Version 1.0.0 Copyright (C) 2021 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// bits/collection.hfa -- PUBLIC
+// Intrusive singly-linked list
+//
+// Author           : Colby Alexander Parsons & Peter A. Buhr
+// Created On       : Thu Jan 21 19:46:50 2021
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
 #pragma once
-#include <stdio.h> // REMOVE THIS AFTER DEBUGGING
 struct Colable {
         struct Colable * next;                                                                          // next node in the list
+        // next node in the list
         // invariant: (next != 0) <=> listed()
+        struct Colable * next;
 };
 #ifdef __cforall
 …
         // // wrappers to make Collection have T
         // forall( dtype T ) {
+        // forall( T & ) {
         //      T *& Next( T * n ) {
         //              return (T *)Next( (Colable *)n );
 …
 } // distribution
 forall( dtype T | { T *& Next ( T * ); } ) {
+static inline forall( T & | { T *& Next ( T * ); } ) {
         bool listed( T * n ) {
                 return Next( n ) != 0p;
 …
         Collection & ?=?( const Collection & ) = void;          // no assignment
         void ?{}( Collection & collection ) with( collection ) {
+        void ?{}( Collection & collection ) with( collection ) {
                 root = 0p;
         } // post: empty()
 …
         } // post: elts = null
         forall( dtype T ) {
+        forall( T & ) {
                 T * Curr( ColIter & ci ) with( ci ) {
                         return (T *)curr;

libcfa/src/bits/containers.hfa

-              rb6a8b31
+              rd95969a
 #ifdef __cforall
         forall(dtype T)
+        forall(T &)
 #else
         #define T void
 …
 #ifdef __cforall
         // forall(otype T | sized(T))
+        // forall(T | sized(T))
         // static inline void ?{}(__small_array(T) & this) {}
         forall(dtype T | sized(T))
+        forall(T & | sized(T))
         static inline T & ?[?]( __small_array(T) & this, __lock_size_t idx ) {
                 return ((typeof(this.data))this.data)[idx];
+        }
         forall(dtype T | sized(T))
+        forall(T & | sized(T))
         static inline T & ?[?]( const __small_array(T) & this, __lock_size_t idx ) {
                 return ((typeof(this.data))this.data)[idx];
+        }
         forall(dtype T)
+        forall(T &)
         static inline T * begin( const __small_array(T) & this ) {
                 return ((typeof(this.data))this.data);
+        }
         forall(dtype T | sized(T))
+        forall(T & | sized(T))
         static inline T * end( const __small_array(T) & this ) {
                 return ((typeof(this.data))this.data) + this.size;
 …
 #ifdef __cforall
         trait is_node(dtype T) {
+        trait is_node(T &) {
                 T *& get_next( T & );
         };
 …
 //-----------------------------------------------------------------------------
 #ifdef __cforall
         forall(dtype TYPE)
+        forall(TYPE &)
         #define T TYPE
 #else
 …
 #ifdef __cforall
         forall(dtype T)
+        forall(T &)
         static inline void ?{}( __stack(T) & this ) {
                 (this.top){ 0p };
+        }
         static inline forall( dtype T | is_node(T) ) {
+        static inline forall( T & | is_node(T) ) {
                 void push( __stack(T) & this, T * val ) {
                         verify( !get_next( *val ) );
 …
 //-----------------------------------------------------------------------------
 #ifdef __cforall
         forall(dtype TYPE)
+        forall(TYPE &)
         #define T TYPE
 #else
 …
 #ifdef __cforall
         static inline forall( dtype T | is_node(T) ) {
+        static inline forall( T & | is_node(T) ) {
                 void ?{}( __queue(T) & this ) with( this ) {
                         (this.head){ 1p };
 …
 //-----------------------------------------------------------------------------
 #ifdef __cforall
         forall(dtype TYPE)
+        forall(TYPE &)
         #define T TYPE
         #define __getter_t * [T * & next, T * & prev] ( T & )
 …
 #ifdef __cforall
         forall(dtype T )
+        forall(T & )
         static inline [void] ?{}( __dllist(T) & this, * [T * & next, T * & prev] ( T & ) __get ) {
                 (this.head){ 0p };
 …
         #define next 0
         #define prev 1
         static inline forall(dtype T) {
+        static inline forall(T &) {
                 void push_front( __dllist(T) & this, T & node ) with( this ) {
                         verify(__get);

libcfa/src/bits/defs.hfa

-              rb6a8b31
+              rd95969a
 // file "LICENCE" distributed with Cforall.
 //
+// defs.hfa --
+// defs.hfa -- Common macros, functions and typedefs
+// Most files depend on them and they are always useful to have.
+//
+//  *** Must not contain code specific to libcfathread ***
 //
 // Author           : Thierry Delisle
 …
         #endif
+}
+// pause to prevent excess processor bus usage
+#if defined( __i386 ) || defined( __x86_64 )
+        #define Pause() __asm__ __volatile__ ( "pause" : : : )
+#elif defined( __ARM_ARCH )
+        #define Pause() __asm__ __volatile__ ( "YIELD" : : : )
+#else
+        #error unsupported architecture
+#endif

libcfa/src/bits/locks.hfa

-              rb6a8b31
+              rd95969a
 // file "LICENCE" distributed with Cforall.
 //
+// bits/locks.hfa -- Fast internal locks.
+// bits/locks.hfa -- Basic spinlocks that are reused in the system.
+// Used for locks that aren't specific to cforall threads and can be used anywhere
+//
+//  *** Must not contain code specific to libcfathread ***
 //
 // Author           : Thierry Delisle
 …
 #include "bits/defs.hfa"
 #include <assert.h>
-#ifdef __cforall
-        extern "C" {
-                #include <pthread.h>
+        }
-#endif
-// pause to prevent excess processor bus usage
-#if defined( __i386 ) || defined( __x86_64 )
-        #define Pause() __asm__ __volatile__ ( "pause" : : : )
-#elif defined( __ARM_ARCH )
-        #define Pause() __asm__ __volatile__ ( "YIELD" : : : )
-#else
-        #error unsupported architecture
-#endif
 struct __spinlock_t {
 …
                 enable_interrupts_noPoll();
+        }
-        #ifdef __CFA_WITH_VERIFY__
-                extern bool __cfaabi_dbg_in_kernel();
-        #endif
-        extern "C" {
-                char * strerror(int);
+        }
-        #define CHECKED(x) { int err = x; if( err != 0 ) abort("KERNEL ERROR: Operation \"" #x "\" return error %d - %s\n", err, strerror(err)); }
-        struct __bin_sem_t {
-                pthread_mutex_t         lock;
-                pthread_cond_t          cond;
-                int                     val;
-        };
-        static inline void ?{}(__bin_sem_t & this) with( this ) {
-                // Create the mutex with error checking
-                pthread_mutexattr_t mattr;
-                pthread_mutexattr_init( &mattr );
-                pthread_mutexattr_settype( &mattr, PTHREAD_MUTEX_ERRORCHECK_NP);
-                pthread_mutex_init(&lock, &mattr);
-                pthread_cond_init (&cond, (const pthread_condattr_t *)0p);  // workaround trac#208: cast should not be required
-                val = 0;
+        }
-        static inline void ^?{}(__bin_sem_t & this) with( this ) {
-                CHECKED( pthread_mutex_destroy(&lock) );
-                CHECKED( pthread_cond_destroy (&cond) );
+        }
-        static inline void wait(__bin_sem_t & this) with( this ) {
-                verify(__cfaabi_dbg_in_kernel());
-                CHECKED( pthread_mutex_lock(&lock) );
-                        while(val < 1) {
-                                pthread_cond_wait(&cond, &lock);
+                        }
-                        val -= 1;
-                CHECKED( pthread_mutex_unlock(&lock) );
+        }
-        static inline bool post(__bin_sem_t & this) with( this ) {
-                bool needs_signal = false;
-                CHECKED( pthread_mutex_lock(&lock) );
-                        if(val < 1) {
-                                val += 1;
-                                pthread_cond_signal(&cond);
-                                needs_signal = true;
+                        }
-                CHECKED( pthread_mutex_unlock(&lock) );
-                return needs_signal;
+        }
-        #undef CHECKED
-        struct $thread;
-        extern void park( void );
-        extern void unpark( struct $thread * this );
-        static inline struct $thread * active_thread ();
-        // Semaphore which only supports a single thread
-        struct single_sem {
-                struct $thread * volatile ptr;
-        };
-        static inline {
-                void  ?{}(single_sem & this) {
-                        this.ptr = 0p;
+                }
-                void ^?{}(single_sem &) {}
-                bool wait(single_sem & this) {
-                        for() {
-                                struct $thread * expected = this.ptr;
-                                if(expected == 1p) {
-                                        if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-                                                return false;
+                                        }
+                                }
-                                else {
-                                        /* paranoid */ verify( expected == 0p );
-                                        if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-                                                park();
-                                                return true;
+                                        }
+                                }
+                        }
+                }
-                bool post(single_sem & this) {
-                        for() {
-                                struct $thread * expected = this.ptr;
-                                if(expected == 1p) return false;
-                                if(expected == 0p) {
-                                        if(__atomic_compare_exchange_n(&this.ptr, &expected, 1p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-                                                return false;
+                                        }
+                                }
-                                else {
-                                        if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-                                                unpark( expected );
-                                                return true;
+                                        }
+                                }
+                        }
+                }
+        }
-        // Synchronization primitive which only supports a single thread and one post
-        // Similar to a binary semaphore with a 'one shot' semantic
-        // is expected to be discarded after each party call their side
-        struct oneshot {
-                // Internal state :
-                //     0p     : is initial state (wait will block)
-                //     1p     : fulfilled (wait won't block)
-                // any thread : a thread is currently waiting
-                struct $thread * volatile ptr;
-        };
-        static inline {
-                void  ?{}(oneshot & this) {
-                        this.ptr = 0p;
+                }
-                void ^?{}(oneshot &) {}
-                // Wait for the post, return immediately if it already happened.
-                // return true if the thread was parked
-                bool wait(oneshot & this) {
-                        for() {
-                                struct $thread * expected = this.ptr;
-                                if(expected == 1p) return false;
-                                /* paranoid */ verify( expected == 0p );
-                                if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-                                        park();
-                                        /* paranoid */ verify( this.ptr == 1p );
-                                        return true;
+                                }
+                        }
+                }
-                // Mark as fulfilled, wake thread if needed
-                // return true if a thread was unparked
-                bool post(oneshot & this) {
-                        struct $thread * got = __atomic_exchange_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
-                        if( got == 0p ) return false;
-                        unpark( got );
-                        return true;
+                }
+        }
-        // base types for future to build upon
-        // It is based on the 'oneshot' type to allow multiple futures
-        // to block on the same instance, permitting users to block a single
-        // thread on "any of" [a given set of] futures.
-        // does not support multiple threads waiting on the same future
-        struct future_t {
-                // Internal state :
-                //     0p      : is initial state (wait will block)
-                //     1p      : fulfilled (wait won't block)
-                //     2p      : in progress ()
-                //     3p      : abandoned, server should delete
-                // any oneshot : a context has been setup to wait, a thread could wait on it
-                struct oneshot * volatile ptr;
-        };
-        static inline {
-                void  ?{}(future_t & this) {
-                        this.ptr = 0p;
+                }
-                void ^?{}(future_t &) {}
-                void reset(future_t & this) {
-                        // needs to be in 0p or 1p
-                        __atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);
+                }
-                // check if the future is available
-                bool available( future_t & this ) {
-                        return this.ptr == 1p;
+                }
-                // Prepare the future to be waited on
-                // intended to be used by wait, wait_any, waitfor, etc. rather than used directly
-                bool setup( future_t & this, oneshot & wait_ctx ) {
-                        /* paranoid */ verify( wait_ctx.ptr == 0p );
-                        // The future needs to set the wait context
-                        for() {
-                                struct oneshot * expected = this.ptr;
-                                // Is the future already fulfilled?
-                                if(expected == 1p) return false; // Yes, just return false (didn't block)
-                                // The future is not fulfilled, try to setup the wait context
-                                /* paranoid */ verify( expected == 0p );
-                                if(__atomic_compare_exchange_n(&this.ptr, &expected, &wait_ctx, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-                                        return true;
+                                }
+                        }
+                }
-                // Stop waiting on a future
-                // When multiple futures are waited for together in "any of" pattern
-                // futures that weren't fulfilled before the thread woke up
-                // should retract the wait ctx
-                // intended to be used by wait, wait_any, waitfor, etc. rather than used directly
-                void retract( future_t & this, oneshot & wait_ctx ) {
-                        // Remove the wait context
-                        struct oneshot * got = __atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);
-                        // got == 0p: future was never actually setup, just return
-                        if( got == 0p ) return;
-                        // got == wait_ctx: since fulfil does an atomic_swap,
-                        // if we got back the original then no one else saw context
-                        // It is safe to delete (which could happen after the return)
-                        if( got == &wait_ctx ) return;
-                        // got == 1p: the future is ready and the context was fully consumed
-                        // the server won't use the pointer again
-                        // It is safe to delete (which could happen after the return)
-                        if( got == 1p ) return;
-                        // got == 2p: the future is ready but the context hasn't fully been consumed
-                        // spin until it is safe to move on
-                        if( got == 2p ) {
-                                while( this.ptr != 1p ) Pause();
-                                return;
+                        }
-                        // got == anything else, something went wrong here, abort
-                        abort("Future in unexpected state");
+                }
-                // Mark the future as abandoned, meaning it will be deleted by the server
-                bool abandon( future_t & this ) {
-                        /* paranoid */ verify( this.ptr != 3p );
-                        // Mark the future as abandoned
-                        struct oneshot * got = __atomic_exchange_n( &this.ptr, 3p, __ATOMIC_SEQ_CST);
-                        // If the future isn't already fulfilled, let the server delete it
-                        if( got == 0p ) return false;
-                        // got == 2p: the future is ready but the context hasn't fully been consumed
-                        // spin until it is safe to move on
-                        if( got == 2p ) {
-                                while( this.ptr != 1p ) Pause();
-                                got = 1p;
+                        }
-                        // The future is completed delete it now
-                        /* paranoid */ verify( this.ptr != 1p );
-                        free( &this );
-                        return true;
+                }
-                // from the server side, mark the future as fulfilled
-                // delete it if needed
-                bool fulfil( future_t & this ) {
-                        for() {
-                                struct oneshot * expected = this.ptr;
-                                // was this abandoned?
-                                #if defined(__GNUC__) && __GNUC__ >= 7
-                                        #pragma GCC diagnostic push
-                                        #pragma GCC diagnostic ignored "-Wfree-nonheap-object"
-                                #endif
-                                        if( expected == 3p ) { free( &this ); return false; }
-                                #if defined(__GNUC__) && __GNUC__ >= 7
-                                        #pragma GCC diagnostic pop
-                                #endif
-                                /* paranoid */ verify( expected != 1p ); // Future is already fulfilled, should not happen
-                                /* paranoid */ verify( expected != 2p ); // Future is being fulfilled by someone else, this is even less supported than the previous case.
-                                // If there is a wait context, we need to consume it and mark it as consumed after
-                                // If there is no context then we can skip the in progress phase
-                                struct oneshot * want = expected == 0p ? 1p : 2p;
-                                if(__atomic_compare_exchange_n(&this.ptr, &expected, want, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-                                        if( expected == 0p ) { /* paranoid */ verify( this.ptr == 1p); return false; }
-                                        bool ret = post( *expected );
-                                        __atomic_store_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
-                                        return ret;
+                                }
+                        }
+                }
-                // Wait for the future to be fulfilled
-                bool wait( future_t & this ) {
-                        oneshot temp;
-                        if( !setup(this, temp) ) return false;
-                        // Wait context is setup, just wait on it
-                        bool ret = wait( temp );
-                        // Wait for the future to leave the in-progress (2p) state
-                        while( this.ptr == 2p ) Pause();
-                        // Make sure the state makes sense
-                        // Should be fulfilled, could be in progress but it's out of date if so
-                        // since if that is the case, the oneshot was fulfilled (unparking this thread)
-                        // and the oneshot should not be needed any more
-                        __attribute__((unused)) struct oneshot * was = this.ptr;
-                        /* paranoid */ verifyf( was == 1p, "Expected this.ptr to be 1p, was %p\n", was );
-                        // Mark the future as fulfilled, to be consistent
-                        // with potential calls to avail
-                        // this.ptr = 1p;
-                        return ret;
+                }
+        }
 #endif

libcfa/src/bits/queue.hfa

-              rb6a8b31
+              rd95969a
 // instead of being null.
 forall( dtype T | { T *& Next ( T * ); } ) {
+forall( T & | { T *& Next ( T * ); } ) {
         struct Queue {
                 inline Collection;                                                              // Plan 9 inheritance
 …
 } // distribution
 forall( dtype T | { T *& Next ( T * ); } ) {
+forall( T & | { T *& Next ( T * ); } ) {
         struct QueueIter {
                 inline ColIter;                                                                 // Plan 9 inheritance

libcfa/src/bits/sequence.hfa

-              rb6a8b31
+              rd95969a
+//
+// Cforall Version 1.0.0 Copyright (C) 2021 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// bits/sequence.hfa -- PUBLIC
+// Intrusive doubly-linked list
+//
+// Author           : Colby Alexander Parsons & Peter A. Buhr
+// Created On       : Thu Jan 21 19:46:50 2021
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
 #pragma once
 …
 struct Seqable {
         __cfa_anonymous_object(Colable);
+        struct Seqable * back;                                                                          // pointer to previous node in the list
+        // pointer to previous node in the list
+        struct Seqable * back;
 };
 …
                 return sq->back;
+        }
-        // // wrappers to make Collection have T
-        // forall( dtype T ) {
-        //      T *& Back( T * n ) {
-        //              return (T *)Back( (Seqable *)n );
-        //      }
-        // } // distribution
 } // distribution
 …
 // and the back field of the last node points at the first node (circular).
 forall( dtype T | { T *& Back ( T * ); T *& Next ( T * ); } ) {
+forall( T & ) {
         struct Sequence {
+                inline Collection;                                                              // Plan 9 inheritance
+                // Plan 9 inheritance
+                inline Collection;
         };
         static inline {
+                void ?{}( Sequence(T) &, const Sequence(T) & ) = void; // no copy
+                Sequence(T) & ?=?( const Sequence(T) & ) = void; // no assignment
+                void ?{}( Sequence(T) & s ) with( s ) {
+                        ((Collection &)s){};
+                }       // post: isEmpty()
+        }
+        static inline forall(| { T *& Back ( T * ); T *& Next ( T * ); }) {
                 // wrappers to make Collection have T
                 T & head( Sequence(T) & s ) with( s ) {
                         return *(T *)head( (Collection &)s );
                 } // post: empty() & head() == 0 | !empty() & head() in *s
-                void ?{}( Sequence(T) &, const Sequence(T) & ) = void; // no copy
-                Sequence(T) & ?=?( const Sequence(T) & ) = void; // no assignment
-                void ?{}( Sequence(T) & s ) with( s ) {
-                        ((Collection &)s){};
-                }       // post: isEmpty()
                 // Return a pointer to the last sequence element, without removing it.
 …
                         return n;
                 } // post: n->listed() & *n in *s & succ(n) == bef
                 // pre: n->listed() & *n in *s
                 T & remove( Sequence(T) & s, T & n ) with( s ) { // O(1)
 …
 } // distribution
 forall( dtype T | { T *& Back ( T * ); T *& Next ( T * ); } ) {
+forall( T & | { T *& Back ( T * ); T *& Next ( T * ); } ) {
         // SeqIter(T) is used to iterate over a Sequence(T) in head-to-tail order.
         struct SeqIter {
 …
         static inline {
                 void ?{}( SeqIterRev(T) & si ) with( si ) {
+                void ?{}( SeqIterRev(T) & si ) with( si ) {
                         ((ColIter &)si){};
                         seq = 0p;
 …
                 // Create an iterator active in sequence s.
                 void ?{}( SeqIterRev(T) & si, Sequence(T) & s ) with( si ) {
+                void ?{}( SeqIterRev(T) & si, Sequence(T) & s ) with( si ) {
                         ((ColIter &)si){};
                         seq = &s;
 …
                 } // post: elts = null
                 void ?{}( SeqIterRev(T) & si, Sequence(T) & s, T & start ) with( si ) {
+                void ?{}( SeqIterRev(T) & si, Sequence(T) & s, T & start ) with( si ) {
                         ((ColIter &)si){};
                         seq = &s;

libcfa/src/bits/stack.hfa

-              rb6a8b31
+              rd95969a
 // instead of being null.
 forall( dtype T | { T *& Next ( T * ); } ) {
+forall( T & | { T *& Next ( T * ); } ) {
         struct Stack {
                 inline Collection;                                                              // Plan 9 inheritance
 …
 // order returned by drop().
 forall( dtype T | { T *& Next ( T * ); } ) {
+forall( T & | { T *& Next ( T * ); } ) {
         struct StackIter {
                 inline ColIter;                                                                 // Plan 9 inheritance

libcfa/src/common.cfa

rb6a8b31	rd95969a
23	23	[ long int, long int ] div( long int num, long int denom ) { ldiv_t qr = ldiv( num, denom ); return [ qr.quot, qr.rem ]; }
24	24	[ long long int, long long int ] div( long long int num, long long int denom ) { lldiv_t qr = lldiv( num, denom ); return [ qr.quot, qr.rem ]; }
25		forall( ~~otype~~ T \| { T ?/?( T, T ); T ?%?( T, T ); } )
	25	forall( T \| { T ?/?( T, T ); T ?%?( T, T ); } )
26	26	[ T, T ] div( T num, T denom ) { return [ num / denom, num % denom ]; }
27	27

libcfa/src/common.hfa

-              rb6a8b31
+              rd95969a
 [ long int, long int ] div( long int num, long int denom );
 [ long long int, long long int ] div( long long int num, long long int denom );
 forall( otype T | { T ?/?( T, T ); T ?%?( T, T ); } )
+forall( T | { T ?/?( T, T ); T ?%?( T, T ); } )
 [ T, T ] div( T num, T demon );
 …
 } // distribution
 forall( otype T | { void ?{}( T &, zero_t ); int ?<?( T, T ); T -?( T ); } )
+forall( T | { void ?{}( T &, zero_t ); int ?<?( T, T ); T -?( T ); } )
 T abs( T );
 …
         intptr_t min( intptr_t t1, intptr_t t2 ) { return t1 < t2 ? t1 : t2; } // optimization
         uintptr_t min( uintptr_t t1, uintptr_t t2 ) { return t1 < t2 ? t1 : t2; } // optimization
         forall( otype T | { int ?<?( T, T ); } )
+        forall( T | { int ?<?( T, T ); } )
         T min( T t1, T t2 ) { return t1 < t2 ? t1 : t2; }
 …
         intptr_t max( intptr_t t1, intptr_t t2 ) { return t1 > t2 ? t1 : t2; } // optimization
         uintptr_t max( uintptr_t t1, uintptr_t t2 ) { return t1 > t2 ? t1 : t2; } // optimization
         forall( otype T | { int ?>?( T, T ); } )
+        forall( T | { int ?>?( T, T ); } )
         T max( T t1, T t2 ) { return t1 > t2 ? t1 : t2; }
         forall( otype T | { T min( T, T ); T max( T, T ); } )
+        forall( T | { T min( T, T ); T max( T, T ); } )
         T clamp( T value, T min_val, T max_val ) { return max( min_val, min( value, max_val ) ); }
         forall( otype T )
+        forall( T )
         void swap( T & v1, T & v2 ) { T temp = v1; v1 = v2; v2 = temp; }
 } // distribution

libcfa/src/concurrency/coroutine.cfa

-              rb6a8b31
+              rd95969a
 //-----------------------------------------------------------------------------
 FORALL_DATA_INSTANCE(CoroutineCancelled, (dtype coroutine_t), (coroutine_t))
 forall(dtype T)
+FORALL_DATA_INSTANCE(CoroutineCancelled, (coroutine_t &), (coroutine_t))
+forall(T &)
 void mark_exception(CoroutineCancelled(T) *) {}
 forall(dtype T)
+forall(T &)
 void copy(CoroutineCancelled(T) * dst, CoroutineCancelled(T) * src) {
         dst->virtual_table = src->virtual_table;
 …
+}
 forall(dtype T)
+forall(T &)
 const char * msg(CoroutineCancelled(T) *) {
         return "CoroutineCancelled(...)";
 …
 // This code should not be inlined. It is the error path on resume.
 forall(dtype T | is_coroutine(T))
+forall(T & | is_coroutine(T))
 void __cfaehm_cancelled_coroutine( T & cor, $coroutine * desc ) {
         verify( desc->cancellation );
 …
 // Part of the Public API
 // Not inline since only ever called once per coroutine
 forall(dtype T | is_coroutine(T))
+forall(T & | is_coroutine(T))
 void prime(T& cor) {
         $coroutine* this = get_coroutine(cor);

libcfa/src/concurrency/coroutine.hfa

-              rb6a8b31
+              rd95969a
 //-----------------------------------------------------------------------------
 // Exception thrown from resume when a coroutine stack is cancelled.
 FORALL_DATA_EXCEPTION(CoroutineCancelled, (dtype coroutine_t), (coroutine_t)) (
+FORALL_DATA_EXCEPTION(CoroutineCancelled, (coroutine_t &), (coroutine_t)) (
         coroutine_t * the_coroutine;
         exception_t * the_exception;
 );
 forall(dtype T)
+forall(T &)
 void copy(CoroutineCancelled(T) * dst, CoroutineCancelled(T) * src);
 forall(dtype T)
+forall(T &)
 const char * msg(CoroutineCancelled(T) *);
 …
 // Anything that implements this trait can be resumed.
 // Anything that is resumed is a coroutine.
 trait is_coroutine(dtype T | IS_RESUMPTION_EXCEPTION(CoroutineCancelled, (T))) {
+trait is_coroutine(T & | IS_RESUMPTION_EXCEPTION(CoroutineCancelled, (T))) {
         void main(T & this);
         $coroutine * get_coroutine(T & this);
 …
 //-----------------------------------------------------------------------------
 // Public coroutine API
 forall(dtype T | is_coroutine(T))
+forall(T & | is_coroutine(T))
 void prime(T & cor);
 …
         void __cfactx_invoke_coroutine(void (*main)(void *), void * this);
         forall(dtype T)
+        forall(T &)
         void __cfactx_start(void (*main)(T &), struct $coroutine * cor, T & this, void (*invoke)(void (*main)(void *), void *));
 …
+}
 forall(dtype T | is_coroutine(T))
+forall(T & | is_coroutine(T))
 void __cfaehm_cancelled_coroutine( T & cor, $coroutine * desc );
 // Resume implementation inlined for performance
 forall(dtype T | is_coroutine(T))
+forall(T & | is_coroutine(T))
 static inline T & resume(T & cor) {
         // optimization : read TLS once and reuse it

libcfa/src/concurrency/future.hfa

-              rb6a8b31
+              rd95969a
 #include "monitor.hfa"
 forall( otype T ) {
+forall( T ) {
         struct future {
                 inline future_t;
 …
+}
 forall( otype T ) {
+forall( T ) {
         monitor multi_future {
                 inline future_t;

libcfa/src/concurrency/io.cfa

-              rb6a8b31
+              rd95969a
         #include "io/types.hfa"
         static const char * opcodes[] = {
+        __attribute__((unused)) static const char * opcodes[] = {
                 "OP_NOP",
                 "OP_READV",
 …
                         __cfadbg_print_safe(io_core, "Kernel I/O : IO_URING enter %d %u %u\n", ring.fd, to_submit, flags);
                         ret = syscall( __NR_io_uring_enter, ring.fd, to_submit, 0, flags, (sigset_t *)0p, _NSIG / 8);
+                        __cfadbg_print_safe(io_core, "Kernel I/O : IO_URING %d returned %d\n", ring.fd, ret);
                         if( ret < 0 ) {
                                 switch((int)errno) {
                                 case EAGAIN:
                                 case EINTR:
+                                case EBUSY:
                                         ret = -1;
                                         break;
 …
                 __cfadbg_print_safe(io_core, "Kernel I/O : Fast poller %d (%p) stopping\n", this.ring->fd, &this);
+                __ioctx_unregister( this );
+        }
 …
                         block++;
-                        abort( "Kernel I/O : all submit queue entries used, yielding\n" );
                         yield();
 …
                                 sqe->flags,
                                 sqe->ioprio,
                                 sqe->off,
                                 sqe->addr,
+                                (void*)sqe->off,
+                                (void*)sqe->addr,
                                 sqe->len,
                                 sqe->accept_flags,
 …
+                }
                 else if( ring.eager_submits ) {
                         __u32 picked = __submit_to_ready_array( ring, idx, mask );
+                        __attribute__((unused)) __u32 picked = __submit_to_ready_array( ring, idx, mask );
                         #if defined(LEADER_LOCK)
 …
                                         sqe->flags,
                                         sqe->ioprio,
                                         sqe->off,
                                         sqe->addr,
+                                        (void*)sqe->off,
+                                        (void*)sqe->addr,
                                         sqe->len,
                                         sqe->accept_flags,
 …
                         __atomic_thread_fence( __ATOMIC_SEQ_CST );
                         // Release the consumed SQEs
                         __release_consumed_submission( ring );
                         // ring.submit_q.sqes[idx].user_data = 3ul64;

libcfa/src/concurrency/io/setup.cfa

-              rb6a8b31
+              rd95969a
         void ^?{}(io_context & this, bool cluster_context) {}
+        // No-op stubs for this preprocessor branch: both overloads ignore their arguments and do nothing.
+        void register_fixed_files( io_context &, int *, unsigned ) {}
+        void register_fixed_files( cluster    &, int *, unsigned ) {}
 #else
         #include <errno.h>
 …
         static struct {
+                pthread_t     thrd;    // pthread handle to io poller thread
+                void *        stack;   // pthread stack for io poller thread
+                int           epollfd; // file descriptor to the epoll instance
+                volatile bool run;     // Whether or not to continue
+                      pthread_t  thrd;    // pthread handle to io poller thread
+                      void *     stack;   // pthread stack for io poller thread
+                      int        epollfd; // file descriptor to the epoll instance
+                volatile     bool run;     // Whether or not to continue
+                volatile     bool stopped; // Whether the poller has finished running
+                volatile uint64_t epoch;   // Epoch used for memory reclamation
         } iopoll;
 …
                 __cfadbg_print_safe(io_core, "Kernel : Starting io poller thread\n" );
+                iopoll.run = true;
+                iopoll.stack = __create_pthread( &iopoll.thrd, iopoll_loop, 0p );
+                iopoll.stack   = __create_pthread( &iopoll.thrd, iopoll_loop, 0p );
+                iopoll.run     = true;
+                iopoll.stopped = false;
+                iopoll.epoch   = 0;
+        }
 …
                 while( iopoll.run ) {
                         __cfadbg_print_safe(io_core, "Kernel I/O - epoll : waiting on io_uring contexts\n");
+                        // increment the epoch to notify any deleters we are starting a new cycle
+                        __atomic_fetch_add(&iopoll.epoch, 1, __ATOMIC_SEQ_CST);
                         // Wait for events
 …
+                        }
+                }
+                __atomic_store_n(&iopoll.stopped, true, __ATOMIC_SEQ_CST);
                 __cfadbg_print_safe(io_core, "Kernel : IO poller thread stopping\n" );
 …
 // I/O Context Sleep
 //=============================================================================================
-        #define IOEVENTS EPOLLIN | EPOLLONESHOT
         static inline void __ioctx_epoll_ctl($io_ctx_thread & ctx, int op, const char * error) {
                 struct epoll_event ev;
                 ev.events = IOEVENTS;
+                ev.events = EPOLLIN | EPOLLONESHOT;
                 ev.data.u64 = (__u64)&ctx;
                 int ret = epoll_ctl(iopoll.epollfd, op, ctx.ring->efd, &ev);
 …
+        }
+        // Remove an I/O context from the epoll instance, tell the poller thread to stop,
+        // then spin until the poller starts a new cycle (epoch changes) or reports it has stopped.
+        // ctx : context whose ring event fd is removed from the epoll instance
+        void __ioctx_unregister($io_ctx_thread & ctx) {
+                // Read the current epoch so we know when to stop
+                // (same width as iopoll.epoch so the comparison below never sees a truncated value)
+                uint64_t curr = __atomic_load_n(&iopoll.epoch, __ATOMIC_SEQ_CST);
+                // Remove the fd from the iopoller
+                __ioctx_epoll_ctl(ctx, EPOLL_CTL_DEL, "REMOVE");
+                // Notify the io poller thread of the shutdown
+                iopoll.run = false;
+                sigval val = { 1 };
+                pthread_sigqueue( iopoll.thrd, SIGUSR1, val );
+                // Make sure all this is done
+                __atomic_thread_fence(__ATOMIC_SEQ_CST);
+                // Wait for the next epoch, or for the poller loop to have exited
+                while(curr == iopoll.epoch && !iopoll.stopped) Pause();
+        }
 //=============================================================================================
 // I/O Context Misc Setup
 …
                 int ret = syscall( __NR_io_uring_register, ctx.thrd.ring->fd, IORING_REGISTER_FILES, files, count );
                 if( ret < 0 ) {
                         abort( "KERNEL ERROR: IO_URING SYSCALL - (%d) %s\n", (int)errno, strerror(errno) );
+                        abort( "KERNEL ERROR: IO_URING REGISTER - (%d) %s\n", (int)errno, strerror(errno) );
+                }

libcfa/src/concurrency/io/types.hfa

-              rb6a8b31
+              rd95969a
 // file "LICENCE" distributed with Cforall.
 //
+// io/types.hfa --
+// io/types.hfa -- PRIVATE
+// Types used by the I/O subsystem
 //
 // Author           : Thierry Delisle
 …
 #include "bits/locks.hfa"
+#include "kernel/fwd.hfa"
 #if defined(CFA_HAVE_LINUX_IO_URING_H)
 …
         struct $io_ctx_thread;
         void __ioctx_register($io_ctx_thread & ctx);
+        void __ioctx_unregister($io_ctx_thread & ctx);
         void __ioctx_prepare_block($io_ctx_thread & ctx);
         void __sqe_clean( volatile struct io_uring_sqe * sqe );

libcfa/src/concurrency/kernel.cfa

-              rb6a8b31
+              rd95969a
                 preemption_scope scope = { this };
+                #if !defined(__CFA_NO_STATISTICS__)
+                        unsigned long long last_tally = rdtscl();
+                #endif
                 __cfadbg_print_safe(runtime_core, "Kernel : core %p started\n", this);
 …
                         // Are we done?
                         if( __atomic_load_n(&this->do_terminate, __ATOMIC_SEQ_CST) ) break MAIN_LOOP;
+                        #if !defined(__CFA_NO_STATISTICS__)
+                                unsigned long long curr = rdtscl();
+                                if(curr > (last_tally + 500000000)) {
+                                        __tally_stats(this->cltr->stats, __cfaabi_tls.this_stats);
+                                        last_tally = curr;
+                                }
+                        #endif
+                }
 …
+        }
         V( this->terminated );
+        post( this->terminated );
         if(this == mainProcessor) {
 …
 // Unexpected Terminating logic
 //=============================================================================================
+static __spinlock_t kernel_abort_lock;
+static bool kernel_abort_called = false;
+// Serialise abort handling across processors.
+// The first caller records that an abort has started and returns __cfaabi_tls.this_thread.
+// Any later caller suspends itself in sigsuspend and calls _exit if it is ever resumed.
+void * kernel_abort(void) __attribute__ ((__nothrow__)) {
+        // abort cannot be recursively entered by the same or different processors because all signal handlers return when
+        // the globalAbort flag is true.
+        lock( kernel_abort_lock __cfaabi_dbg_ctx2 );
+        // disable interrupts, it no longer makes sense to try to interrupt this processor
+        disable_interrupts();
+        // has another task already started the abort ?
+        if ( kernel_abort_called ) {                    // not first task to abort
+                unlock( kernel_abort_lock );
+                sigset_t mask;
+                sigemptyset( &mask );
+                sigaddset( &mask, SIGALRM );            // block SIGALRM signals
+                sigaddset( &mask, SIGUSR1 );            // block SIGUSR1 signals
+                sigsuspend( &mask );                            // block the processor to prevent further damage during abort
+                _exit( EXIT_FAILURE );                          // if processor unblocks before it is killed, terminate it
+        }
+        else {
+                // first task to abort: record it so later callers take the branch above
+                kernel_abort_called = true;
+                unlock( kernel_abort_lock );
+        }
+        return __cfaabi_tls.this_thread;
+}
+void kernel_abort_msg( void * kernel_data, char * abort_text, int abort_text_size ) {
+        $thread * thrd = ( $thread * ) kernel_data;
+void __kernel_abort_msg( char * abort_text, int abort_text_size ) {
+        $thread * thrd = __cfaabi_tls.this_thread;
         if(thrd) {
 …
+}
 int kernel_abort_lastframe( void ) __attribute__ ((__nothrow__)) {
         return get_coroutine(kernelTLS().this_thread) == get_coroutine(mainThread) ? 4 : 2;
+int __kernel_abort_lastframe( void ) __attribute__ ((__nothrow__)) {
+        return get_coroutine(__cfaabi_tls.this_thread) == get_coroutine(mainThread) ? 4 : 2;
+}
 …
 // Kernel Utilities
 //=============================================================================================
-//-----------------------------------------------------------------------------
-// Locks
-void  ?{}( semaphore & this, int count = 1 ) {
-        (this.lock){};
-        this.count = count;
-        (this.waiting){};
+}
-void ^?{}(semaphore & this) {}
-bool P(semaphore & this) with( this ){
-        lock( lock __cfaabi_dbg_ctx2 );
-        count -= 1;
-        if ( count < 0 ) {
-                // queue current task
-                append( waiting, active_thread() );
-                // atomically release spin lock and block
-                unlock( lock );
-                park();
-                return true;
+        }
-        else {
-            unlock( lock );
-            return false;
+        }
+}
-bool V(semaphore & this) with( this ) {
-        $thread * thrd = 0p;
-        lock( lock __cfaabi_dbg_ctx2 );
-        count += 1;
-        if ( count <= 0 ) {
-                // remove task at head of waiting list
-                thrd = pop_head( waiting );
+        }
-        unlock( lock );
-        // make new owner
-        unpark( thrd );
-        return thrd != 0p;
+}
-bool V(semaphore & this, unsigned diff) with( this ) {
-        $thread * thrd = 0p;
-        lock( lock __cfaabi_dbg_ctx2 );
-        int release = max(-count, (int)diff);
-        count += diff;
-        for(release) {
-                unpark( pop_head( waiting ) );
+        }
-        unlock( lock );
-        return thrd != 0p;
+}
 //-----------------------------------------------------------------------------
 // Debug

libcfa/src/concurrency/kernel.hfa

-              rb6a8b31
+              rd95969a
 // file "LICENCE" distributed with Cforall.
 //
 // kernel --
+// kernel -- Header containing the core of the kernel API
 //
 // Author           : Thierry Delisle
 …
 extern "C" {
         #include <bits/pthreadtypes.h>
+        #include <pthread.h>
         #include <linux/types.h>
+}
 //-----------------------------------------------------------------------------
+// Locks
+// Counting semaphore for runtime threads (operated on by P and V)
+struct semaphore {
+        // spinlock protecting count and waiting
+        __spinlock_t lock;
+        // available units; P decrements it and blocks the caller when the result is negative
+        int count;
+        // threads parked in P, appended at the tail and woken from the head by V
+        __queue_t($thread) waiting;
+};
+// Construct with an initial count (defaults to 1)
+void  ?{}(semaphore & this, int count = 1);
+void ^?{}(semaphore & this);
+// Acquire one unit; returns true if the calling thread had to park
+bool   P (semaphore & this);
+// Release one unit; returns true if a waiting thread was woken
+bool   V (semaphore & this);
+// Release `count` units at once
+bool   V (semaphore & this, unsigned count);
+// Underlying Locks
+#ifdef __CFA_WITH_VERIFY__
+        extern bool __cfaabi_dbg_in_kernel();
+#endif
+extern "C" {
+        char * strerror(int);
+}
+// Evaluate x once; if the result is non-zero, abort with the stringified expression, the error number and its strerror text
+#define CHECKED(x) { int err = x; if( err != 0 ) abort("KERNEL ERROR: Operation \"" #x "\" return error %d - %s\n", err, strerror(err)); }
+// Binary semaphore built on a pthread mutex and condition variable
+struct __bin_sem_t {
+        // mutex protecting val
+        pthread_mutex_t         lock;
+        // condition signalled by post when val is raised
+        pthread_cond_t          cond;
+        // semaphore value: post raises it only while it is below 1, wait consumes 1
+        int                     val;
+};
+// Construct the binary semaphore with value 0.
+// Every pthread call is checked; a failure aborts with the failing expression and error text.
+static inline void ?{}(__bin_sem_t & this) with( this ) {
+        // Create the mutex with error checking
+        pthread_mutexattr_t mattr;
+        CHECKED( pthread_mutexattr_init( &mattr ) );
+        CHECKED( pthread_mutexattr_settype( &mattr, PTHREAD_MUTEX_ERRORCHECK_NP) );
+        CHECKED( pthread_mutex_init(&lock, &mattr) );
+        // The attribute object is only needed while initializing the mutex, release it
+        CHECKED( pthread_mutexattr_destroy( &mattr ) );
+        CHECKED( pthread_cond_init (&cond, (const pthread_condattr_t *)0p) );  // workaround trac#208: cast should not be required
+        val = 0;
+}
+// Destroy the mutex and the condition variable; aborts if either destroy call reports an error
+static inline void ^?{}(__bin_sem_t & this) with( this ) {
+        CHECKED( pthread_mutex_destroy(&lock) );
+        CHECKED( pthread_cond_destroy (&cond) );
+}
+// Block the calling pthread until val is at least 1, then consume one unit
+static inline void wait(__bin_sem_t & this) with( this ) {
+        verify(__cfaabi_dbg_in_kernel());
+        CHECKED( pthread_mutex_lock(&lock) );
+                // re-check the value after every wake-up of the condition variable
+                while(val < 1) {
+                        pthread_cond_wait(&cond, &lock);
+                }
+                val -= 1;
+        CHECKED( pthread_mutex_unlock(&lock) );
+}
+// Raise the semaphore if it is not already raised
+// returns true if val was incremented and the condition signalled, false if val was already 1 or more
+static inline bool post(__bin_sem_t & this) with( this ) {
+        bool needs_signal = false;
+        CHECKED( pthread_mutex_lock(&lock) );
+                // binary semantics: never let val go above 1
+                if(val < 1) {
+                        val += 1;
+                        pthread_cond_signal(&cond);
+                        needs_signal = true;
+                }
+        CHECKED( pthread_mutex_unlock(&lock) );
+        return needs_signal;
+}
+#undef CHECKED
 …
         // Termination synchronisation (user semaphore)
         semaphore terminated;
+        oneshot terminated;
         // pthread Stack

libcfa/src/concurrency/kernel/fwd.hfa

-              rb6a8b31
+              rd95969a
 // file "LICENCE" distributed with Cforall.
 //
+// kernel/fwd.hfa --
+// kernel/fwd.hfa -- PUBLIC
+// Fundamental code needed to implement threading M.E.S. algorithms.
 //
 // Author           : Thierry Delisle
 …
                 extern uint64_t thread_rand();
+                // Semaphore which only supports a single waiting thread
+                struct single_sem {
+                        // Internal state, as used by wait/post :
+                        //     0p     : no post pending and no thread waiting
+                        //     1p     : a post is pending (the next wait will not block)
+                        // any thread : that thread is parked in wait
+                        struct $thread * volatile ptr;
+                };
+                static inline {
+                        // Start with no pending post and no waiter
+                        void  ?{}(single_sem & this) {
+                                this.ptr = 0p;
+                        }
+                        void ^?{}(single_sem &) {}
+                        // Consume a pending post, or park until one arrives
+                        // return true if the thread was parked, false if a post was already pending
+                        bool wait(single_sem & this) {
+                                // retry until one of the compare-exchanges succeeds
+                                for() {
+                                        struct $thread * expected = this.ptr;
+                                        if(expected == 1p) {
+                                                // a post is pending: consume it without blocking
+                                                if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+                                                        return false;
+                                                }
+                                        }
+                                        else {
+                                                /* paranoid */ verify( expected == 0p );
+                                                // publish the current thread as the waiter, then block
+                                                if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+                                                        park();
+                                                        return true;
+                                                }
+                                        }
+                                }
+                        }
+                        // Wake the waiting thread if there is one, otherwise leave a pending post
+                        // return true if a thread was unparked
+                        bool post(single_sem & this) {
+                                // retry until one of the compare-exchanges succeeds
+                                for() {
+                                        struct $thread * expected = this.ptr;
+                                        // a post is already pending, nothing to add
+                                        if(expected == 1p) return false;
+                                        if(expected == 0p) {
+                                                // no waiter: record the post for a later wait
+                                                if(__atomic_compare_exchange_n(&this.ptr, &expected, 1p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+                                                        return false;
+                                                }
+                                        }
+                                        else {
+                                                // a thread is waiting: clear the slot and wake it
+                                                if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+                                                        unpark( expected );
+                                                        return true;
+                                                }
+                                        }
+                                }
+                        }
+                }
+                // Synchronization primitive which only supports a single thread and one post
+                // Similar to a binary semaphore with 'one shot' semantics
+                // It is expected to be discarded after each party has called its side
+                struct oneshot {
+                        // Internal state :
+                        //     0p     : is initial state (wait will block)
+                        //     1p     : fulfilled (wait won't block)
+                        // any thread : a thread is currently waiting
+                        struct $thread * volatile ptr;
+                };
+                static inline {
+                        // Start in the initial (not fulfilled) state
+                        void  ?{}(oneshot & this) {
+                                this.ptr = 0p;
+                        }
+                        void ^?{}(oneshot &) {}
+                        // Wait for the post, return immediately if it already happened.
+                        // return true if the thread was parked
+                        bool wait(oneshot & this) {
+                                // retry until the state is fulfilled or the waiter is published
+                                for() {
+                                        struct $thread * expected = this.ptr;
+                                        if(expected == 1p) return false;
+                                        /* paranoid */ verify( expected == 0p );
+                                        // publish the current thread as the waiter, then block until post unparks it
+                                        if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+                                                park();
+                                                /* paranoid */ verify( this.ptr == 1p );
+                                                return true;
+                                        }
+                                }
+                        }
+                        // Mark as fulfilled, wake thread if needed
+                        // return true if a thread was unparked
+                        bool post(oneshot & this) {
+                                // unconditionally swap in the fulfilled marker and look at what was there
+                                struct $thread * got = __atomic_exchange_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
+                                if( got == 0p ) return false;
+                                unpark( got );
+                                return true;
+                        }
+                }
+                // Base type for futures to build upon
+                // It is based on the 'oneshot' type to allow multiple futures
+                // to block on the same instance, permitting users to block a single
+                // thread on "any of" [a given set of] futures.
+                // It does not support multiple threads waiting on the same future
+                struct future_t {
+                        // Internal state :
+                        //     0p      : is initial state (wait will block)
+                        //     1p      : fulfilled (wait won't block)
+                        //     2p      : in progress (fulfil is consuming the wait context)
+                        //     3p      : abandoned, server should delete
+                        // any oneshot : a context has been setup to wait, a thread could wait on it
+                        struct oneshot * volatile ptr;
+                };
+                static inline {
+                        // Start in the initial (not fulfilled) state
+                        void  ?{}(future_t & this) {
+                                this.ptr = 0p;
+                        }
+                        // Nothing to release
+                        void ^?{}(future_t &) {}
+                        // Put the future back in the initial state so it can be used again
+                        void reset(future_t & this) {
+                                // needs to be in 0p or 1p
+                                __atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);
+                        }
+                        // Check if the future is available, i.e. in the fulfilled (1p) state
+                        bool available( future_t & this ) {
+                                return this.ptr == 1p;
+                        }
+                        // Prepare the future to be waited on
+                        // intended to be used by wait, wait_any, waitfor, etc. rather than used directly
+                        // return true if the wait context was installed, false if the future is already fulfilled
+                        bool setup( future_t & this, oneshot & wait_ctx ) {
+                                /* paranoid */ verify( wait_ctx.ptr == 0p );
+                                // The future needs to set the wait context
+                                for() {
+                                        struct oneshot * expected = this.ptr;
+                                        // Is the future already fulfilled?
+                                        if(expected == 1p) return false; // Yes, just return false (didn't block)
+                                        // The future is not fulfilled, try to setup the wait context
+                                        /* paranoid */ verify( expected == 0p );
+                                        if(__atomic_compare_exchange_n(&this.ptr, &expected, &wait_ctx, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+                                                return true;
+                                        }
+                                }
+                        }
+                        // Stop waiting on a future
+                        // When multiple futures are waited for together in an "any of" pattern,
+                        // futures that weren't fulfilled before the thread woke up
+                        // should retract the wait ctx
+                        // intended to be used by wait, wait_any, waitfor, etc. rather than used directly
+                        void retract( future_t & this, oneshot & wait_ctx ) {
+                                // Remove the wait context
+                                struct oneshot * got = __atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);
+                                // got == 0p: future was never actually setup, just return
+                                if( got == 0p ) return;
+                                // got == wait_ctx: since fulfil does an atomic_swap,
+                                // if we got back the original then no one else saw the context
+                                // It is safe to delete (which could happen after the return)
+                                if( got == &wait_ctx ) return;
+                                // got == 1p: the future is ready and the context was fully consumed
+                                // the server won't use the pointer again
+                                // It is safe to delete (which could happen after the return)
+                                if( got == 1p ) return;
+                                // got == 2p: the future is ready but the context hasn't fully been consumed
+                                // spin until it is safe to move on
+                                if( got == 2p ) {
+                                        while( this.ptr != 1p ) Pause();
+                                        return;
+                                }
+                                // got == anything else, something went wrong here, abort
+                                abort("Future in unexpected state");
+                        }
+                        // Mark the future as abandoned, meaning it will be deleted by the server
+                        // return false if the future was still in the initial state (the server frees it in fulfil),
+                        // true if it was freed here
+                        bool abandon( future_t & this ) {
+                                /* paranoid */ verify( this.ptr != 3p );
+                                // Mark the future as abandoned
+                                struct oneshot * got = __atomic_exchange_n( &this.ptr, 3p, __ATOMIC_SEQ_CST);
+                                // If the future isn't already fulfilled, let the server delete it
+                                if( got == 0p ) return false;
+                                // got == 2p: the future is ready but the context hasn't fully been consumed
+                                // spin until it is safe to move on
+                                if( got == 2p ) {
+                                        while( this.ptr != 1p ) Pause();
+                                        got = 1p;
+                                }
+                                // The future is completed delete it now
+                                // NOTE(review): after the 2p spin above this.ptr equals 1p, so the check below looks like it
+                                // would fail on that path; `got == 1p` may have been intended — confirm
+                                /* paranoid */ verify( this.ptr != 1p );
+                                free( &this );
+                                return true;
+                        }
+                        // From the server side, mark the future as fulfilled
+                        // delete it if it was abandoned
+                        // return true if a waiting thread was unparked
+                        bool fulfil( future_t & this ) {
+                                for() {
+                                        struct oneshot * expected = this.ptr;
+                                        // was this abandoned?
+                                        #if defined(__GNUC__) && __GNUC__ >= 7
+                                                #pragma GCC diagnostic push
+                                                #pragma GCC diagnostic ignored "-Wfree-nonheap-object"
+                                        #endif
+                                                if( expected == 3p ) { free( &this ); return false; }
+                                        #if defined(__GNUC__) && __GNUC__ >= 7
+                                                #pragma GCC diagnostic pop
+                                        #endif
+                                        /* paranoid */ verify( expected != 1p ); // Future is already fulfilled, should not happen
+                                        /* paranoid */ verify( expected != 2p ); // Future is being fulfilled by someone else, this is even less supported than the previous case.
+                                        // If there is a wait context, we need to consume it and mark it as consumed after
+                                        // If there is no context then we can skip the in progress phase
+                                        struct oneshot * want = expected == 0p ? 1p : 2p;
+                                        if(__atomic_compare_exchange_n(&this.ptr, &expected, want, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+                                                if( expected == 0p ) { /* paranoid */ verify( this.ptr == 1p); return false; }
+                                                // wake the waiter through its context, then leave the in-progress state
+                                                bool ret = post( *expected );
+                                                __atomic_store_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
+                                                return ret;
+                                        }
+                                }
+                        }
+                        // Wait for the future to be fulfilled
+                        // return true if the thread was parked, false if the future was already fulfilled
+                        bool wait( future_t & this ) {
+                                // wait context local to this call
+                                oneshot temp;
+                                if( !setup(this, temp) ) return false;
+                                // Wait context is setup, just wait on it
+                                bool ret = wait( temp );
+                                // Wait for the future to leave the in-progress (2p) state
+                                while( this.ptr == 2p ) Pause();
+                                // Make sure the state makes sense
+                                // Should be fulfilled, could be in progress but it's out of date if so
+                                // since if that is the case, the oneshot was fulfilled (unparking this thread)
+                                // and the oneshot should not be needed any more
+                                __attribute__((unused)) struct oneshot * was = this.ptr;
+                                /* paranoid */ verifyf( was == 1p, "Expected this.ptr to be 1p, was %p\n", was );
+                                // Mark the future as fulfilled, to be consistent
+                                // with potential calls to available
+                                // this.ptr = 1p;
+                                return ret;
+                        }
+                }
                 //-----------------------------------------------------------------------
                 // Statics call at the end of each thread to register statistics

libcfa/src/concurrency/kernel/startup.cfa

-              rb6a8b31
+              rd95969a
         void ?{}(processor & this) with( this ) {
                 ( this.idle ){};
                 ( this.terminated ){ 0 };
+                ( this.terminated ){};
                 ( this.runner ){};
                 init( this, "Main Processor", *mainCluster );
 …
 void ?{}(processor & this, const char name[], cluster & _cltr) {
         ( this.idle ){};
         ( this.terminated ){ 0 };
+        ( this.terminated ){};
         ( this.runner ){};
 …
                 __wake_proc( &this );
                 P( terminated );
+                wait( terminated );
                 /* paranoid */ verify( active_processor() != &this);
+        }

libcfa/src/concurrency/locks.cfa

-              rb6a8b31
+              rd95969a
+//
+// Cforall Version 1.0.0 Copyright (C) 2021 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// locks.hfa -- LIBCFATHREAD
+// Runtime locks that used with the runtime thread system.
+//
+// Author           : Colby Alexander Parsons
+// Created On       : Thu Jan 21 19:46:50 2021
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
+#define __cforall_thread__
 #include "locks.hfa"
 #include "kernel_private.hfa"
 …
 //-----------------------------------------------------------------------------
 // info_thread
 forall(dtype L | is_blocking_lock(L)) {
+forall(L & | is_blocking_lock(L)) {
         struct info_thread {
                 // used to put info_thread on a dl queue (aka sequence)
 …
 void ^?{}( blocking_lock & this ) {}
+// Each lock flavour constructs itself through the blocking_lock constructor with its own pair of flags;
+// the destructors do nothing.
+// NOTE(review): the meaning of the two bool arguments is defined by the blocking_lock constructor, which is not shown here — confirm
+void  ?{}( single_acquisition_lock & this ) {((blocking_lock &)this){ false, false };}
+void ^?{}( single_acquisition_lock & this ) {}
+void  ?{}( owner_lock & this ) {((blocking_lock &)this){ true, true };}
+void ^?{}( owner_lock & this ) {}
+void  ?{}( multiple_acquisition_lock & this ) {((blocking_lock &)this){ true, false };}
+void ^?{}( multiple_acquisition_lock & this ) {}
 void lock( blocking_lock & this ) with( this ) {
 …
 //-----------------------------------------------------------------------------
-// Overloaded routines for traits
-// These routines are temporary until an inheritance bug is fixed
-void   lock      ( single_acquisition_lock & this ) { lock   ( (blocking_lock &)this ); }
-void   unlock    ( single_acquisition_lock & this ) { unlock ( (blocking_lock &)this ); }
-void   on_wait   ( single_acquisition_lock & this ) { on_wait( (blocking_lock &)this ); }
-void   on_notify ( single_acquisition_lock & this, struct $thread * t ) { on_notify( (blocking_lock &)this, t ); }
-void   set_recursion_count( single_acquisition_lock & this, size_t recursion ) { set_recursion_count( (blocking_lock &)this, recursion ); }
-size_t get_recursion_count( single_acquisition_lock & this ) { return get_recursion_count( (blocking_lock &)this ); }
-void   lock     ( owner_lock & this ) { lock   ( (blocking_lock &)this ); }
-void   unlock   ( owner_lock & this ) { unlock ( (blocking_lock &)this ); }
-void   on_wait  ( owner_lock & this ) { on_wait( (blocking_lock &)this ); }
-void   on_notify( owner_lock & this, struct $thread * t ) { on_notify( (blocking_lock &)this, t ); }
-void   set_recursion_count( owner_lock & this, size_t recursion ) { set_recursion_count( (blocking_lock &)this, recursion ); }
-size_t get_recursion_count( owner_lock & this ) { return get_recursion_count( (blocking_lock &)this ); }
-void   lock     ( multiple_acquisition_lock & this ) { lock   ( (blocking_lock &)this ); }
-void   unlock   ( multiple_acquisition_lock & this ) { unlock ( (blocking_lock &)this ); }
-void   on_wait  ( multiple_acquisition_lock & this ) { on_wait( (blocking_lock &)this ); }
-void   on_notify( multiple_acquisition_lock & this, struct $thread * t ){ on_notify( (blocking_lock &)this, t ); }
-void   set_recursion_count( multiple_acquisition_lock & this, size_t recursion ){ set_recursion_count( (blocking_lock &)this, recursion ); }
-size_t get_recursion_count( multiple_acquisition_lock & this ){ return get_recursion_count( (blocking_lock &)this ); }
-//-----------------------------------------------------------------------------
 // alarm node wrapper
 forall(dtype L | is_blocking_lock(L)) {
+forall(L & | is_blocking_lock(L)) {
         struct alarm_node_wrap {
                 alarm_node_t alarm_node;
 …
 //-----------------------------------------------------------------------------
 // condition variable
 forall(dtype L | is_blocking_lock(L)) {
+forall(L & | is_blocking_lock(L)) {
         void ?{}( condition_variable(L) & this ){
 …
         bool wait( condition_variable(L) & this, L & l, uintptr_t info, Time time         ) with(this) { WAIT_TIME( info, &l , time ) }
+}
+//-----------------------------------------------------------------------------
+// Semaphore
+// Build a semaphore that starts with `count` permits (1 when omitted).
+void  ?{}( semaphore & this, int count = 1 ) {
+        // the three members do not depend on each other: construct the two
+        // aggregate members first, then seed the permit counter
+        (this.waiting){};
+        (this.lock){};
+        this.count = count;
+}
+// The destructor body is intentionally empty.
+void ^?{}(semaphore & this) {}
+// Take one permit from the semaphore.
+// Returns false when the counter was still non-negative after the decrement
+// (no blocking needed), true when the calling thread queued itself and parked.
+bool P(semaphore & this) with( this ){
+        lock( lock __cfaabi_dbg_ctx2 );
+        count -= 1;
+        if ( count >= 0 ) {
+                // a permit was available: release the spin lock and carry on
+                unlock( lock );
+                return false;
+        }
+        // no permit left: queue the running thread, then drop the spin lock
+        // before parking
+        append( waiting, active_thread() );
+        unlock( lock );
+        park();
+        return true;
+}
+// Give one permit back to the semaphore.
+// When the counter is still <= 0 after the increment, the thread at the head
+// of the waiting list is removed and unparked.
+// Returns true when a thread was removed from the list, false otherwise.
+bool V(semaphore & this) with( this ) {
+        lock( lock __cfaabi_dbg_ctx2 );
+        count += 1;
+        $thread * woken = 0p;
+        if ( count <= 0 ) {
+                // somebody is queued: take the thread at the head of the list
+                woken = pop_head( waiting );
+        }
+        unlock( lock );
+        // unpark after the spin lock is released; woken is 0p when nobody was queued
+        unpark( woken );
+        return woken != 0p;
+}
+// Give `diff` permits back to the semaphore in one call.
+// Wakes at most `diff` threads and never more than are actually queued: a
+// negative counter records how many threads are parked in P().
+// Returns true when at least one thread was woken, false otherwise.
+bool V(semaphore & this, unsigned diff) with( this ) {
+        lock( lock __cfaabi_dbg_ctx2 );
+        // number of threads currently queued by P()
+        int blocked = count < 0 ? -count : 0;
+        // wake the smaller of (queued threads, permits released): waking more
+        // threads than permits would admit threads that hold no permit, and
+        // popping more threads than are queued pops from an empty list
+        int release = blocked < (int)diff ? blocked : (int)diff;
+        count += diff;
+        for ( int i = 0; i < release; i += 1 ) {
+                unpark( pop_head( waiting ) );
+        }
+        unlock( lock );
+        return release > 0;
+}

libcfa/src/concurrency/locks.hfa

-              rb6a8b31
+              rd95969a
+//
+// Cforall Version 1.0.0 Copyright (C) 2021 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// locks.hfa -- PUBLIC
+// Runtime locks that are used with the runtime thread system.
+//
+// Author           : Colby Alexander Parsons
+// Created On       : Thu Jan 21 19:46:50 2021
+// Last Modified By :
+// Last Modified On :
+// Update Count     :
+//
 #pragma once
 #include <stdbool.h>
+#include "bits/locks.hfa"
+#include "bits/sequence.hfa"
+#include "invoke.h"
+#include "bits/weakso_locks.hfa"
 #include "time_t.hfa"
 #include "time.hfa"
+//----------
+// single_acquisition_lock: a blocking_lock with a fixed configuration.
+// Every routine below only forwards to the blocking_lock overload of the same
+// name by casting the reference to the inlined base.
+struct single_acquisition_lock {
+        inline blocking_lock;
+};
+// The base is constructed with ( false, false ).
+// NOTE(review): presumably ( multi_acquisition, strict_owner ) -- confirm against the blocking_lock constructor.
+static inline void ?{}( single_acquisition_lock & this ) {
+        ((blocking_lock &)this){ false, false };
+}
+static inline void ^?{}( single_acquisition_lock & this ) {}
+static inline void lock( single_acquisition_lock & this ) {
+        lock( (blocking_lock &)this );
+}
+static inline void unlock( single_acquisition_lock & this ) {
+        unlock( (blocking_lock &)this );
+}
+static inline void on_wait( single_acquisition_lock & this ) {
+        on_wait( (blocking_lock &)this );
+}
+static inline void on_notify( single_acquisition_lock & this, struct $thread * t ) {
+        on_notify( (blocking_lock &)this, t );
+}
+static inline void set_recursion_count( single_acquisition_lock & this, size_t recursion ) {
+        set_recursion_count( (blocking_lock &)this, recursion );
+}
+static inline size_t get_recursion_count( single_acquisition_lock & this ) {
+        return get_recursion_count( (blocking_lock &)this );
+}
+//----------
+// owner_lock: a blocking_lock with a fixed configuration.
+// Every routine below only forwards to the blocking_lock overload of the same
+// name by casting the reference to the inlined base.
+struct owner_lock {
+        inline blocking_lock;
+};
+// The base is constructed with ( true, true ).
+// NOTE(review): presumably ( multi_acquisition, strict_owner ) -- confirm against the blocking_lock constructor.
+static inline void ?{}( owner_lock & this ) {
+        ((blocking_lock &)this){ true, true };
+}
+static inline void ^?{}( owner_lock & this ) {}
+static inline void lock( owner_lock & this ) {
+        lock( (blocking_lock &)this );
+}
+static inline void unlock( owner_lock & this ) {
+        unlock( (blocking_lock &)this );
+}
+static inline void on_wait( owner_lock & this ) {
+        on_wait( (blocking_lock &)this );
+}
+static inline void on_notify( owner_lock & this, struct $thread * t ) {
+        on_notify( (blocking_lock &)this, t );
+}
+static inline void set_recursion_count( owner_lock & this, size_t recursion ) {
+        set_recursion_count( (blocking_lock &)this, recursion );
+}
+static inline size_t get_recursion_count( owner_lock & this ) {
+        return get_recursion_count( (blocking_lock &)this );
+}
 //-----------------------------------------------------------------------------
 // is_blocking_lock
 trait is_blocking_lock(dtype L | sized(L)) {
+trait is_blocking_lock(L & | sized(L)) {
         // For synchronization locks to use when acquiring
         void on_notify( L &, struct $thread * );
 …
 // the info thread is a wrapper around a thread used
 // to store extra data for use in the condition variable
 forall(dtype L | is_blocking_lock(L)) {
+forall(L & | is_blocking_lock(L)) {
         struct info_thread;
 …
 //-----------------------------------------------------------------------------
-// Blocking Locks
-struct blocking_lock {
-        // Spin lock used for mutual exclusion
-        __spinlock_t lock;
-        // List of blocked threads
-        Sequence( $thread ) blocked_threads;
-        // Count of current blocked threads
-        size_t wait_count;
-        // Flag if the lock allows multiple acquisition
-        bool multi_acquisition;
-        // Flag if lock can be released by non owner
-        bool strict_owner;
-        // Current thread owning the lock
-        struct $thread * owner;
-        // Number of recursion level
-        size_t recursion_count;
-};
-struct single_acquisition_lock {
-        inline blocking_lock;
-};
-struct owner_lock {
-        inline blocking_lock;
-};
-struct multiple_acquisition_lock {
-        inline blocking_lock;
-};
-void  ?{}( blocking_lock & this, bool multi_acquisition, bool strict_owner );
-void ^?{}( blocking_lock & this );
-void  ?{}( single_acquisition_lock & this );
-void ^?{}( single_acquisition_lock & this );
-void  ?{}( owner_lock & this );
-void ^?{}( owner_lock & this );
-void  ?{}( multiple_acquisition_lock & this );
-void ^?{}( multiple_acquisition_lock & this );
-void lock( blocking_lock & this );
-bool try_lock( blocking_lock & this );
-void unlock( blocking_lock & this );
-void on_notify( blocking_lock & this, struct $thread * t );
-void on_wait( blocking_lock & this );
-size_t wait_count( blocking_lock & this );
-void set_recursion_count( blocking_lock & this, size_t recursion );
-size_t get_recursion_count( blocking_lock & this );
-void lock( single_acquisition_lock & this );
-void unlock( single_acquisition_lock & this );
-void on_notify( single_acquisition_lock & this, struct $thread * t );
-void on_wait( single_acquisition_lock & this );
-void set_recursion_count( single_acquisition_lock & this, size_t recursion );
-size_t get_recursion_count( single_acquisition_lock & this );
-void lock( owner_lock & this );
-void unlock( owner_lock & this );
-void on_notify( owner_lock & this, struct $thread * t );
-void on_wait( owner_lock & this );
-void set_recursion_count( owner_lock & this, size_t recursion );
-size_t get_recursion_count( owner_lock & this );
-void lock( multiple_acquisition_lock & this );
-void unlock( multiple_acquisition_lock & this );
-void on_notify( multiple_acquisition_lock & this, struct $thread * t );
-void on_wait( multiple_acquisition_lock & this );
-void set_recursion_count( multiple_acquisition_lock & this, size_t recursion );
-size_t get_recursion_count( multiple_acquisition_lock & this );
-//-----------------------------------------------------------------------------
 // Synchronization Locks
 forall(dtype L | is_blocking_lock(L)) {
+forall(L & | is_blocking_lock(L)) {
         struct condition_variable {
                 // Spin lock used for mutual exclusion
 …
         bool wait( condition_variable(L) & this, L & l, uintptr_t info, Time time );
+}
+//-----------------------------------------------------------------------------
+// Semaphore
+struct semaphore { // counting semaphore: P() takes a permit, V() gives permits back
+        __spinlock_t lock; // spin lock taken by P() and V() around every access to count and waiting
+        int count; // permit counter; P() queues the caller when its decrement makes this negative
+        __queue_t($thread) waiting; // threads queued by P(); V() removes them from the head
+};
+void  ?{}(semaphore & this, int count = 1); // count is the initial permit count, 1 when omitted
+void ^?{}(semaphore & this);
+bool   P (semaphore & this); // returns true when the caller had to park, false otherwise
+bool   V (semaphore & this); // returns true when a queued thread was woken, false otherwise
+bool   V (semaphore & this, unsigned count); // adds `count` permits in a single call

libcfa/src/concurrency/monitor.cfa

-              rb6a8b31
+              rd95969a
 static inline [$thread *, int] search_entry_queue( const __waitfor_mask_t &, $monitor * monitors [], __lock_size_t count );
 forall(dtype T | sized( T ))
+forall(T & | sized( T ))
 static inline __lock_size_t insert_unique( T * array [], __lock_size_t & size, T * val );
 static inline __lock_size_t count_max    ( const __waitfor_mask_t & mask );
 …
+}
 forall(dtype T | sized( T ))
+forall(T & | sized( T ))
 static inline __lock_size_t insert_unique( T * array [], __lock_size_t & size, T * val ) {
         if( !val ) return size;

libcfa/src/concurrency/monitor.hfa

-              rb6a8b31
+              rd95969a
 #include "stdlib.hfa"
 trait is_monitor(dtype T) {
+trait is_monitor(T &) {
         $monitor * get_monitor( T & );
         void ^?{}( T & mutex );
 …
 void ^?{}( monitor_dtor_guard_t & this );
 static inline forall( dtype T | sized(T) | { void ^?{}( T & mutex ); } )
+static inline forall( T & | sized(T) | { void ^?{}( T & mutex ); } )
 void delete( T * th ) {
         ^(*th){};

libcfa/src/concurrency/mutex.cfa

-              rb6a8b31
+              rd95969a
+}
 forall(dtype L | is_lock(L))
+forall(L & | is_lock(L))
 void wait(condition_variable & this, L & l) {
         lock( this.lock __cfaabi_dbg_ctx2 );
 …
 //-----------------------------------------------------------------------------
 // Scopes
 forall(dtype L | is_lock(L))
+forall(L & | is_lock(L))
 void lock_all  ( L * locks[], size_t count) {
         // Sort locks based on addresses
 …
+}
 forall(dtype L | is_lock(L))
+forall(L & | is_lock(L))
 void unlock_all( L * locks[], size_t count) {
         // Lock all

libcfa/src/concurrency/mutex.hfa

-              rb6a8b31
+              rd95969a
 };
 void ?{}(mutex_lock & this);
 void ^?{}(mutex_lock & this);
 void lock(mutex_lock & this);
 bool try_lock(mutex_lock & this);
 void unlock(mutex_lock & this);
+void ?{}(mutex_lock & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
+void ^?{}(mutex_lock & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
+void lock(mutex_lock & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
+bool try_lock(mutex_lock & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
+void unlock(mutex_lock & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
 // Exclusive lock - recursive
 …
 };
 void ?{}(recursive_mutex_lock & this);
 void ^?{}(recursive_mutex_lock & this);
 void lock(recursive_mutex_lock & this);
 bool try_lock(recursive_mutex_lock & this);
 void unlock(recursive_mutex_lock & this);
+void ?{}(recursive_mutex_lock & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
+void ^?{}(recursive_mutex_lock & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
+void lock(recursive_mutex_lock & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
+bool try_lock(recursive_mutex_lock & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
+void unlock(recursive_mutex_lock & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
 trait is_lock(dtype L | sized(L)) {
+trait is_lock(L & | sized(L)) {
         void lock  (L &);
         void unlock(L &);
 …
 };
 void ?{}(condition_variable & this);
 void ^?{}(condition_variable & this);
+void ?{}(condition_variable & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
+void ^?{}(condition_variable & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
 void notify_one(condition_variable & this);
 void notify_all(condition_variable & this);
+void notify_one(condition_variable & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
+void notify_all(condition_variable & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
 void wait(condition_variable & this);
+void wait(condition_variable & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
 forall(dtype L | is_lock(L))
 void wait(condition_variable & this, L & l);
+forall(L & | is_lock(L))
+void wait(condition_variable & this, L & l) __attribute__((deprecated("use concurrency/locks.hfa instead")));
 //-----------------------------------------------------------------------------
 // Scopes
 forall(dtype L | is_lock(L)) {
+forall(L & | is_lock(L)) {
         #if !defined( __TUPLE_ARRAYS_EXIST__ )
         void lock  ( L * locks [], size_t count);

libcfa/src/concurrency/preemption.cfa

-              rb6a8b31
+              rd95969a
+}
+// Prevent preemption since we are about to start terminating things
+void __kernel_abort_lock(void) {
+        signal_block( SIGUSR1 ); // NOTE(review): SIGUSR1 is presumably the signal that delivers preemption -- confirm
+}
 // Raii ctor/dtor for the preemption_scope
 // Used by thread to control when they want to receive preemption signals

libcfa/src/concurrency/stats.hfa

-              rb6a8b31
+              rd95969a
 #include <stdint.h>
+enum { // declared ahead of the __CFA_NO_STATISTICS__ check, so visible in both configurations
+        CFA_STATS_READY_Q  = 0x01, // NOTE(review): presumably selects ready-queue statistics -- confirm
+        CFA_STATS_IO = 0x02, // separate bit from CFA_STATS_READY_Q; NOTE(review): presumably selects I/O statistics -- confirm
+};
 #if defined(__CFA_NO_STATISTICS__)
 …
         static inline void __print_stats( struct __stats_t *, int, const char *, const char *, void * ) {}
 #else
-        enum {
-                CFA_STATS_READY_Q  = 0x01,
-                #if defined(CFA_HAVE_LINUX_IO_URING_H)
-                        CFA_STATS_IO = 0x02,
-                #endif
-        };
         struct __attribute__((aligned(64))) __stats_readQ_t {

libcfa/src/concurrency/thread.cfa

-              rb6a8b31
+              rd95969a
+}
 FORALL_DATA_INSTANCE(ThreadCancelled, (dtype thread_t), (thread_t))
+FORALL_DATA_INSTANCE(ThreadCancelled, (thread_t &), (thread_t))
 forall(dtype T)
+forall(T &)
 void copy(ThreadCancelled(T) * dst, ThreadCancelled(T) * src) {
         dst->virtual_table = src->virtual_table;
 …
+}
 forall(dtype T)
+forall(T &)
 const char * msg(ThreadCancelled(T) *) {
         return "ThreadCancelled";
+}
 forall(dtype T)
+forall(T &)
 static void default_thread_cancel_handler(ThreadCancelled(T) & ) {
         abort( "Unhandled thread cancellation.\n" );
+}
 forall(dtype T | is_thread(T) | IS_EXCEPTION(ThreadCancelled, (T)))
+forall(T & | is_thread(T) | IS_EXCEPTION(ThreadCancelled, (T)))
 void ?{}( thread_dtor_guard_t & this,
                 T & thrd, void(*defaultResumptionHandler)(ThreadCancelled(T) &)) {
         $monitor * m = get_monitor(thrd);
+                T & thrd, void(*cancelHandler)(ThreadCancelled(T) &)) {
+        $monitor * m = get_monitor(thrd);
         $thread * desc = get_thread(thrd);
         // Setup the monitor guard
         void (*dtor)(T& mutex this) = ^?{};
         bool join = defaultResumptionHandler != (void(*)(ThreadCancelled(T)&))0;
+        bool join = cancelHandler != (void(*)(ThreadCancelled(T)&))0;
         (this.mg){&m, (void(*)())dtor, join};
 …
+        }
         desc->state = Cancelled;
+        if (!join) {
+                defaultResumptionHandler = default_thread_cancel_handler;
+        }
+        void(*defaultResumptionHandler)(ThreadCancelled(T) &) =
+                join ? cancelHandler : default_thread_cancel_handler;
         ThreadCancelled(T) except;
 …
 //-----------------------------------------------------------------------------
 // Starting and stopping threads
 forall( dtype T | is_thread(T) )
+forall( T & | is_thread(T) )
 void __thrd_start( T & this, void (*main_p)(T &) ) {
         $thread * this_thrd = get_thread(this);
 …
 //-----------------------------------------------------------------------------
 // Support for threads that don't use the thread keyword
 forall( dtype T | sized(T) | is_thread(T) | { void ?{}(T&); } )
+forall( T & | sized(T) | is_thread(T) | { void ?{}(T&); } )
 void ?{}( scoped(T)& this ) with( this ) {
         handle{};
 …
+}
 forall( dtype T, ttype P | sized(T) | is_thread(T) | { void ?{}(T&, P); } )
+forall( T &, P... | sized(T) | is_thread(T) | { void ?{}(T&, P); } )
 void ?{}( scoped(T)& this, P params ) with( this ) {
         handle{ params };
 …
+}
 forall( dtype T | sized(T) | is_thread(T) )
+forall( T & | sized(T) | is_thread(T) )
 void ^?{}( scoped(T)& this ) with( this ) {
         ^handle{};
 …
 //-----------------------------------------------------------------------------
 forall(dtype T | is_thread(T) | IS_RESUMPTION_EXCEPTION(ThreadCancelled, (T)))
+forall(T & | is_thread(T) | IS_RESUMPTION_EXCEPTION(ThreadCancelled, (T)))
 T & join( T & this ) {
         thread_dtor_guard_t guard = { this, defaultResumptionHandler };

libcfa/src/concurrency/thread.hfa

-              rb6a8b31
+              rd95969a
 //-----------------------------------------------------------------------------
 // thread trait
 trait is_thread(dtype T) {
+trait is_thread(T &) {
         void ^?{}(T& mutex this);
         void main(T& this);
 …
 };
 FORALL_DATA_EXCEPTION(ThreadCancelled, (dtype thread_t), (thread_t)) (
+FORALL_DATA_EXCEPTION(ThreadCancelled, (thread_t &), (thread_t)) (
         thread_t * the_thread;
         exception_t * the_exception;
 );
 forall(dtype T)
+forall(T &)
 void copy(ThreadCancelled(T) * dst, ThreadCancelled(T) * src);
 forall(dtype T)
+forall(T &)
 const char * msg(ThreadCancelled(T) *);
 …
 // Inline getters for threads/coroutines/monitors
 forall( dtype T | is_thread(T) )
+forall( T & | is_thread(T) )
 static inline $coroutine* get_coroutine(T & this) __attribute__((const)) { return &get_thread(this)->self_cor; }
 forall( dtype T | is_thread(T) )
+forall( T & | is_thread(T) )
 static inline $monitor  * get_monitor  (T & this) __attribute__((const)) { return &get_thread(this)->self_mon; }
 …
 extern struct cluster * mainCluster;
 forall( dtype T | is_thread(T) )
+forall( T & | is_thread(T) )
 void __thrd_start( T & this, void (*)(T &) );
 …
 };
 forall( dtype T | is_thread(T) | IS_EXCEPTION(ThreadCancelled, (T)) )
+forall( T & | is_thread(T) | IS_EXCEPTION(ThreadCancelled, (T)) )
 void ?{}( thread_dtor_guard_t & this, T & thrd, void(*)(ThreadCancelled(T) &) );
 void ^?{}( thread_dtor_guard_t & this );
 …
 // thread runner
 // Structure that actually start and stop threads
 forall( dtype T | sized(T) | is_thread(T) )
+forall( T & | sized(T) | is_thread(T) )
 struct scoped {
         T handle;
 };
 forall( dtype T | sized(T) | is_thread(T) | { void ?{}(T&); } )
+forall( T & | sized(T) | is_thread(T) | { void ?{}(T&); } )
 void ?{}( scoped(T)& this );
 forall( dtype T, ttype P | sized(T) | is_thread(T) | { void ?{}(T&, P); } )
+forall( T &, P... | sized(T) | is_thread(T) | { void ?{}(T&, P); } )
 void ?{}( scoped(T)& this, P params );
 forall( dtype T | sized(T) | is_thread(T) )
+forall( T & | sized(T) | is_thread(T) )
 void ^?{}( scoped(T)& this );
 …
 void unpark( $thread * this );
 forall( dtype T | is_thread(T) )
+forall( T & | is_thread(T) )
 static inline void unpark( T & this ) { if(!&this) return; unpark( get_thread( this ) );}
 …
 //----------
 // join
 forall( dtype T | is_thread(T) | IS_RESUMPTION_EXCEPTION(ThreadCancelled, (T)) )
+forall( T & | is_thread(T) | IS_RESUMPTION_EXCEPTION(ThreadCancelled, (T)) )
 T & join( T & this );

libcfa/src/containers/list.hfa

-              rb6a8b31
+              rd95969a
 #define __DLISTED_MGD_JUSTIMPL(STRUCT)
 forall( dtype tE ) {
+forall( tE & ) {
         struct $mgd_link {
                 tE *elem;
 …
                 (this.is_terminator){ 1 };
+        }
         forall ( otype tInit | { void ?{}( $mgd_link(tE) &, tInit); } )
+        forall ( tInit | { void ?{}( $mgd_link(tE) &, tInit); } )
         static inline void ?=?( $mgd_link(tE) &this, tInit i ) {
                 ^?{}( this );
 …
   __DLISTED_MGD_COMMON(STRUCT, STRUCT, $links)
 trait $dlistable(dtype Tnode, dtype Telem) {
+trait $dlistable(Tnode &, Telem &) {
         $mgd_link(Telem) & $prev_link(Tnode &);
         $mgd_link(Telem) & $next_link(Tnode &);
 …
 };
 forall (dtype Tnode, dtype Telem | $dlistable(Tnode, Telem)) {
+forall (Tnode &, Telem & | $dlistable(Tnode, Telem)) {
         // implemented as a sentinel item in an underlying circular list

libcfa/src/containers/maybe.cfa

-              rb6a8b31
+              rd95969a
 forall(otype T)
+forall(T)
 void ?{}(maybe(T) & this) {
         this.has_value = false;
+}
 forall(otype T)
+forall(T)
 void ?{}(maybe(T) & this, T value) {
         this.has_value = true;
 …
+}
 forall(otype T)
+forall(T)
 void ?{}(maybe(T) & this, maybe(T) other) {
         this.has_value = other.has_value;
 …
+}
 forall(otype T)
+forall(T)
 maybe(T) ?=?(maybe(T) & this, maybe(T) that) {
         if (this.has_value && that.has_value) {
 …
+}
 forall(otype T)
+forall(T)
 void ^?{}(maybe(T) & this) {
         if (this.has_value) {
 …
+}
 forall(otype T)
+forall(T)
 bool ?!=?(maybe(T) this, zero_t) {
         return this.has_value;
+}
 forall(otype T)
+forall(T)
 maybe(T) maybe_value(T value) {
         return (maybe(T)){value};
+}
 forall(otype T)
+forall(T)
 maybe(T) maybe_none() {
         return (maybe(T)){};
+}
 forall(otype T)
+forall(T)
 bool has_value(maybe(T) * this) {
         return this->has_value;
+}
 forall(otype T)
+forall(T)
 T get(maybe(T) * this) {
         assertf(this->has_value, "attempt to get from maybe without value");
 …
+}
 forall(otype T)
+forall(T)
 void set(maybe(T) * this, T value) {
         if (this->has_value) {
 …
+}
 forall(otype T)
+forall(T)
 void set_none(maybe(T) * this) {
         if (this->has_value) {

libcfa/src/containers/maybe.hfa

-              rb6a8b31
+              rd95969a
 // DO NOT USE DIRECTLY!
 forall(otype T)
+forall(T)
 struct maybe {
     bool has_value;
 …
 forall(otype T)
+forall(T)
 void ?{}(maybe(T) & this);
 forall(otype T)
+forall(T)
 void ?{}(maybe(T) & this, T value);
 forall(otype T)
+forall(T)
 void ?{}(maybe(T) & this, maybe(T) other);
 forall(otype T)
+forall(T)
 void ^?{}(maybe(T) & this);
 forall(otype T)
+forall(T)
 maybe(T) ?=?(maybe(T) & this, maybe(T) other);
 forall(otype T)
+forall(T)
 bool ?!=?(maybe(T) this, zero_t);
 /* Waiting for bug#11 to be fixed.
 forall(otype T)
+forall(T)
 maybe(T) maybe_value(T value);
 forall(otype T)
+forall(T)
 maybe(T) maybe_none();
 */
 forall(otype T)
+forall(T)
 bool has_value(maybe(T) * this);
 forall(otype T)
+forall(T)
 T get(maybe(T) * this);
 forall(otype T)
+forall(T)
 void set(maybe(T) * this, T value);
 forall(otype T)
+forall(T)
 void set_none(maybe(T) * this);

libcfa/src/containers/pair.cfa

-              rb6a8b31
+              rd95969a
 #include <containers/pair.hfa>
 forall(otype R, otype S
+forall(R, S
         | { int ?==?(R, R); int ?<?(R, R); int ?<?(S, S); })
 int ?<?(pair(R, S) p, pair(R, S) q) {
 …
+}
 forall(otype R, otype S
+forall(R, S
         | { int ?==?(R, R); int ?<?(R, R); int ?<=?(S, S); })
 int ?<=?(pair(R, S) p, pair(R, S) q) {
 …
+}
 forall(otype R, otype S | { int ?==?(R, R); int ?==?(S, S); })
+forall(R, S | { int ?==?(R, R); int ?==?(S, S); })
 int ?==?(pair(R, S) p, pair(R, S) q) {
         return p.first == q.first && p.second == q.second;
+}
 forall(otype R, otype S | { int ?!=?(R, R); int ?!=?(S, S); })
+forall(R, S | { int ?!=?(R, R); int ?!=?(S, S); })
 int ?!=?(pair(R, S) p, pair(R, S) q) {
         return p.first != q.first || p.second != q.second;
+}
 forall(otype R, otype S
+forall(R, S
         | { int ?==?(R, R); int ?>?(R, R); int ?>?(S, S); })
 int ?>?(pair(R, S) p, pair(R, S) q) {
 …
+}
 forall(otype R, otype S
+forall(R, S
         | { int ?==?(R, R); int ?>?(R, R); int ?>=?(S, S); })
 int ?>=?(pair(R, S) p, pair(R, S) q) {

libcfa/src/containers/pair.hfa

-              rb6a8b31
+              rd95969a
 #pragma once
 forall(otype R, otype S) struct pair {
+forall(R, S) struct pair {
         R first;
         S second;
 };
 forall(otype R, otype S
+forall(R, S
         | { int ?==?(R, R); int ?<?(R, R); int ?<?(S, S); })
 int ?<?(pair(R, S) p, pair(R, S) q);
 forall(otype R, otype S
+forall(R, S
         | { int ?==?(R, R); int ?<?(R, R); int ?<=?(S, S); })
 int ?<=?(pair(R, S) p, pair(R, S) q);
 forall(otype R, otype S | { int ?==?(R, R); int ?==?(S, S); })
+forall(R, S | { int ?==?(R, R); int ?==?(S, S); })
 int ?==?(pair(R, S) p, pair(R, S) q);
 forall(otype R, otype S | { int ?!=?(R, R); int ?!=?(S, S); })
+forall(R, S | { int ?!=?(R, R); int ?!=?(S, S); })
 int ?!=?(pair(R, S) p, pair(R, S) q);
 forall(otype R, otype S
+forall(R, S
         | { int ?==?(R, R); int ?>?(R, R); int ?>?(S, S); })
 int ?>?(pair(R, S) p, pair(R, S) q);
 forall(otype R, otype S
+forall(R, S
         | { int ?==?(R, R); int ?>?(R, R); int ?>=?(S, S); })
 int ?>=?(pair(R, S) p, pair(R, S) q);

libcfa/src/containers/result.cfa

-              rb6a8b31
+              rd95969a
 forall(otype T, otype E)
+forall(T, E)
 void ?{}(result(T, E) & this) {
         this.has_value = false;
 …
+}
 forall(otype T, otype E)
+forall(T, E)
 void ?{}(result(T, E) & this, one_t, T value) {
         this.has_value = true;
 …
+}
 forall(otype T, otype E)
+forall(T, E)
 void ?{}(result(T, E) & this, zero_t, E error) {
         this.has_value = false;
 …
+}
 forall(otype T, otype E)
+forall(T, E)
 void ?{}(result(T, E) & this, result(T, E) other) {
         this.has_value = other.has_value;
 …
+}
 forall(otype T, otype E)
+forall(T, E)
 result(T, E) ?=?(result(T, E) & this, result(T, E) that) {
         if (this.has_value && that.has_value) {
 …
+}
 forall(otype T, otype E)
+forall(T, E)
 void ^?{}(result(T, E) & this) {
         if (this.has_value) {
 …
+}
 forall(otype T, otype E)
+forall(T, E)
 bool ?!=?(result(T, E) this, zero_t) {
         return this.has_value;
+}
 forall(otype T, otype E)
+forall(T, E)
 result(T, E) result_value(T value) {
         return (result(T, E)){1, value};
+}
 forall(otype T, otype E)
+forall(T, E)
 result(T, E) result_error(E error) {
         return (result(T, E)){0, error};
+}
 forall(otype T, otype E)
+forall(T, E)
 bool has_value(result(T, E) * this) {
         return this->has_value;
+}
 forall(otype T, otype E)
+forall(T, E)
 T get(result(T, E) * this) {
         assertf(this->has_value, "attempt to get from result without value");
 …
+}
 forall(otype T, otype E)
+forall(T, E)
 E get_error(result(T, E) * this) {
         assertf(!this->has_value, "attempt to get from result without error");
 …
+}
 forall(otype T, otype E)
+forall(T, E)
 void set(result(T, E) * this, T value) {
         if (this->has_value) {
 …
+}
 forall(otype T, otype E)
+forall(T, E)
 void set_error(result(T, E) * this, E error) {
         if (this->has_value) {

libcfa/src/containers/result.hfa

-              rb6a8b31
+              rd95969a
 // DO NOT USE DIRECTLY!
 forall(otype T, otype E)
+forall(T, E)
 union inner_result{
         T value;
 …
 };
 forall(otype T, otype E)
+forall(T, E)
 struct result {
         bool has_value;
 …
 forall(otype T, otype E)
+forall(T, E)
 void ?{}(result(T, E) & this);
 forall(otype T, otype E)
+forall(T, E)
 void ?{}(result(T, E) & this, one_t, T value);
 forall(otype T, otype E)
+forall(T, E)
 void ?{}(result(T, E) & this, zero_t, E error);
 forall(otype T, otype E)
+forall(T, E)
 void ?{}(result(T, E) & this, result(T, E) other);
 forall(otype T, otype E)
+forall(T, E)
 void ^?{}(result(T, E) & this);
 forall(otype T, otype E)
+forall(T, E)
 result(T, E) ?=?(result(T, E) & this, result(T, E) other);
 forall(otype T, otype E)
+forall(T, E)
 bool ?!=?(result(T, E) this, zero_t);
 /* Waiting for bug#11 to be fixed.
 forall(otype T, otype E)
+forall(T, E)
 result(T, E) result_value(T value);
 forall(otype T, otype E)
+forall(T, E)
 result(T, E) result_error(E error);
 */
 forall(otype T, otype E)
+forall(T, E)
 bool has_value(result(T, E) * this);
 forall(otype T, otype E)
+forall(T, E)
 T get(result(T, E) * this);
 forall(otype T, otype E)
+forall(T, E)
 E get_error(result(T, E) * this);
 forall(otype T, otype E)
+forall(T, E)
 void set(result(T, E) * this, T value);
 forall(otype T, otype E)
+forall(T, E)
 void set_error(result(T, E) * this, E error);

libcfa/src/containers/stackLockFree.hfa

-              rb6a8b31
+              rd95969a
 // Created On       : Wed May 13 20:58:58 2020
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Sun Jun 14 13:25:09 2020
 // Update Count     : 64
+// Last Modified On : Wed Jan 20 20:40:03 2021
+// Update Count     : 67
 //
 …
 #include <stdint.h>
 forall( dtype T )
+forall( T & )
 union Link {
         struct {                                                                                        // 32/64-bit x 2
 …
 }; // Link
 forall( otype T | sized(T) | { Link(T) * ?`next( T * ); } ) {
+forall( T | sized(T) | { Link(T) * ?`next( T * ); } ) {
         struct StackLF {
                 Link(T) stack;
 …
                 void push( StackLF(T) & this, T & n ) with(this) {
                         *( &n )`next = stack;                                   // atomic assignment unnecessary, or use CAA
+                        *( &n )`next = stack;                                           // atomic assignment unnecessary, or use CAA
                         for () {                                                                        // busy wait
                           if ( __atomic_compare_exchange_n( &stack.atom, &( &n )`next->atom, (Link(T))@{ {&n, ( &n )`next->count + 1} }.atom, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST ) ) break; // attempt to update top node
 …
+                                }
                                 if( next == 0p ) return false;
                                 link = (next)`next;
+                                link = ( next )`next;
+                        }
+                }

libcfa/src/containers/vector.cfa

rb6a8b31	rd95969a
18	18	#include <stdlib.hfa>
19	19
20		forall(~~otype T, otype~~ allocator_t \| allocator_c(T, allocator_t))
	20	forall(T, allocator_t \| allocator_c(T, allocator_t))
21	21	void copy_internal(vector(T, allocator_t)* this, vector(T, allocator_t)* other);
22	22
23	23	//------------------------------------------------------------------------------
24	24	//Initialization
25		forall(~~otype T, otype~~ allocator_t \| allocator_c(T, allocator_t))
	25	forall(T, allocator_t \| allocator_c(T, allocator_t))
26	26	void ?{}(vector(T, allocator_t)& this)
27	27	{
…	…
30	30	}
31	31
32		forall(~~otype T, otype~~ allocator_t \| allocator_c(T, allocator_t))
	32	forall(T, allocator_t \| allocator_c(T, allocator_t))
33	33	void ?{}(vector(T, allocator_t)& this, vector(T, allocator_t) rhs)
34	34	{
…	…
37	37	}
38	38
39		// forall(~~otype T, otype~~ allocator_t \| allocator_c(T, allocator_t))
	39	// forall(T, allocator_t \| allocator_c(T, allocator_t))
40	40	// vector(T, allocator_t) ?=?(vector(T, allocator_t)* this, vector(T, allocator_t) rhs)
41	41	// {
…	…
45	45	// }
46	46
47		forall(~~otype T, otype~~ allocator_t \| allocator_c(T, allocator_t))
	47	forall(T, allocator_t \| allocator_c(T, allocator_t))
48	48	void ^?{}(vector(T, allocator_t)& this)
49	49	{
…	…
54	54	//------------------------------------------------------------------------------
55	55	//Modifiers
56		forall(~~otype T, otype~~ allocator_t \| allocator_c(T, allocator_t))
	56	forall(T, allocator_t \| allocator_c(T, allocator_t))
57	57	void push_back(vector(T, allocator_t)* this, T value)
58	58	{
…	…
62	62	}
63	63
64		forall(~~otype T, otype~~ allocator_t \| allocator_c(T, allocator_t))
	64	forall(T, allocator_t \| allocator_c(T, allocator_t))
65	65	void pop_back(vector(T, allocator_t)* this)
66	66	{
…	…
69	69	}
70	70
71		forall(~~otype T, otype~~ allocator_t \| allocator_c(T, allocator_t))
	71	forall(T, allocator_t \| allocator_c(T, allocator_t))
72	72	void clear(vector(T, allocator_t)* this)
73	73	{
…	…
82	82	//Internal Helpers
83	83
84		forall(~~otype T, otype~~ allocator_t \| allocator_c(T, allocator_t))
	84	forall(T, allocator_t \| allocator_c(T, allocator_t))
85	85	void copy_internal(vector(T, allocator_t)* this, vector(T, allocator_t)* other)
86	86	{
…	…
93	93	//------------------------------------------------------------------------------
94	94	//Allocator
95		forall(~~otype~~ T)
	95	forall(T)
96	96	void ?{}(heap_allocator(T)& this)
97	97	{
…	…
100	100	}
101	101
102		forall(~~otype~~ T)
	102	forall(T)
103	103	void ?{}(heap_allocator(T)& this, heap_allocator(T) rhs)
104	104	{
…	…
107	107	}
108	108
109		forall(~~otype~~ T)
	109	forall(T)
110	110	heap_allocator(T) ?=?(heap_allocator(T)& this, heap_allocator(T) rhs)
111	111	{
…	…
115	115	}
116	116
117		forall(~~otype~~ T)
	117	forall(T)
118	118	void ^?{}(heap_allocator(T)& this)
119	119	{
…	…
121	121	}
122	122
123		forall(~~otype~~ T)
	123	forall(T)
124	124	inline void realloc_storage(heap_allocator(T)* this, size_t size)
125	125	{

libcfa/src/containers/vector.hfa

-              rb6a8b31
+              rd95969a
 //------------------------------------------------------------------------------
 //Allocator
 forall(otype T)
+forall(T)
 struct heap_allocator
+{
 …
 };
 forall(otype T)
+forall(T)
 void ?{}(heap_allocator(T)& this);
 forall(otype T)
+forall(T)
 void ?{}(heap_allocator(T)& this, heap_allocator(T) rhs);
 forall(otype T)
+forall(T)
 heap_allocator(T) ?=?(heap_allocator(T)& this, heap_allocator(T) rhs);
 forall(otype T)
+forall(T)
 void ^?{}(heap_allocator(T)& this);
 forall(otype T)
+forall(T)
 void realloc_storage(heap_allocator(T)* this, size_t size);
 forall(otype T)
+forall(T)
 static inline T* data(heap_allocator(T)* this)
+{
 …
 //------------------------------------------------------------------------------
 //Declaration
 trait allocator_c(otype T, otype allocator_t)
+trait allocator_c(T, allocator_t)
+{
         void realloc_storage(allocator_t*, size_t);
 …
 };
 forall(otype T, otype allocator_t = heap_allocator(T) | allocator_c(T, allocator_t))
+forall(T, allocator_t = heap_allocator(T) | allocator_c(T, allocator_t))
 struct vector;
 //------------------------------------------------------------------------------
 //Initialization
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void ?{}(vector(T, allocator_t)& this);
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void ?{}(vector(T, allocator_t)& this, vector(T, allocator_t) rhs);
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 vector(T, allocator_t) ?=?(vector(T, allocator_t)& this, vector(T, allocator_t) rhs);
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void ^?{}(vector(T, allocator_t)& this);
 forall(otype T, otype allocator_t = heap_allocator(T) | allocator_c(T, allocator_t))
+forall(T, allocator_t = heap_allocator(T) | allocator_c(T, allocator_t))
 struct vector
+{
 …
 //------------------------------------------------------------------------------
 //Capacity
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline bool empty(vector(T, allocator_t)* this)
+{
 …
+}
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline size_t size(vector(T, allocator_t)* this)
+{
 …
+}
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline void reserve(vector(T, allocator_t)* this, size_t size)
+{
 …
 //------------------------------------------------------------------------------
 //Element access
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline T at(vector(T, allocator_t)* this, size_t index)
+{
 …
+}
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline T ?[?](vector(T, allocator_t)* this, size_t index)
+{
 …
+}
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline T front(vector(T, allocator_t)* this)
+{
 …
+}
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline T back(vector(T, allocator_t)* this)
+{
 …
 //------------------------------------------------------------------------------
 //Modifiers
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void push_back(vector(T, allocator_t)* this, T value);
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void pop_back(vector(T, allocator_t)* this);
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void clear(vector(T, allocator_t)* this);
 //------------------------------------------------------------------------------
 //Iterators
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline T* begin(vector(T, allocator_t)* this)
+{
 …
+}
 // forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+// forall(T, allocator_t | allocator_c(T, allocator_t))
 // static inline const T* cbegin(const vector(T, allocator_t)* this)
 // {
 …
 // }
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline T* end(vector(T, allocator_t)* this)
+{
 …
+}
 // forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+// forall(T, allocator_t | allocator_c(T, allocator_t))
 // static inline const T* cend(const vector(T, allocator_t)* this)
 // {

libcfa/src/exception.h

-              rb6a8b31
+              rd95969a
 // implemented in the .c file either so they all have to be inline.
 trait is_exception(dtype exceptT, dtype virtualT) {
+trait is_exception(exceptT &, virtualT &) {
         /* The first field must be a pointer to a virtual table.
          * That virtual table must be a descendant of the base exception virtual table.
 …
 };
 trait is_termination_exception(dtype exceptT, dtype virtualT | is_exception(exceptT, virtualT)) {
+trait is_termination_exception(exceptT &, virtualT & | is_exception(exceptT, virtualT)) {
         void defaultTerminationHandler(exceptT &);
 };
 trait is_resumption_exception(dtype exceptT, dtype virtualT | is_exception(exceptT, virtualT)) {
+trait is_resumption_exception(exceptT &, virtualT & | is_exception(exceptT, virtualT)) {
         void defaultResumptionHandler(exceptT &);
 };
 forall(dtype exceptT, dtype virtualT | is_termination_exception(exceptT, virtualT))
+forall(exceptT &, virtualT & | is_termination_exception(exceptT, virtualT))
 static inline void $throw(exceptT & except) {
         __cfaehm_throw_terminate(
 …
+}
 forall(dtype exceptT, dtype virtualT | is_resumption_exception(exceptT, virtualT))
+forall(exceptT &, virtualT & | is_resumption_exception(exceptT, virtualT))
 static inline void $throwResume(exceptT & except) {
         __cfaehm_throw_resume(
 …
+}
 forall(dtype exceptT, dtype virtualT | is_exception(exceptT, virtualT))
+forall(exceptT &, virtualT & | is_exception(exceptT, virtualT))
 static inline void cancel_stack(exceptT & except) __attribute__((noreturn)) {
         __cfaehm_cancel_stack( (exception_t *)&except );
+}
 forall(dtype exceptT, dtype virtualT | is_exception(exceptT, virtualT))
+forall(exceptT &, virtualT & | is_exception(exceptT, virtualT))
 static inline void defaultTerminationHandler(exceptT & except) {
         return cancel_stack( except );
+}
 forall(dtype exceptT, dtype virtualT | is_exception(exceptT, virtualT))
+forall(exceptT &, virtualT & | is_exception(exceptT, virtualT))
 static inline void defaultResumptionHandler(exceptT & except) {
         throw except;

libcfa/src/executor.cfa

rb6a8b31	rd95969a
7	7	#include <containers/list.hfa>
8	8
9		forall( ~~dtype T~~ \| $dlistable(T, T) ) {
	9	forall( T & \| $dlistable(T, T) ) {
10	10	monitor Buffer { // unbounded buffer
11	11	dlist( T, T ) queue; // unbounded list of work requests

libcfa/src/gmp.hfa

-              rb6a8b31
+              rd95969a
         // I/O
         forall( dtype istype | istream( istype ) )
+        forall( istype & | istream( istype ) )
                 istype & ?|?( istype & is, Int & mp ) {
                 gmp_scanf( "%Zd", &mp );
 …
         } // ?|?
         forall( dtype ostype | ostream( ostype ) ) {
+        forall( ostype & | ostream( ostype ) ) {
                 ostype & ?|?( ostype & os, Int mp ) {
                         if ( $sepPrt( os ) ) fmt( os, "%s", $sepGetCur( os ) );

libcfa/src/interpose.cfa

-              rb6a8b31
+              rd95969a
                 // Failure handler
+                __cfaabi_sigaction( SIGSEGV, sigHandler_segv, SA_SIGINFO | SA_ONSTACK );
+                __cfaabi_sigaction( SIGBUS , sigHandler_segv, SA_SIGINFO | SA_ONSTACK );
+                __cfaabi_sigaction( SIGILL , sigHandler_ill , SA_SIGINFO | SA_ONSTACK );
+                __cfaabi_sigaction( SIGFPE , sigHandler_fpe , SA_SIGINFO | SA_ONSTACK );
+                __cfaabi_sigaction( SIGTERM, sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND ); // one shot handler, return to default
+                __cfaabi_sigaction( SIGINT , sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND );
+                __cfaabi_sigaction( SIGABRT, sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND );
+                __cfaabi_sigaction( SIGHUP , sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND ); // terminal hangup
+                 // internal errors
+                __cfaabi_sigaction( SIGSEGV, sigHandler_segv, SA_SIGINFO | SA_ONSTACK ); // Invalid memory reference (default: Core)
+                __cfaabi_sigaction( SIGBUS , sigHandler_segv, SA_SIGINFO | SA_ONSTACK ); // Bus error, bad memory access (default: Core)
+                __cfaabi_sigaction( SIGILL , sigHandler_ill , SA_SIGINFO | SA_ONSTACK ); // Illegal Instruction (default: Core)
+                __cfaabi_sigaction( SIGFPE , sigHandler_fpe , SA_SIGINFO | SA_ONSTACK ); // Floating-point exception (default: Core)
+                // handlers to outside errors
+                // reset in case they insist and send it over and over
+                __cfaabi_sigaction( SIGTERM, sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND ); // Termination signal (default: Term)
+                __cfaabi_sigaction( SIGINT , sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND ); // Interrupt from keyboard (default: Term)
+                __cfaabi_sigaction( SIGHUP , sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND ); // Hangup detected on controlling terminal or death of controlling process (default: Term)
+                __cfaabi_sigaction( SIGQUIT, sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND ); // Quit from keyboard (default: Core)
+                __cfaabi_sigaction( SIGABRT, sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND ); // Abort signal from abort(3) (default: Core)
+        }
+}
 …
+}
+void * kernel_abort( void ) __attribute__(( __nothrow__, __leaf__, __weak__ )) { return 0p; }
 void kernel_abort_msg( void * data, char buffer[], int size ) __attribute__(( __nothrow__, __leaf__, __weak__ )) {}
+// See concurrency/kernel.cfa for strong definition used in multi-processor mode.
 int kernel_abort_lastframe( void ) __attribute__(( __nothrow__, __leaf__, __weak__ )) { return 4; }
+// See concurrency/kernel.cfa and concurrency/preemption.cfa for strong definition used in multi-processor mode.
+void __kernel_abort_lock( void ) __attribute__(( __nothrow__, __leaf__, __weak__ )) {}
+void __kernel_abort_msg( char buffer[], int size ) __attribute__(( __nothrow__, __leaf__, __weak__ )) {}
+int __kernel_abort_lastframe( void ) __attribute__(( __nothrow__, __leaf__, __weak__ )) { return 4; }
 enum { abort_text_size = 1024 };
 …
 static void __cfaabi_backtrace( int start ) {
         enum { Frames = 50, };                                                          // maximum number of stack frames
         int last = kernel_abort_lastframe();                            // skip last N stack frames
+        int last = __kernel_abort_lastframe();                          // skip last N stack frames
         void * array[Frames];
 …
+}
 static volatile int __abort_stage = 0;
+static volatile bool __abort_first = 0;
 // Cannot forward va_list.
 void __abort( bool signalAbort, const char fmt[], va_list args ) {
+        int stage = __atomic_add_fetch( &__abort_stage, 1, __ATOMIC_SEQ_CST );
+        // First stage: stop the cforall kernel and print
+        if(stage == 1) {
+                // increment stage
+                stage = __atomic_add_fetch( &__abort_stage, 1, __ATOMIC_SEQ_CST );
+                // must be done here to lock down kernel
+                void * kernel_data = kernel_abort();
+                int len;
+                signal( SIGABRT, SIG_DFL );                                                     // prevent final "real" abort from recursing to handler
+                len = snprintf( abort_text, abort_text_size, "Cforall Runtime error (UNIX pid:%ld) ", (long int)getpid() ); // use UNIX pid (versus getPid)
+                __cfaabi_bits_write( STDERR_FILENO, abort_text, len );
+                assert( fmt );
+                len = vsnprintf( abort_text, abort_text_size, fmt, args );
+                __cfaabi_bits_write( STDERR_FILENO, abort_text, len );
+                // add optional newline if missing at the end of the format text
+                if ( fmt[strlen( fmt ) - 1] != '\n' ) {
+                        __cfaabi_bits_write( STDERR_FILENO, "\n", 1 );
+                } // if
+                kernel_abort_msg( kernel_data, abort_text, abort_text_size );
+        }
+        // Second stage: print the backtrace
+        if(stage == 2) {
+                // increment stage
+                stage = __atomic_add_fetch( &__abort_stage, 1, __ATOMIC_SEQ_CST );
+                // print stack trace in handler
+                __cfaabi_backtrace( signalAbort ? 4 : 2 );
+        }
+        do {
+                // Finally call abort
+        // Multiple threads can come here from multiple paths
+        // To make sure this is safe any concurrent/subsequent call to abort is redirected to libc-abort
+        bool first = ! __atomic_test_and_set( &__abort_first, __ATOMIC_SEQ_CST);
+        // Prevent preemption from kicking-in and messing with the abort
+        __kernel_abort_lock();
+        // first to abort ?
+        if ( !first ) {
+                // We aren't the first to abort just let C handle it
+                signal( SIGABRT, SIG_DFL );     // restore default in case we came here through the function.
                 __cabi_libc.abort();
+                // Loop so that we never return
+        } while(true);
+        }
+        int len = snprintf( abort_text, abort_text_size, "Cforall Runtime error (UNIX pid:%ld) ", (long int)getpid() ); // use UNIX pid (versus getPid)
+        __cfaabi_bits_write( STDERR_FILENO, abort_text, len );
+        // print the cause of the error
+        assert( fmt );
+        len = vsnprintf( abort_text, abort_text_size, fmt, args );
+        __cfaabi_bits_write( STDERR_FILENO, abort_text, len );
+        // add optional newline if missing at the end of the format text
+        if ( fmt[strlen( fmt ) - 1] != '\n' ) {
+                __cfaabi_bits_write( STDERR_FILENO, "\n", 1 );
+        } // if
+        // Give the kernel the chance to add some data in here
+        __kernel_abort_msg( abort_text, abort_text_size );
+        // print stack trace in handler
+        __cfaabi_backtrace( signalAbort ? 4 : 2 );
+        // Finally call abort
+        __cabi_libc.abort();
+}

libcfa/src/iostream.cfa

-              rb6a8b31
+              rd95969a
 forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
         ostype & ?|?( ostype & os, bool b ) {
                 if ( $sepPrt( os ) ) fmt( os, "%s", $sepGetCur( os ) );
 …
 // tuples
 forall( dtype ostype, otype T, ttype Params | writeable( T, ostype ) | { ostype & ?|?( ostype &, Params ); } ) {
+forall( ostype &, T, Params... | writeable( T, ostype ) | { ostype & ?|?( ostype &, Params ); } ) {
         ostype & ?|?( ostype & os, T arg, Params rest ) {
                 (ostype &)(os | arg);                                                   // print first argument
 …
 // writes the range [begin, end) to the given stream
 forall( dtype ostype, otype elt_type | writeable( elt_type, ostype ), otype iterator_type | iterator( iterator_type, elt_type ) ) {
+forall( ostype &, elt_type | writeable( elt_type, ostype ), iterator_type | iterator( iterator_type, elt_type ) ) {
         void write( iterator_type begin, iterator_type end, ostype & os ) {
                 void print( elt_type i ) { os | i; }
 …
 // Default prefix for non-decimal prints is 0b, 0, 0x.
 #define IntegralFMTImpl( T, IFMTNP, IFMTP ) \
 forall( dtype ostype | ostream( ostype ) ) { \
+forall( ostype & | ostream( ostype ) ) { \
         ostype & ?|?( ostype & os, _Ostream_Manip(T) f ) { \
                 if ( $sepPrt( os ) ) fmt( os, "%s", $sepGetCur( os ) ); \
 …
 // Default prefix for non-decimal prints is 0b, 0, 0x.
 #define IntegralFMTImpl128( T, SIGNED, CODE, IFMTNP, IFMTP ) \
 forall( dtype ostype | ostream( ostype ) ) \
+forall( ostype & | ostream( ostype ) ) \
 static void base10_128( ostype & os, _Ostream_Manip(T) f ) { \
         if ( f.val > UINT64_MAX ) { \
 …
         } /* if */ \
 } /* base10_128 */ \
 forall( dtype ostype | ostream( ostype ) ) { \
+forall( ostype & | ostream( ostype ) ) { \
         ostype & ?|?( ostype & os, _Ostream_Manip(T) f ) { \
                 if ( $sepPrt( os ) ) fmt( os, "%s", $sepGetCur( os ) ); \
 …
 #if defined( __SIZEOF_INT128__ )
 // Default prefix for non-decimal prints is 0b, 0, 0x.
 forall( dtype ostype | ostream( ostype ) )
+forall( ostype & | ostream( ostype ) )
 static inline void base_128( ostype & os, unsigned int128 val, unsigned int128 power, _Ostream_Manip(uint64_t) & f, unsigned int maxdig, unsigned int bits, unsigned int cnt = 0 ) {
         int wd = 1;                                                                                     // f.wd is never 0 because 0 implies left-pad
 …
 #define IntegralFMTImpl128( T ) \
 forall( dtype ostype | ostream( ostype ) ) { \
+forall( ostype & | ostream( ostype ) ) { \
         ostype & ?|?( ostype & os, _Ostream_Manip(T) f ) { \
                 _Ostream_Manip(uint64_t) fmt; \
 …
 #define FloatingPointFMTImpl( T, DFMTNP, DFMTP ) \
 forall( dtype ostype | ostream( ostype ) ) { \
+forall( ostype & | ostream( ostype ) ) { \
         ostype & ?|?( ostype & os, _Ostream_Manip(T) f ) { \
                 if ( $sepPrt( os ) ) fmt( os, "%s", $sepGetCur( os ) ); \
 …
 // *********************************** character ***********************************
 forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
         ostype & ?|?( ostype & os, _Ostream_Manip(char) f ) {
                 if ( f.base != 'c' ) {                                                  // bespoke binary/octal/hex format
 …
 // *********************************** C string ***********************************
 forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
         ostype & ?|?( ostype & os, _Ostream_Manip(const char *) f ) {
                 if ( ! f.val ) return os;                                               // null pointer ?
 …
 forall( dtype istype | istream( istype ) ) {
+forall( istype & | istream( istype ) ) {
         istype & ?|?( istype & is, bool & b ) {
                 char val[6];
 …
 // *********************************** manipulators ***********************************
 forall( dtype istype | istream( istype ) )
+forall( istype & | istream( istype ) )
 istype & ?|?( istype & is, _Istream_Cstr f ) {
         // skip xxx
 …
 } // ?|?
 forall( dtype istype | istream( istype ) )
+forall( istype & | istream( istype ) )
 istype & ?|?( istype & is, _Istream_Char f ) {
         fmt( is, "%*c" );                                                                       // argument variable unused
 …
 #define InputFMTImpl( T, CODE ) \
 forall( dtype istype | istream( istype ) ) \
+forall( istype & | istream( istype ) ) \
 istype & ?|?( istype & is, _Istream_Manip(T) f ) { \
         enum { size = 16 }; \
 …
 InputFMTImpl( long double, "Lf" )
 forall( dtype istype | istream( istype ) )
+forall( istype & | istream( istype ) )
 istype & ?|?( istype & is, _Istream_Manip(float _Complex) fc ) {
         float re, im;
 …
 } // ?|?
 forall( dtype istype | istream( istype ) )
+forall( istype & | istream( istype ) )
 istype & ?|?( istype & is, _Istream_Manip(double _Complex) dc ) {
         double re, im;
 …
 } // ?|?
 forall( dtype istype | istream( istype ) )
+forall( istype & | istream( istype ) )
 istype & ?|?( istype & is, _Istream_Manip(long double _Complex) ldc ) {
         long double re, im;

libcfa/src/iostream.hfa

-              rb6a8b31
+              rd95969a
 trait ostream( dtype ostype ) {
+trait ostream( ostype & ) {
         // private
         bool $sepPrt( ostype & );                                                       // get separator state (on/off)
 …
 }; // ostream
 // trait writeable( otype T ) {
 //      forall( dtype ostype | ostream( ostype ) ) ostype & ?|?( ostype &, T );
+// trait writeable( T ) {
+//      forall( ostype & | ostream( ostype ) ) ostype & ?|?( ostype &, T );
 // }; // writeable
 trait writeable( otype T, dtype ostype | ostream( ostype ) ) {
+trait writeable( T, ostype & | ostream( ostype ) ) {
         ostype & ?|?( ostype &, T );
 }; // writeable
 …
 // implement writable for intrinsic types
 forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
         ostype & ?|?( ostype &, bool );
         void ?|?( ostype &, bool );
 …
 // tuples
 forall( dtype ostype, otype T, ttype Params | writeable( T, ostype ) | { ostype & ?|?( ostype &, Params ); } ) {
+forall( ostype &, T, Params... | writeable( T, ostype ) | { ostype & ?|?( ostype &, Params ); } ) {
         ostype & ?|?( ostype & os, T arg, Params rest );
         void ?|?( ostype & os, T arg, Params rest );
 …
 // writes the range [begin, end) to the given stream
 forall( dtype ostype, otype elt_type | writeable( elt_type, ostype ), otype iterator_type | iterator( iterator_type, elt_type ) ) {
+forall( ostype &, elt_type | writeable( elt_type, ostype ), iterator_type | iterator( iterator_type, elt_type ) ) {
         void write( iterator_type begin, iterator_type end, ostype & os );
         void write_reverse( iterator_type begin, iterator_type end, ostype & os );
 …
 // *********************************** manipulators ***********************************
 forall( otype T )
+forall( T )
 struct _Ostream_Manip {
         T val;                                                                                          // polymorphic base-type
 …
         _Ostream_Manip(T) & sign( _Ostream_Manip(T) & fmt ) { fmt.flags.sign = true; return fmt; } \
 } /* distribution */ \
 forall( dtype ostype | ostream( ostype ) ) { \
+forall( ostype & | ostream( ostype ) ) { \
         ostype & ?|?( ostype & os, _Ostream_Manip(T) f ); \
         void ?|?( ostype & os, _Ostream_Manip(T) f ); \
 …
         _Ostream_Manip(T) & nodp( _Ostream_Manip(T) & fmt ) { fmt.flags.nobsdp = true; return fmt; } \
 } /* distribution */ \
 forall( dtype ostype | ostream( ostype ) ) { \
+forall( ostype & | ostream( ostype ) ) { \
         ostype & ?|?( ostype & os, _Ostream_Manip(T) f ); \
         void ?|?( ostype & os, _Ostream_Manip(T) f ); \
 …
         _Ostream_Manip(char) & nobase( _Ostream_Manip(char) & fmt ) { fmt.flags.nobsdp = true; return fmt; }
 } // distribution
 forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
         ostype & ?|?( ostype & os, _Ostream_Manip(char) f );
         void ?|?( ostype & os, _Ostream_Manip(char) f );
 …
         _Ostream_Manip(const char *) & nobase( _Ostream_Manip(const char *) & fmt ) { fmt.flags.nobsdp = true; return fmt; }
 } // distribution
 forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
         ostype & ?|?( ostype & os, _Ostream_Manip(const char *) f );
         void ?|?( ostype & os, _Ostream_Manip(const char *) f );
 …
 trait istream( dtype istype ) {
+trait istream( istype & ) {
         void nlOn( istype & );                                                          // read newline
         void nlOff( istype & );                                                         // scan newline
 …
 }; // istream
 trait readable( otype T ) {
         forall( dtype istype | istream( istype ) ) istype & ?|?( istype &, T );
+trait readable( T ) {
+        forall( istype & | istream( istype ) ) istype & ?|?( istype &, T );
 }; // readable
 forall( dtype istype | istream( istype ) ) {
+forall( istype & | istream( istype ) ) {
         istype & ?|?( istype &, bool & );
 …
         _Istream_Cstr & wdi( unsigned int w, _Istream_Cstr & fmt ) { fmt.wd = w; return fmt; }
 } // distribution
 forall( dtype istype | istream( istype ) ) istype & ?|?( istype & is, _Istream_Cstr f );
+forall( istype & | istream( istype ) ) istype & ?|?( istype & is, _Istream_Cstr f );
 struct _Istream_Char {
 …
         _Istream_Char & ignore( _Istream_Char & fmt ) { fmt.ignore = true; return fmt; }
 } // distribution
 forall( dtype istype | istream( istype ) ) istype & ?|?( istype & is, _Istream_Char f );
 forall( dtype T | sized( T ) )
+forall( istype & | istream( istype ) ) istype & ?|?( istype & is, _Istream_Char f );
+forall( T & | sized( T ) )
 struct _Istream_Manip {
         T & val;                                                                                        // polymorphic base-type
 …
         _Istream_Manip(T) & wdi( unsigned int w, _Istream_Manip(T) & fmt ) { fmt.wd = w; return fmt; } \
 } /* distribution */ \
 forall( dtype istype | istream( istype ) ) { \
+forall( istype & | istream( istype ) ) { \
         istype & ?|?( istype & is, _Istream_Manip(T) f ); \
 } // ?|?
 …
 #include <time_t.hfa>                                                                   // Duration (constructors) / Time (constructors)
 forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
         ostype & ?|?( ostype & os, Duration dur );
         void ?|?( ostype & os, Duration dur );

libcfa/src/iterator.cfa

-              rb6a8b31
+              rd95969a
 #include "iterator.hfa"
 forall( otype iterator_type, otype elt_type | iterator( iterator_type, elt_type ) )
+forall( iterator_type, elt_type | iterator( iterator_type, elt_type ) )
 void for_each( iterator_type begin, iterator_type end, void (* func)( elt_type ) ) {
         for ( iterator_type i = begin; i != end; ++i ) {
 …
 } // for_each
 forall( otype iterator_type, otype elt_type | iterator( iterator_type, elt_type ) )
+forall( iterator_type, elt_type | iterator( iterator_type, elt_type ) )
 void for_each_reverse( iterator_type begin, iterator_type end, void (* func)( elt_type ) ) {
         for ( iterator_type i = end; i != begin; ) {

libcfa/src/iterator.hfa

-              rb6a8b31
+              rd95969a
 // An iterator can be used to traverse a data structure.
 trait iterator( otype iterator_type, otype elt_type ) {
+trait iterator( iterator_type, elt_type ) {
         // point to the next element
 //      iterator_type ?++( iterator_type & );
 …
 };
 trait iterator_for( otype iterator_type, otype collection_type, otype elt_type | iterator( iterator_type, elt_type ) ) {
+trait iterator_for( iterator_type, collection_type, elt_type | iterator( iterator_type, elt_type ) ) {
 //      [ iterator_type begin, iterator_type end ] get_iterators( collection_type );
         iterator_type begin( collection_type );
 …
 };
 forall( otype iterator_type, otype elt_type | iterator( iterator_type, elt_type ) )
+forall( iterator_type, elt_type | iterator( iterator_type, elt_type ) )
 void for_each( iterator_type begin, iterator_type end, void (* func)( elt_type ) );
 forall( otype iterator_type, otype elt_type | iterator( iterator_type, elt_type ) )
+forall( iterator_type, elt_type | iterator( iterator_type, elt_type ) )
 void for_each_reverse( iterator_type begin, iterator_type end, void (* func)( elt_type ) );

libcfa/src/math.hfa

-              rb6a8b31
+              rd95969a
         unsigned long long int floor( unsigned long long int n, unsigned long long int align ) { return n / align * align; }
         // forall( otype T | { T ?/?( T, T ); T ?*?( T, T ); } )
+        // forall( T | { T ?/?( T, T ); T ?*?( T, T ); } )
         // T floor( T n, T align ) { return n / align * align; }
 …
         unsigned long long int ceiling_div( unsigned long long int n, unsigned long long int align ) { return (n + (align - 1)) / align; }
         // forall( otype T | { T ?+?( T, T ); T ?-?( T, T ); T ?%?( T, T ); } )
+        // forall( T | { T ?+?( T, T ); T ?-?( T, T ); T ?%?( T, T ); } )
         // T ceiling_div( T n, T align ) { verify( is_pow2( align ) );return (n + (align - 1)) / align; }
 …
         unsigned long long int ceiling( unsigned long long int n, unsigned long long int align ) { return floor( n + (n % align != 0 ? align - 1 : 0), align ); }
         // forall( otype T | { void ?{}( T &, one_t ); T ?+?( T, T ); T ?-?( T, T ); T ?/?( T, T ); } )
+        // forall( T | { void ?{}( T &, one_t ); T ?+?( T, T ); T ?-?( T, T ); T ?/?( T, T ); } )
         // T ceiling( T n, T align ) { return floor( n + (n % align != 0 ? align - 1 : 0), align ); }
 …
 static inline {
         forall( otype T | { void ?{}( T &, one_t ); T ?+?( T, T ); T ?-?( T, T );T ?*?( T, T ); } )
+        forall( T | { void ?{}( T &, one_t ); T ?+?( T, T ); T ?-?( T, T );T ?*?( T, T ); } )
         T lerp( T x, T y, T a ) { return x * ((T){1} - a) + y * a; }
         forall( otype T | { void ?{}( T &, zero_t ); void ?{}( T &, one_t ); int ?<?( T, T ); } )
+        forall( T | { void ?{}( T &, zero_t ); void ?{}( T &, one_t ); int ?<?( T, T ); } )
         T step( T edge, T x ) { return x < edge ? (T){0} : (T){1}; }
         forall( otype T | { void ?{}( T &, int ); T clamp( T, T, T ); T ?-?( T, T ); T ?*?( T, T ); T ?/?( T, T ); } )
+        forall( T | { void ?{}( T &, int ); T clamp( T, T, T ); T ?-?( T, T ); T ?*?( T, T ); T ?/?( T, T ); } )
         T smoothstep( T edge0, T edge1, T x ) { T t = clamp( (x - edge0) / (edge1 - edge0), (T){0}, (T){1} ); return t * t * ((T){3} - (T){2} * t); }
 } // distribution

libcfa/src/memory.cfa

-              rb6a8b31
+              rd95969a
 // Internal data object.
 forall(dtype T | sized(T), ttype Args | { void ?{}(T &, Args); })
+forall(T & | sized(T), Args... | { void ?{}(T &, Args); })
 void ?{}(counter_data(T) & this, Args args) {
         (this.counter){1};
 …
+}
 forall(dtype T | sized(T) | { void ^?{}(T &); })
+forall(T & | sized(T) | { void ^?{}(T &); })
 void ^?{}(counter_data(T) & this) {
         assert(0 == this.counter);
 …
 // This is one of many pointers keeping this alive.
 forall(dtype T | sized(T))
+forall(T & | sized(T))
 void ?{}(counter_ptr(T) & this) {
         this.data = 0p;
+}
 forall(dtype T | sized(T))
+forall(T & | sized(T))
 void ?{}(counter_ptr(T) & this, zero_t) {
         this.data = 0p;
+}
 forall(dtype T | sized(T) | { void ^?{}(T &); })
+forall(T & | sized(T) | { void ^?{}(T &); })
 static void internal_decrement(counter_ptr(T) & this) {
         if (this.data && 0 == --this.data->counter) {
 …
+}
 forall(dtype T | sized(T))
+forall(T & | sized(T))
 static void internal_copy(counter_ptr(T) & this, counter_ptr(T) & that) {
         this.data = that.data;
 …
+}
 forall(dtype T | sized(T) | { void ^?{}(T &); })
+forall(T & | sized(T) | { void ^?{}(T &); })
 void ?{}(counter_ptr(T) & this, counter_ptr(T) that) {
         // `that` is a copy but it should have neither a constructor
 …
+}
 forall(dtype T | sized(T), ttype Args | { void ?{}(T&, Args); })
+forall(T & | sized(T), Args... | { void ?{}(T&, Args); })
 void ?{}(counter_ptr(T) & this, Args args) {
         this.data = (counter_data(T)*)new(args);
+}
 forall(dtype T | sized(T) | { void ^?{}(T &); })
+forall(T & | sized(T) | { void ^?{}(T &); })
 void ^?{}(counter_ptr(T) & this) {
         internal_decrement(this);
+}
 forall(dtype T | sized(T))
+forall(T & | sized(T))
 T & *?(counter_ptr(T) & this) {
         return *((this.data) ? &this.data->object : 0p);
+}
 forall(dtype T | sized(T) | { void ^?{}(T &); })
+forall(T & | sized(T) | { void ^?{}(T &); })
 void ?=?(counter_ptr(T) & this, counter_ptr(T) that) {
         if (this.data != that.data) {
 …
+}
 forall(dtype T | sized(T) | { void ^?{}(T &); })
+forall(T & | sized(T) | { void ^?{}(T &); })
 void ?=?(counter_ptr(T) & this, zero_t) {
         internal_decrement(this);
 …
+}
 forall(dtype T | sized(T))
+forall(T & | sized(T))
 int ?==?(counter_ptr(T) const & this, counter_ptr(T) const & that) {
         return this.data == that.data;
+}
 forall(dtype T | sized(T))
+forall(T & | sized(T))
 int ?!=?(counter_ptr(T) const & this, counter_ptr(T) const & that) {
         return !?==?(this, that);
+}
 forall(dtype T | sized(T))
+forall(T & | sized(T))
 int ?==?(counter_ptr(T) const & this, zero_t) {
         return this.data == 0;
+}
 forall(dtype T | sized(T))
+forall(T & | sized(T))
 int ?!=?(counter_ptr(T) const & this, zero_t) {
         return !?==?(this, (zero_t)0);
 …
 // This is the only pointer that keeps this alive.
 forall(dtype T)
+forall(T &)
 void ?{}(unique_ptr(T) & this) {
         this.data = 0p;
+}
 forall(dtype T)
+forall(T &)
 void ?{}(unique_ptr(T) & this, zero_t) {
         this.data = 0p;
+}
 forall(dtype T | sized(T), ttype Args | { void ?{}(T &, Args); })
+forall(T & | sized(T), Args... | { void ?{}(T &, Args); })
 void ?{}(unique_ptr(T) & this, Args args) {
         this.data = (T *)new(args);
+}
 forall(dtype T | { void ^?{}(T &); })
+forall(T & | { void ^?{}(T &); })
 void ^?{}(unique_ptr(T) & this) {
         delete(this.data);
+}
 forall(dtype T)
+forall(T &)
 T & *?(unique_ptr(T) & this) {
         return *this.data;
+}
 forall(dtype T | { void ^?{}(T &); })
+forall(T & | { void ^?{}(T &); })
 void ?=?(unique_ptr(T) & this, zero_t) {
         delete(this.data);
 …
+}
 forall(dtype T | { void ^?{}(T &); })
+forall(T & | { void ^?{}(T &); })
 void move(unique_ptr(T) & this, unique_ptr(T) & that) {
         delete(this.data);
 …
+}
 forall(dtype T)
+forall(T &)
 int ?==?(unique_ptr(T) const & this, unique_ptr(T) const & that) {
         return this.data == that.data;
+}
 forall(dtype T)
+forall(T &)
 int ?!=?(unique_ptr(T) const & this, unique_ptr(T) const & that) {
         return !?==?(this, that);
+}
 forall(dtype T)
+forall(T &)
 int ?==?(unique_ptr(T) const & this, zero_t) {
         return this.data == 0;
+}
 forall(dtype T)
+forall(T &)
 int ?!=?(unique_ptr(T) const & this, zero_t) {
         return !?==?(this, (zero_t)0);

libcfa/src/memory.hfa

-              rb6a8b31
+              rd95969a
 // Internal data object.
 forall(dtype T | sized(T)) {
+forall(T & | sized(T)) {
         struct counter_data {
                 unsigned int counter;
 …
         };
         forall(ttype Args | { void ?{}(T &, Args); })
+        forall(Args... | { void ?{}(T &, Args); })
         void ?{}(counter_data(T) & this, Args args);
 …
 // This is one of many pointers keeping this alive.
 forall(dtype T | sized(T)) {
+forall(T & | sized(T)) {
         struct counter_ptr {
                 counter_data(T) * data;
 …
         forall( | { void ^?{}(T &); })
         void ?{}(counter_ptr(T) & this, counter_ptr(T) that);
         forall(ttype Args | { void ?{}(T&, Args); })
+        forall(Args... | { void ?{}(T&, Args); })
         void ?{}(counter_ptr(T) & this, Args args);
 …
 // This is the only pointer that keeps this alive.
 forall(dtype T) {
+forall(T &) {
         struct unique_ptr {
                 T * data;
 …
         void ?{}(unique_ptr(T) & this, zero_t);
         void ?{}(unique_ptr(T) & this, unique_ptr(T) that) = void;
         forall( | sized(T), ttype Args | { void ?{}(T &, Args); })
+        forall( | sized(T), Args... | { void ?{}(T &, Args); })
         void ?{}(unique_ptr(T) & this, Args args);

libcfa/src/parseargs.cfa

-              rb6a8b31
+              rd95969a
 static void usage(char * cmd, cfa_option options[], size_t opt_count, const char * usage, FILE * out)  __attribute__ ((noreturn));
+//-----------------------------------------------------------------------------
+// checking
+// Validate the option table before parsing.
+// Aborts if two distinct entries share the same non-'\0' short name, or if two
+// distinct entries that both have a long name share the same long name.
+//   options   : array of option descriptors to check
+//   opt_count : number of entries in options
+static void check_args(cfa_option options[], size_t opt_count) {
+        for(i; opt_count) {
+                for(j; opt_count) {
+                        if(i == j) continue;
+                        // '\0' marks "no short name", so it can never clash
+                        if( options[i].short_name != '\0'
+                        && options[i].short_name == options[j].short_name)
+                                abort("Parse Args error: two options have short name '%c' (%zu & %zu)", options[i].short_name, i, j);
+                        // long_name may be null (parse_args tests it before use); strcmp on a null pointer is undefined
+                        if( options[i].long_name && options[j].long_name
+                        && 0 == strcmp(options[i].long_name, options[j].long_name))
+                                abort("Parse Args error: two options have long name '%s' (%zu & %zu)", options[i].long_name, i, j);
+                }
+        }
+}
+//-----------------------------------------------------------------------------
+// Parsing args
 void parse_args( cfa_option options[], size_t opt_count, const char * usage, char ** & left ) {
         if( 0p != &cfa_args_argc ) {
 …
+}
-//-----------------------------------------------------------------------------
-// getopt_long wrapping
 void parse_args(
         int argc,
 …
         char ** & left
 ) {
+        check_args(options, opt_count);
+        int maxv = 'h';
+        char optstring[opt_count * 3] = { '\0' };
+        {
+                int idx = 0;
+                for(i; opt_count) {
+                        if (options[i].short_name) {
+                                maxv = max(options[i].short_name, maxv);
+                                optstring[idx] = options[i].short_name;
+                                idx++;
+                                if(    ((intptr_t)options[i].parse) != ((intptr_t)parse_settrue)
+                                && ((intptr_t)options[i].parse) != ((intptr_t)parse_setfalse) ) {
+                                        optstring[idx] = ':';
+                                        idx++;
+                                }
+                        }
+                }
+                optstring[idx+0] = 'h';
+                optstring[idx+1] = '\0';
+        }
         struct option optarr[opt_count + 2];
+        {
 …
                 for(i; opt_count) {
                         if(options[i].long_name) {
+                                options[i].val = (options[i].short_name != '\0') ? ((int)options[i].short_name) : ++maxv;
                                 optarr[idx].name = options[i].long_name;
                                 optarr[idx].flag = 0p;
                                 optarr[idx].val  = options[i].short_name;
+                                optarr[idx].val  = options[i].val;
                                 if(    ((intptr_t)options[i].parse) == ((intptr_t)parse_settrue)
                                     || ((intptr_t)options[i].parse) == ((intptr_t)parse_setfalse) ) {
 …
                 optarr[idx+0].[name, has_arg, flag, val] = ["help", no_argument, 0, 'h'];
                 optarr[idx+1].[name, has_arg, flag, val] = [0, no_argument, 0, 0];
+        }
-        char optstring[opt_count * 3] = { '\0' };
+        {
-                int idx = 0;
-                for(i; opt_count) {
-                        optstring[idx] = options[i].short_name;
-                        idx++;
-                        if(    ((intptr_t)options[i].parse) != ((intptr_t)parse_settrue)
-                            && ((intptr_t)options[i].parse) != ((intptr_t)parse_setfalse) ) {
-                                optstring[idx] = ':';
-                                idx++;
+                        }
+                }
-                optstring[idx+0] = 'h';
-                optstring[idx+1] = '\0';
+        }
 …
                         default:
                                 for(i; opt_count) {
                                         if(opt == options[i].short_name) {
+                                        if(opt == options[i].val) {
                                                 const char * arg = optarg ? optarg : "";
                                                 if( arg[0] == '=' ) { arg++; }
 …
         if(hwidth <= 0) hwidth = max;
+        fprintf(out, "  -%c, --%-*s   %.*s\n", sn, width, ln, hwidth, help);
+        char sname[4] = { ' ', ' ', ' ', '\0' };
+        if(sn != '\0') {
+                sname[0] = '-';
+                sname[1] = sn;
+                sname[2] = ',';
+        }
+        fprintf(out, "  %s --%-*s   %.*s\n", sname, width, ln, hwidth, help);
         for() {
                 help += min(strlen(help), hwidth);

libcfa/src/parseargs.hfa

-              rb6a8b31
+              rd95969a
 struct cfa_option {
+      int val; // reserved
       char short_name;
       const char * long_name;
 …
 static inline void ?{}( cfa_option & this ) {}
 forall(dtype T | { bool parse(const char *, T & ); })
+forall(T & | { bool parse(const char *, T & ); })
 static inline void ?{}( cfa_option & this, char short_name, const char * long_name, const char * help, T & variable ) {
+      this.val        = 0;
       this.short_name = short_name;
       this.long_name  = long_name;
 …
+}
 forall(dtype T)
+forall(T &)
 static inline void ?{}( cfa_option & this, char short_name, const char * long_name, const char * help, T & variable, bool (*parse)(const char *, T & )) {
+      this.val        = 0;
       this.short_name = short_name;
       this.long_name  = long_name;

libcfa/src/rational.cfa

-              rb6a8b31
+              rd95969a
 #include "stdlib.hfa"
 forall( otype RationalImpl | arithmetic( RationalImpl ) ) {
+forall( RationalImpl | arithmetic( RationalImpl ) ) {
         // helper routines
 …
         // I/O
         forall( dtype istype | istream( istype ) | { istype & ?|?( istype &, RationalImpl & ); } )
+        forall( istype & | istream( istype ) | { istype & ?|?( istype &, RationalImpl & ); } )
         istype & ?|?( istype & is, Rational(RationalImpl) & r ) {
                 is | r.numerator | r.denominator;
 …
         } // ?|?
         forall( dtype ostype | ostream( ostype ) | { ostype & ?|?( ostype &, RationalImpl ); } ) {
+        forall( ostype & | ostream( ostype ) | { ostype & ?|?( ostype &, RationalImpl ); } ) {
                 ostype & ?|?( ostype & os, Rational(RationalImpl) r ) {
                         return os | r.numerator | '/' | r.denominator;
 …
 } // distribution
 forall( otype RationalImpl | arithmetic( RationalImpl ) | { RationalImpl ?\?( RationalImpl, unsigned long ); } )
+forall( RationalImpl | arithmetic( RationalImpl ) | { RationalImpl ?\?( RationalImpl, unsigned long ); } )
 Rational(RationalImpl) ?\?( Rational(RationalImpl) x, long int y ) {
         if ( y < 0 ) {
 …
 // conversion
 forall( otype RationalImpl | arithmetic( RationalImpl ) | { double convert( RationalImpl ); } )
+forall( RationalImpl | arithmetic( RationalImpl ) | { double convert( RationalImpl ); } )
 double widen( Rational(RationalImpl) r ) {
         return convert( r.numerator ) / convert( r.denominator );
 } // widen
 forall( otype RationalImpl | arithmetic( RationalImpl ) | { double convert( RationalImpl ); RationalImpl convert( double ); } )
+forall( RationalImpl | arithmetic( RationalImpl ) | { double convert( RationalImpl ); RationalImpl convert( double ); } )
 Rational(RationalImpl) narrow( double f, RationalImpl md ) {
         // http://www.ics.uci.edu/~eppstein/numth/frap.c

libcfa/src/rational.hfa

-              rb6a8b31
+              rd95969a
 #include "iostream.hfa"
 trait scalar( otype T ) {
+trait scalar( T ) {
 };
 trait arithmetic( otype T | scalar( T ) ) {
+trait arithmetic( T | scalar( T ) ) {
         int !?( T );
         int ?==?( T, T );
 …
 // implementation
 forall( otype RationalImpl | arithmetic( RationalImpl ) ) {
+forall( RationalImpl | arithmetic( RationalImpl ) ) {
         struct Rational {
                 RationalImpl numerator, denominator;                    // invariant: denominator > 0
 …
         // I/O
         forall( dtype istype | istream( istype ) | { istype & ?|?( istype &, RationalImpl & ); } )
+        forall( istype & | istream( istype ) | { istype & ?|?( istype &, RationalImpl & ); } )
         istype & ?|?( istype &, Rational(RationalImpl) & );
         forall( dtype ostype | ostream( ostype ) | { ostype & ?|?( ostype &, RationalImpl ); } ) {
+        forall( ostype & | ostream( ostype ) | { ostype & ?|?( ostype &, RationalImpl ); } ) {
                 ostype & ?|?( ostype &, Rational(RationalImpl) );
                 void ?|?( ostype &, Rational(RationalImpl) );
 …
 } // distribution
 forall( otype RationalImpl | arithmetic( RationalImpl ) |{RationalImpl ?\?( RationalImpl, unsigned long );} )
+forall( RationalImpl | arithmetic( RationalImpl ) |{RationalImpl ?\?( RationalImpl, unsigned long );} )
 Rational(RationalImpl) ?\?( Rational(RationalImpl) x, long int y );
 // conversion
 forall( otype RationalImpl | arithmetic( RationalImpl ) | { double convert( RationalImpl ); } )
+forall( RationalImpl | arithmetic( RationalImpl ) | { double convert( RationalImpl ); } )
 double widen( Rational(RationalImpl) r );
 forall( otype RationalImpl | arithmetic( RationalImpl ) | { double convert( RationalImpl );  RationalImpl convert( double );} )
+forall( RationalImpl | arithmetic( RationalImpl ) | { double convert( RationalImpl );  RationalImpl convert( double );} )
 Rational(RationalImpl) narrow( double f, RationalImpl md );

libcfa/src/stdlib.cfa

-              rb6a8b31
+              rd95969a
 // Cforall allocation/deallocation and constructor/destructor, array types
 forall( dtype T | sized(T), ttype TT | { void ?{}( T &, TT ); } )
+forall( T & | sized(T), TT... | { void ?{}( T &, TT ); } )
 T * anew( size_t dim, TT p ) {
         T * arr = alloc( dim );
 …
 } // anew
 forall( dtype T | sized(T) | { void ^?{}( T & ); } )
+forall( T & | sized(T) | { void ^?{}( T & ); } )
 void adelete( T arr[] ) {
         if ( arr ) {                                                                            // ignore null
 …
 } // adelete
 forall( dtype T | sized(T) | { void ^?{}( T & ); }, ttype TT | { void adelete( TT ); } )
+forall( T & | sized(T) | { void ^?{}( T & ); }, TT... | { void adelete( TT ); } )
 void adelete( T arr[], TT rest ) {
         if ( arr ) {                                                                            // ignore null
 …
 //---------------------------------------
 forall( otype E | { int ?<?( E, E ); } ) {
+forall( E | { int ?<?( E, E ); } ) {
         E * bsearch( E key, const E * vals, size_t dim ) {
                 int cmp( const void * t1, const void * t2 ) {
 …
 forall( otype K, otype E | { int ?<?( K, K ); K getKey( const E & ); } ) {
+forall( K, E | { int ?<?( K, K ); K getKey( const E & ); } ) {
         E * bsearch( K key, const E * vals, size_t dim ) {
                 int cmp( const void * t1, const void * t2 ) {

libcfa/src/stdlib.hfa

-              rb6a8b31
+              rd95969a
 // Created On       : Thu Jan 28 17:12:35 2016
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Sat Dec 12 13:52:34 2020
 // Update Count     : 536
+// Last Modified On : Thu Jan 21 22:02:13 2021
+// Update Count     : 574
 //
 …
         else return (T *)alignment( _Alignof(T), dim, sizeof(T) )
 static inline forall( dtype T | sized(T) ) {
+static inline forall( T & | sized(T) ) {
         // CFA safe equivalents, i.e., implicit size specification
 …
 . Replace the current forall-block that contains definitions of S_fill and S_realloc with the following:
                 forall( dtype T | sized(T) ) {
+                forall( T & | sized(T) ) {
                         union  U_fill           { char c; T * a; T t; };
                         struct S_fill           { char tag; U_fill(T) fill; };
 …
 typedef struct S_resize                 { inline void *;  }     T_resize;
 forall( dtype T ) {
+forall( T & ) {
         struct S_fill           { char tag; char c; size_t size; T * at; char t[50]; };
         struct S_realloc        { inline T *; };
 …
 static inline T_resize  ?`resize  ( void * a )  { return (T_resize){a}; }
 static inline forall( dtype T | sized(T) ) {
+static inline forall( T & | sized(T) ) {
         S_fill(T) ?`fill ( T t ) {
                 S_fill(T) ret = { 't' };
                 size_t size = sizeof(T);
+                if(size > sizeof(ret.t)) { printf("ERROR: const object of size greater than 50 bytes given for dynamic memory fill\n"); exit(1); }
+                if ( size > sizeof(ret.t) ) {
+                        abort( "ERROR: const object of size greater than 50 bytes given for dynamic memory fill\n" );
+                } // if
                 memcpy( &ret.t, &t, size );
                 return ret;
 …
         S_realloc(T)    ?`realloc ( T * a )                             { return (S_realloc(T)){a}; }
         T * $alloc_internal( void * Resize, T * Realloc, size_t Align, size_t Dim, S_fill(T) Fill) {
+        T * $alloc_internal( void * Resize, T * Realloc, size_t Align, size_t Dim, S_fill(T) Fill ) {
                 T * ptr = NULL;
                 size_t size = sizeof(T);
 …
                         ptr = (T*) (void *) resize( (void *)Resize, Align, Dim * size );
                 } else if ( Realloc ) {
                         if (Fill.tag != '0') copy_end = min(malloc_size( Realloc ), Dim * size);
                         ptr = (T*) (void *) realloc( (void *)Realloc, Align, Dim * size );
+                        if ( Fill.tag != '0' ) copy_end = min(malloc_size( Realloc ), Dim * size );
+                        ptr = (T *) (void *) realloc( (void *)Realloc, Align, Dim * size );
                 } else {
                         ptr = (T*) (void *) memalign( Align, Dim * size );
+                }
                 if(Fill.tag == 'c') {
+                        ptr = (T *) (void *) memalign( Align, Dim * size );
+                }
+                if ( Fill.tag == 'c' ) {
                         memset( (char *)ptr + copy_end, (int)Fill.c, Dim * size - copy_end );
                 } else if(Fill.tag == 't') {
+                } else if ( Fill.tag == 't' ) {
                         for ( int i = copy_end; i < Dim * size; i += size ) {
+                                #pragma GCC diagnostic push
+                                #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+                                assert( size <= sizeof(Fill.t) );
                                 memcpy( (char *)ptr + i, &Fill.t, size );
+                                #pragma GCC diagnostic pop
+                        }
                 } else if(Fill.tag == 'a') {
+                } else if ( Fill.tag == 'a' ) {
                         memcpy( (char *)ptr + copy_end, Fill.at, min(Dim * size - copy_end, Fill.size) );
+                } else if(Fill.tag == 'T') {
+                        for ( int i = copy_end; i < Dim * size; i += size ) {
+                                memcpy( (char *)ptr + i, Fill.at, size );
+                        }
+                } else if ( Fill.tag == 'T' ) {
+                        memcpy( (char *)ptr + copy_end, Fill.at, Dim * size );
+                }
 …
         } // $alloc_internal
         forall( ttype TT | { T * $alloc_internal( void *, T *, size_t, size_t, S_fill(T), TT ); } ) {
+        forall( TT... | { T * $alloc_internal( void *, T *, size_t, size_t, S_fill(T), TT ); } ) {
                 T * $alloc_internal( void *       , T * Realloc, size_t Align, size_t Dim, S_fill(T) Fill, T_resize Resize, TT rest) {
 …
 } // distribution T
 static inline forall( dtype T | sized(T) ) {
+static inline forall( T & | sized(T) ) {
         // CFA safe initialization/copy, i.e., implicit size specification, non-array types
         T * memset( T * dest, char fill ) {
 …
 // CFA deallocation for multiple objects
 static inline forall( dtype T )                                                 // FIX ME, problems with 0p in list
+static inline forall( T & )                                                     // FIX ME, problems with 0p in list
 void free( T * ptr ) {
         free( (void *)ptr );                                                            // C free
 } // free
 static inline forall( dtype T, ttype TT | { void free( TT ); } )
+static inline forall( T &, TT... | { void free( TT ); } )
 void free( T * ptr, TT rest ) {
         free( ptr );
 …
 // CFA allocation/deallocation and constructor/destructor, non-array types
 static inline forall( dtype T | sized(T), ttype TT | { void ?{}( T &, TT ); } )
+static inline forall( T & | sized(T), TT... | { void ?{}( T &, TT ); } )
 T * new( TT p ) {
         return &(*(T *)malloc()){ p };                                                  // run constructor
+        return &(*(T *)malloc()){ p };                                          // run constructor
 } // new
 static inline forall( dtype T | { void ^?{}( T & ); } )
+static inline forall( T & | { void ^?{}( T & ); } )
 void delete( T * ptr ) {
         // special case for 0-sized object => always call destructor
 …
         free( ptr );                                                                            // always call free
 } // delete
 static inline forall( dtype T, ttype TT | { void ^?{}( T & ); void delete( TT ); } )
+static inline forall( T &, TT... | { void ^?{}( T & ); void delete( TT ); } )
 void delete( T * ptr, TT rest ) {
         delete( ptr );
 …
 // CFA allocation/deallocation and constructor/destructor, array types
 forall( dtype T | sized(T), ttype TT | { void ?{}( T &, TT ); } ) T * anew( size_t dim, TT p );
 forall( dtype T | sized(T) | { void ^?{}( T & ); } ) void adelete( T arr[] );
 forall( dtype T | sized(T) | { void ^?{}( T & ); }, ttype TT | { void adelete( TT ); } ) void adelete( T arr[], TT rest );
+forall( T & | sized(T), TT... | { void ?{}( T &, TT ); } ) T * anew( size_t dim, TT p );
+forall( T & | sized(T) | { void ^?{}( T & ); } ) void adelete( T arr[] );
+forall( T & | sized(T) | { void ^?{}( T & ); }, TT... | { void adelete( TT ); } ) void adelete( T arr[], TT rest );
 //---------------------------------------
 …
 //---------------------------------------
 forall( otype E | { int ?<?( E, E ); } ) {
+forall( E | { int ?<?( E, E ); } ) {
         E * bsearch( E key, const E * vals, size_t dim );
         size_t bsearch( E key, const E * vals, size_t dim );
 …
 } // distribution
 forall( otype K, otype E | { int ?<?( K, K ); K getKey( const E & ); } ) {
+forall( K, E | { int ?<?( K, K ); K getKey( const E & ); } ) {
         E * bsearch( K key, const E * vals, size_t dim );
         size_t bsearch( K key, const E * vals, size_t dim );
 …
 } // distribution
 forall( otype E | { int ?<?( E, E ); } ) {
+forall( E | { int ?<?( E, E ); } ) {
         void qsort( E * vals, size_t dim );
 } // distribution

libcfa/src/time.cfa

-              rb6a8b31
+              rd95969a
 forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
         ostype & ?|?( ostype & os, Duration dur ) with( dur ) {
                 (ostype &)(os | tn / TIMEGRAN);                                 // print seconds
 …
 } // strftime
 forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
         ostype & ?|?( ostype & os, Time time ) with( time ) {
                 char buf[32];                                                                   // at least 26

libcfa/src/vec/vec.hfa

-              rb6a8b31
+              rd95969a
 #include <math.hfa>
 trait fromint(otype T) {
+trait fromint(T) {
     void ?{}(T&, int);
 };
 trait zeroinit(otype T) {
+trait zeroinit(T) {
     void ?{}(T&, zero_t);
 };
 trait zero_assign(otype T) {
+trait zero_assign(T) {
     T ?=?(T&, zero_t);
 };
 trait subtract(otype T) {
+trait subtract(T) {
     T ?-?(T, T);
 };
 trait negate(otype T) {
+trait negate(T) {
     T -?(T);
 };
 trait add(otype T) {
+trait add(T) {
     T ?+?(T, T);
 };
 trait multiply(otype T) {
+trait multiply(T) {
     T ?*?(T, T);
 };
 trait divide(otype T) {
+trait divide(T) {
     T ?/?(T, T);
 };
 trait lessthan(otype T) {
+trait lessthan(T) {
     int ?<?(T, T);
 };
 trait equality(otype T) {
+trait equality(T) {
     int ?==?(T, T);
 };
 trait sqrt(otype T) {
+trait sqrt(T) {
     T sqrt(T);
 };
 …
+}
 trait dottable(otype V, otype T) {
+trait dottable(V, T) {
     T dot(V, V);
 };
 …
 static inline {
 forall(otype T | sqrt(T), otype V | dottable(V, T))
+forall(T | sqrt(T), V | dottable(V, T))
 T length(V v) {
    return sqrt(dot(v, v));
+}
 forall(otype T, otype V | dottable(V, T))
+forall(T, V | dottable(V, T))
 T length_squared(V v) {
    return dot(v, v);
+}
 forall(otype T, otype V | { T length(V); } | subtract(V))
+forall(T, V | { T length(V); } | subtract(V))
 T distance(V v1, V v2) {
     return length(v1 - v2);
+}
 forall(otype T, otype V | { T length(V); V ?/?(V, T); })
+forall(T, V | { T length(V); V ?/?(V, T); })
 V normalize(V v) {
     return v / length(v);
 …
 // Project vector u onto vector v
 forall(otype T, otype V | dottable(V, T) | { V normalize(V); V ?*?(V, T); })
+forall(T, V | dottable(V, T) | { V normalize(V); V ?*?(V, T); })
 V project(V u, V v) {
     V v_norm = normalize(v);
 …
 // Reflect incident vector v with respect to surface with normal n
 forall(otype T | fromint(T), otype V | { V project(V, V); V ?*?(T, V); V ?-?(V,V); })
+forall(T | fromint(T), V | { V project(V, V); V ?*?(T, V); V ?-?(V,V); })
 V reflect(V v, V n) {
     return v - (T){2} * project(v, n);
 …
 // entering material (i.e., from air to water, eta = 1/1.33)
 // v and n must already be normalized
 forall(otype T | fromint(T) | subtract(T) | multiply(T) | add(T) | lessthan(T) | sqrt(T),
        otype V | dottable(V, T) | { V ?*?(T, V); V ?-?(V,V); void ?{}(V&, zero_t); })
+forall(T | fromint(T) | subtract(T) | multiply(T) | add(T) | lessthan(T) | sqrt(T),
+       V | dottable(V, T) | { V ?*?(T, V); V ?-?(V,V); void ?{}(V&, zero_t); })
 V refract(V v, V n, T eta) {
     T dotValue = dot(n, v);
 …
 // i is the incident vector
 // ng is the geometric normal of the surface
 forall(otype T | lessthan(T) | zeroinit(T), otype V | dottable(V, T) | negate(V))
+forall(T | lessthan(T) | zeroinit(T), V | dottable(V, T) | negate(V))
 V faceforward(V n, V i, V ng) {
     return dot(ng, i) < (T){0} ? n : -n;

libcfa/src/vec/vec2.hfa

-              rb6a8b31
+              rd95969a
 #include "vec.hfa"
 forall (otype T) {
+forall (T) {
     struct vec2 {
         T x, y;
 …
+}
 forall (otype T) {
+forall (T) {
     static inline {
 …
+}
 forall(dtype ostype, otype T | writeable(T, ostype)) {
+forall(ostype &, T | writeable(T, ostype)) {
     ostype & ?|?(ostype & os, vec2(T) v) with (v) {
         return os | '<' | x | ',' | y | '>';

libcfa/src/vec/vec3.hfa

-              rb6a8b31
+              rd95969a
 #include "vec.hfa"
 forall (otype T) {
+forall (T) {
     struct vec3 {
         T x, y, z;
 …
+}
 forall (otype T) {
+forall (T) {
     static inline {
 …
+}
 forall(dtype ostype, otype T | writeable(T, ostype)) {
+forall(ostype &, T | writeable(T, ostype)) {
     ostype & ?|?(ostype & os, vec3(T) v) with (v) {
         return os | '<' | x | ',' | y | ',' | z | '>';

libcfa/src/vec/vec4.hfa

-              rb6a8b31
+              rd95969a
 #include "vec.hfa"
 forall (otype T) {
+forall (T) {
     struct vec4 {
         T x, y, z, w;
 …
+}
 forall (otype T) {
+forall (T) {
     static inline {
 …
+}
 forall(dtype ostype, otype T | writeable(T, ostype)) {
+forall(ostype &, T | writeable(T, ostype)) {
     ostype & ?|?(ostype & os, vec4(T) v) with (v) {
         return os | '<' | x | ',' | y | ',' | z | ',' | w | '>';

src/Parser/parser.yy

-              rb6a8b31
+              rd95969a
 type_parameter:                                                                                 // CFA
         type_class identifier_or_type_name
+                { typedefTable.addToScope( *$2, TYPEDEFname, "9" ); }
+                {   typedefTable.addToScope( *$2, TYPEDEFname, "9" );
+                        if ( $1 == TypeDecl::Otype ) { SemanticError( yylloc, "otype keyword is deprecated" ); }
+                        if ( $1 == TypeDecl::Dtype ) { SemanticError( yylloc, "dtype keyword is deprecated" ); }
+                        if ( $1 == TypeDecl::Ttype ) { SemanticError( yylloc, "ttype keyword is deprecated" ); }
+                }
           type_initializer_opt assertion_list_opt
                 { $$ = DeclarationNode::newTypeParam( $1, $2 )->addTypeInitializer( $4 )->addAssertions( $5 ); }

src/ResolvExpr/PolyCost.cc

-              rb6a8b31
+              rd95969a
                 PassVisitor<PolyCost> coster( env, indexer );
                 type->accept( coster );
                 return coster.pass.result;
+                return (coster.pass.result > 0) ? 1 : 0;
+        }
 …
         ast::Pass<PolyCost_new> costing( symtab, env );
         type->accept( costing );
         return costing.core.result;
+        return (costing.core.result > 0) ? 1 : 0;
+}

src/ResolvExpr/SpecCost.cc

-              rb6a8b31
+              rd95969a
                 // mark specialization of base type
                 void postvisit(ReferenceType*) { if ( count >= 0 ) ++count; }
+                void postvisit(StructInstType*) { if ( count >= 0 ) ++count; }
+                void postvisit(UnionInstType*) { if ( count >= 0 ) ++count; }
         private:
 …
                 void previsit(StructInstType* sty) {
                         count = minover( sty->parameters );
-                        visit_children = false;
+                }
 …
                 void previsit(UnionInstType* uty) {
                         count = minover( uty->parameters );
-                        visit_children = false;
+                }
 …
                 void postvisit( const ast::ArrayType * ) { if ( count >= 0 ) ++count; }
                 void postvisit( const ast::ReferenceType * ) { if ( count >= 0 ) ++count; }
+                void postvisit( const ast::StructInstType * ) { if ( count >= 0 ) ++count; }
+                void postvisit( const ast::UnionInstType * ) { if ( count >= 0 ) ++count; }
                 // Use the minimal specialization value over returns and params.
 …
                 void previsit( const ast::StructInstType * sty ) {
                         count = minimumPresent( sty->params, expr_result );
-                        visit_children = false;
+                }
 …
                 void previsit( const ast::UnionInstType * uty ) {
                         count = minimumPresent( uty->params, expr_result );
-                        visit_children = false;
+                }

tests/alloc2.cfa

rb6a8b31	rd95969a
16	16	bool passed = (malloc_size(ip) == size) && (malloc_usable_size(ip) >= size) && (malloc_alignment(ip) == align) && ((uintptr_t)ip % align == 0);
17	17	if (!passed) {
18		printf("failed test %3d: %4lu %4lu but got %4lu ( %3lu ) %4lu\n", tests_total, size, align, malloc_size(ip), malloc_usable_size(ip), malloc_alignment(ip));
	18	printf("failed test %3d: %4zu %4zu but got %4zu ( %3zu ) %4zu\n", tests_total, size, align, malloc_size(ip), malloc_usable_size(ip), malloc_alignment(ip));
19	19	tests_failed += 1;
20	20	}

tests/avltree/avl-private.cfa

-              rb6a8b31
+              rd95969a
 // an AVL tree's height is easy to compute
 // just follow path with the larger balance
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int height(tree(K, V) * t){
   int helper(tree(K, V) * t, int ht){
 …
+}
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int calcBalance(tree(K, V) * t){
   int l = height(t->left);
 …
 // re-establish the link between parent and child
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void relinkToParent(tree(K, V) * t){
   tree(K, V) * parent = t->parent; // FIX ME!!
 …
 // rotate left from t
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * rotateLeft(tree(K, V) * t){
   tree(K, V) * newRoot = t->right;
 …
 // rotate right from t
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * rotateRight(tree(K, V) * t){
   tree(K, V) * newRoot = t->left;
 …
 // balances a node that has balance factor -2 or 2
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * fix(tree(K, V) * t){
   // ensure that t's balance factor is one of
 …
 // attempt to fix the tree, if necessary
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * tryFix(tree(K, V) * t){
   int b = calcBalance(t);
 …
 // sets parent field of c to be p
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void setParent(tree(K, V) * c, tree(K, V) * p){
   if (! empty(c)){

tests/avltree/avl-private.h

-              rb6a8b31
+              rd95969a
 // attempt to fix the tree, if necessary
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * tryFix(tree(K, V) * t);
 // sets parent field of c to be p
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void setParent(tree(K, V) * c, tree(K, V) * p);
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int height(tree(K, V) * t);

tests/avltree/avl.h

-              rb6a8b31
+              rd95969a
 // #include <lib.h>
 trait Comparable(otype T) {
+trait Comparable(T) {
   int ?<?(T, T);
 };
 forall(otype T | Comparable(T))
+forall(T | Comparable(T))
 int ?==?(T t1, T t2);
 forall(otype T | Comparable(T))
+forall(T | Comparable(T))
 int ?>?(T t1, T t2);
 …
 // temporary: need forward decl to get around typedef problem
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 struct tree;
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 struct tree {
   K key;
 …
 };
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void ?{}(tree(K, V) &t, K key, V value);
 forall(otype K, otype V)
+forall(K | Comparable(K), V)
 void ^?{}(tree(K, V) & t);
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * create(K key, V value);
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 V * find(tree(K, V) * t, K key);
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int empty(tree(K, V) * t);
 // returns the root of the tree
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int insert(tree(K, V) ** t, K key, V value);
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int remove(tree(K, V) ** t, K key);
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void copy(tree(K, V) * src, tree(K, V) ** ret);
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void for_each(tree(K, V) * t, void (*func)(V));

tests/avltree/avl0.cfa

-              rb6a8b31
+              rd95969a
 #include "avl.h"
 forall(otype T | Comparable(T))
+forall(T | Comparable(T))
 int ?==?(T t1, T t2) {
   return !(t1 < t2) && !(t2 < t1);
+}
 forall(otype T | Comparable(T))
+forall(T | Comparable(T))
 int ?>?(T t1, T t2) {
   return t2 < t1;

tests/avltree/avl1.cfa

-              rb6a8b31
+              rd95969a
 #include <stdlib.hfa>
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void ?{}(tree(K, V) &t, K key, V value){
   (t.key) { key };
 …
+}
 forall(otype K, otype V)
+forall(K| Comparable(K), V)
 void ^?{}(tree(K, V) & t){
   delete(t.left);
 …
+}
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * create(K key, V value) {
   // infinite loop trying to resolve ... t = malloc();

tests/avltree/avl2.cfa

-              rb6a8b31
+              rd95969a
 #include "avl-private.h"
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 V * find(tree(K, V) * t, K key){
   if (empty(t)){
 …
+}
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int empty(tree(K, V) * t){
   return t == NULL;
 …
 // returns the root of the tree
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int insert(tree(K, V) ** t, K key, V value) {
   // handles a non-empty tree

tests/avltree/avl3.cfa

-              rb6a8b31
+              rd95969a
 // swaps the data within two tree nodes
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void node_swap(tree(K, V) * t, tree(K, V) * t2){
         swap( t->key,  t2->key);
 …
 // go left as deep as possible from within the right subtree
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * find_successor(tree(K, V) * t){
         tree(K, V) * find_successor_helper(tree(K, V) * t){
 …
 // cleanup - don't want to deep delete, so set children to NULL first.
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void deleteSingleNode(tree(K, V) * t) {
         t->left = NULL;
 …
 // does the actual remove operation once we've found the node in question
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * remove_node(tree(K, V) * t){
         // is the node a leaf?
 …
 // finds the node that needs to be removed
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * remove_helper(tree(K, V) * t, K key, int * worked){
         if (empty(t)){
 …
+}
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int remove(tree(K, V) ** t, K key){
         int worked = 0;

tests/avltree/avl4.cfa

-              rb6a8b31
+              rd95969a
 // Perform a shallow copy of src, return the
 // new tree in ret
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int copy(tree(K, V) * src, tree(K, V) ** ret){
   tree(K, V) * helper(tree(K, V) * t, int * worked){
 …
 // Apply func to every value element in t, using an in order traversal
 forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void for_each(tree(K, V) * t, int (*func)(V)) {
   if (t == NULL) {

tests/bugs/10.cfa

rb6a8b31	rd95969a
2	2	// https://cforall.uwaterloo.ca/trac/ticket/10
3	3
4		forall(~~otype~~ T)
	4	forall(T)
5	5	struct result {
6	6	union {

tests/bugs/104.cfa

rb6a8b31	rd95969a
4	4	[ float, float ] modf_( float x );
5	5
6		forall(~~otype~~ T \| { [T, T] modf_(T); })
	6	forall(T \| { [T, T] modf_(T); })
7	7	void modf(T);
8	8

tests/bugs/194.cfa

-              rb6a8b31
+              rd95969a
 // https://cforall.uwaterloo.ca/trac/ticket/194
 forall( dtype T | sized(T) ) T * foo( void ) {
+forall( T & | sized(T) ) T * foo( void ) {
       printf( "foo1\n" );
         return (T *)0;
+}
 forall( dtype T | sized(T) ) T & foo( void ) {
+forall( T & | sized(T) ) T & foo( void ) {
         printf( "foo2\n" );
         return (T &)*(T *)0;

tests/bugs/196.cfa

-              rb6a8b31
+              rd95969a
 // https://cforall.uwaterloo.ca/trac/ticket/196
 forall(dtype T)
+forall(T &)
 struct link;
 forall(dtype T)
+forall(T &)
 struct link {
         link(T) * next;
 …
 // -----
 forall(dtype T)
+forall(T &)
 struct foo;
 forall(dtype U)
+forall(U &)
 struct bar {
         foo(U) * data;
 };
 forall(dtype T)
+forall(T &)
 struct foo {};

tests/bugs/203-2.cfa

-              rb6a8b31
+              rd95969a
 // Trac ticket: https://cforall.uwaterloo.ca/trac/ticket/203
 forall(dtype A)
+forall(A &)
 struct empty {
         // Nothing.
 };
 forall(dtype C)
+forall(C &)
 struct wrap_e {
         empty(C) field;

tests/bugs/203-7.cfa

-              rb6a8b31
+              rd95969a
 // Trac ticket: https://cforall.uwaterloo.ca/trac/ticket/203
 forall(dtype A)
+forall(A &)
 struct empty {
         // Nothing.
 };
 forall(dtype C)
+forall(C &)
 struct wrap_e {
         empty(C) field;

tests/bugs/203-9.cfa

-              rb6a8b31
+              rd95969a
 // Trac ticket: https://cforall.uwaterloo.ca/trac/ticket/203
 forall(dtype A)
+forall(A &)
 struct empty {
         // Nothing.
 };
 forall(dtype C)
+forall(C &)
 struct wrap_e {
         empty(C) field;

tests/bugs/7.cfa

-              rb6a8b31
+              rd95969a
 // (Bug 1 unresolved as of this test.)
 forall(otype T)
+forall(T)
 struct stack_node;
 forall(otype T)
+forall(T)
 struct stack_node {
     stack_node(T) * next;
 …
 };
 forall(otype T)
+forall(T)
 struct stack {
     stack_node(T) * head;
 };
 trait stack_errors(otype T) {
+trait stack_errors(T) {
     T emptyStackHandler (stack(T) * this);
 };
 forall(otype T | stack_errors(T))
+forall(T | stack_errors(T))
 T pop (stack(T) * this) {
     return (T){};

tests/castError.cfa

rb6a8b31	rd95969a
14	14	//
15	15
16		forall(~~otype~~ T) struct S { T p; };
	16	forall(T) struct S { T p; };
17	17	int f;
18	18	S(int) sint;

tests/concurrent/examples/boundedBufferEXT.cfa

rb6a8b31	rd95969a
24	24	enum { BufferSize = 50 };
25	25
26		forall( ~~otype~~ T ) {
	26	forall( T ) {
27	27	monitor Buffer {
28	28	int front, back, count;

tests/concurrent/examples/boundedBufferINT.cfa

rb6a8b31	rd95969a
24	24	enum { BufferSize = 50 };
25	25
26		forall( ~~otype~~ T ) {
	26	forall( T ) {
27	27	monitor Buffer {
28	28	condition full, empty;

tests/concurrent/examples/quickSort.generic.cfa

rb6a8b31	rd95969a
21	21	#include <string.h> // strcmp
22	22
23		forall( ~~otype~~ T \| { int ?<?( T, T ); } ) {
	23	forall( T \| { int ?<?( T, T ); } ) {
24	24	thread Quicksort {
25	25	T * values; // communication variables

tests/concurrent/multi-monitor.cfa

rb6a8b31	rd95969a
38	38	}
39	39
40		forall(~~dtype T~~ \| sized(T) \| { void ^?{}(T & mutex); })
	40	forall(T & \| sized(T) \| { void ^?{}(T & mutex); })
41	41	void delete_mutex(T * x) {
42	42	^(*x){};

tests/concurrent/thread.cfa

rb6a8b31	rd95969a
1	1	#include <fstream.hfa>
2	2	#include <kernel.hfa>
	3	#include <locks.hfa>
3	4	#include <stdlib.hfa>
4	5	#include <thread.hfa>

tests/errors/completeType.cfa

-              rb6a8b31
+              rd95969a
 void foo(int *) {}
 void bar(void *) {}
 forall(otype T) void baz(T *);
 forall(dtype T) void qux(T *);
 forall(dtype T | sized(T)) void quux(T *);
+forall(T) void baz(T *);
+forall(T &) void qux(T *);
+forall(T & | sized(T)) void quux(T *);
 struct A;       // incomplete
 …
 forall(otype T)
+forall(T)
 void baz(T * x) {
         // okay
 …
+}
 forall(dtype T)
+forall(T &)
 void qux(T * y) {
         // okay
 …
+}
 forall(dtype T | sized(T))
+forall(T & | sized(T))
 void quux(T * z) {
         // okay

tests/exceptions/defaults.cfa

rb6a8b31	rd95969a
55	55
56	56	void unhandled_test(void) {
57		forall(~~dtype T, dtype V~~ \| is_exception(T, V))
	57	forall(T &, V & \| is_exception(T, V))
58	58	void defaultTerminationHandler(T &) {
59	59	throw (unhandled_exception){};

tests/exceptions/polymorphic.cfa

-              rb6a8b31
+              rd95969a
 #include <exception.hfa>
 FORALL_TRIVIAL_EXCEPTION(proxy, (otype T), (T));
 FORALL_TRIVIAL_INSTANCE(proxy, (otype U), (U))
+FORALL_TRIVIAL_EXCEPTION(proxy, (T), (T));
+FORALL_TRIVIAL_INSTANCE(proxy, (U), (U))
 const char * msg(proxy(int) * this) { return "proxy(int)"; }
 …
+}
 FORALL_DATA_EXCEPTION(cell, (otype T), (T))(
+FORALL_DATA_EXCEPTION(cell, (T), (T))(
         T data;
 );
 FORALL_DATA_INSTANCE(cell, (otype T), (T))
+FORALL_DATA_INSTANCE(cell, (T), (T))
 const char * msg(cell(int) * this) { return "cell(int)"; }

tests/exceptions/virtual-poly.cfa

-              rb6a8b31
+              rd95969a
 };
 forall(otype T)
+forall(T)
 struct mono_child_vtable {
         mono_base_vtable const * const parent;
 };
 forall(otype T)
+forall(T)
 struct mono_child {
         mono_child_vtable(T) const * virtual_table;
 …
+}
 forall(otype U)
+forall(U)
 struct poly_base_vtable {
         poly_base_vtable(U) const * const parent;
 };
 forall(otype U)
+forall(U)
 struct poly_base {
         poly_base_vtable(U) const * virtual_table;
 };
 forall(otype V)
+forall(V)
 struct poly_child_vtable {
         poly_base_vtable(V) const * const parent;
 };
 forall(otype V)
+forall(V)
 struct poly_child {
         poly_child_vtable(V) const * virtual_table;

tests/forall.cfa

-              rb6a8b31
+              rd95969a
 void g1() {
         forall( otype T ) T f( T ) {};
+        forall( T ) T f( T ) {};
         void f( int ) {};
         void h( void (*p)(void) ) {};
 …
 void g2() {
         forall( otype T ) void f( T, T ) {}
         forall( otype T, otype U ) void f( T, U ) {}
+        forall( T ) void f( T, T ) {}
+        forall( T, U ) void f( T, U ) {}
         int x;
 …
+}
 typedef forall ( otype T ) int (* f)( int );
 forall( otype T )
+typedef forall ( T ) int (* f)( int );
+forall( T )
 void swap( T left, T right ) {
         T temp = left;
 …
+}
 trait sumable( otype T ) {
+trait sumable( T ) {
         void ?{}( T &, zero_t );                                                        // 0 literal constructor
         T ?+?( T, T );                                                                          // assortment of additions
 …
 }; // sumable
 forall( otype T | sumable( T ) )                                                // use trait
+forall( T | sumable( T ) )                                              // use trait
 T sum( size_t size, T a[] ) {
         T total = 0;                                                                            // initialize by 0 constructor
 …
 } // sum
 forall( otype T | { T ?+?( T, T ); T ?++( T & ); [T] ?+=?( T &,T ); } )
+forall( T | { T ?+?( T, T ); T ?++( T & ); [T] ?+=?( T &,T ); } )
 T twice( T t ) {
         return t + t;
+}
 forall( otype T | { int ?<?(T, T); } )
+forall( T | { int ?<?(T, T); } )
 T min( T t1, T t2 ) {
         return t1 < t2 ? t1 : t2;
 …
 // Multiple forall
 forall( otype T ) forall( otype S ) struct { int i; };
 forall( otype T ) struct { int i; } forall( otype S );
 struct { int i; } forall( otype T ) forall( otype S );
 forall( otype W ) struct { int i; } forall( otype T ) forall( otype S );
+forall( T ) forall( S ) struct { int i; };
+forall( T ) struct { int i; } forall( S );
+struct { int i; } forall( T ) forall( S );
+forall( W ) struct { int i; } forall( T ) forall( S );
 // Distribution
 struct P { int i; };
 forall( otype T ) struct Q { T i; };
 forall( otype T ) struct { int i; };
+forall( T ) struct Q { T i; };
+forall( T ) struct { int i; };
 struct KK { int i; };
 inline static {
         void RT1() {}
+}
 forall( otype T ) {
+forall( T ) {
         T RT2( T ) {
                 typedef int TD1;
                 struct S1 { T t; };
+        }
         forall( otype X ) {
+        forall( X ) {
                 typedef int TD2;
                 struct S2 {};
 …
+        }
         extern "C" {
                 forall( otype W ) {
+                forall( W ) {
                         W RT3( W ) {}
                         struct S3 {};
 …
+        }
         void RT4() {
                 forall( otype W ) struct S4 {};
+                forall( W ) struct S4 {};
                 typedef int TD3;
+        }
 …
 static inline {
         forall( otype T ) {
+        forall( T ) {
                 int RT6( T p );
+        }
         forall( otype T, otype U ) {
+        forall( T, U ) {
                 int RT7( T, U );
+        }
+}
 static forall( otype T ) {
+static forall( T ) {
         int RT8( T );
+}
 forall( otype T ) inline static {
+forall( T ) inline static {
         int RT9( T ) { T t; return 3; }
+}
 forall( otype T | { T ?+?( T, T ); } ) {
         forall( otype S | { T ?+?( T, S ); } ) {
                 forall( otype W ) T bar( T t, S s ) { return t + s; }
                 forall( otype W | { W ?+?( T, W ); } ) W baz( T t, S s, W w ) { return t + s + w; }
+forall( T | { T ?+?( T, T ); } ) {
+        forall( S | { T ?+?( T, S ); } ) {
+                forall( W ) T bar( T t, S s ) { return t + s; }
+                forall( W | { W ?+?( T, W ); } ) W baz( T t, S s, W w ) { return t + s + w; }
                 struct W { T t; } (int,int) ww;
                 struct P pp;
 …
+}
 forall( otype T | { T ?+?( T, T ); } ) forall( otype S | { T ?+?( T, S ); } )
+forall( T | { T ?+?( T, T ); } ) forall( S | { T ?+?( T, S ); } )
 struct XW { T t; };
 XW(int,int) xww;
 forall( otype T ) struct S { T t; } (int) x, y, z;
 forall( otype T ) struct { T t; } (int) a, b, c;
 forall( otype T ) static forall( otype S ) {
     forall( otype X ) struct U {
+forall( T ) struct S { T t; } (int) x, y, z;
+forall( T ) struct { T t; } (int) a, b, c;
+forall( T ) static forall( S ) {
+    forall( X ) struct U {
                 T x;
     };
+}
 forall( otype T ) {
+forall( T ) {
         extern "C" {
                 struct SS { T t; };

tests/function-operator.cfa

-              rb6a8b31
+              rd95969a
 // STL-like Algorithms
 trait Assignable(dtype T, dtype U) { T ?=?(T &, U); };
 trait Copyable(dtype T) { void ?{}(T &, T); };
 trait Destructable(dtype T) { void ^?{}(T &); };
+trait Assignable(T &, U &) { T ?=?(T &, U); };
+trait Copyable(T &) { void ?{}(T &, T); };
+trait Destructable(T &) { void ^?{}(T &); };
 trait Iterator(dtype iter | sized(iter) | Copyable(iter) | Destructable(iter), otype T) {
+trait Iterator(iter & | sized(iter) | Copyable(iter) | Destructable(iter), T) {
         T & *?(iter);
         iter ++?(iter &);
 …
 };
 forall(otype Tin, dtype Input | Iterator(Input, Tin), otype Tout, dtype Output | Iterator(Output, Tout) | Assignable(Tout, Tin))
+forall(Tin, Input & | Iterator(Input, Tin), Tout, Output & | Iterator(Output, Tout) | Assignable(Tout, Tin))
 Output copy(Input first, Input last, Output result) {
         while (first != last) {
 …
 // test ?()(T *, ...) -- ?() with function call-by-pointer
 forall(otype Tin, dtype Input | Iterator(Input, Tin), otype Tout, dtype Output | Iterator(Output, Tout), otype FuncRet, dtype Func | { FuncRet ?()(Func *, Tin); } | Assignable(Tout, FuncRet))
+forall(Tin, Input & | Iterator(Input, Tin), Tout, Output & | Iterator(Output, Tout), FuncRet, Func & | { FuncRet ?()(Func *, Tin); } | Assignable(Tout, FuncRet))
 Output transform (Input first, Input last, Output result, Func * op) {
         while (first != last) {
 …
 // test ?()(T, ...) -- ?() with function call-by-value
 forall(dtype Iter, otype T | Iterator(Iter, T), otype Pred | { int ?()(Pred, T); })
+forall(Iter &, T | Iterator(Iter, T), Pred | { int ?()(Pred, T); })
 Iter find_if (Iter first, Iter last, Pred pred) {
         while (first != last) {
 …
 // test ?()(T, ...) -- ?() with function call-by-reference
 forall(otype Generator, otype GenRet | { GenRet ?()(Generator &); }, dtype Iter, otype T | Iterator(Iter, T) | Assignable(T, GenRet))
+forall(Generator, GenRet | { GenRet ?()(Generator &); }, Iter &, T | Iterator(Iter, T) | Assignable(T, GenRet))
 void generate(Iter first, Iter last, Generator & gen) {
         int i = 0;
 …
+}
 forall(otype T | { int ?==?(T, T); })
+forall(T | { int ?==?(T, T); })
 struct Equals {
         T val;
 };
 forall(otype T | { int ?==?(T, T); })
+forall(T | { int ?==?(T, T); })
 int ?()(Equals(T) eq, T x) {
         return eq.val == x;
+}
 forall(otype T | { T ?*?(T, T); })
+forall(T | { T ?*?(T, T); })
 struct Multiply {
         T val;
 };
 forall(otype T | { T ?*?(T, T); })
+forall(T | { T ?*?(T, T); })
 T ?()(Multiply(T) * mult, T x) {
         return mult->val * x;
 …
 // TODO: generalize to ttype return; doesn't work yet
 // like std::function
 forall(otype Return, ttype Args)
+forall(Return, Args...)
 struct function {
         Return (*f)(Args);

tests/genericUnion.cfa

-              rb6a8b31
+              rd95969a
 #include <limits.hfa>
 forall(otype T)
+forall(T)
 union ByteView {
         T val;
 …
 };
 forall(otype T)
+forall(T)
 void print(ByteView(T) x) {
         for (int i = 0; i < sizeof(int); i++) {                         // want to change to sizeof(T)
 …
+}
 forall(otype T)
+forall(T)
 void f(ByteView(T) x, T val) {
         print(x);

tests/global-monomorph.cfa

-              rb6a8b31
+              rd95969a
 // Create monomorphic instances of polymorphic types at global scope.
 forall(dtype T)
+forall(T &)
 void poly0(T &) {}
 forall(dtype T | sized(T))
+forall(T & | sized(T))
 void poly1(T &) {}
 forall(otype T)
+forall(T)
 void poly2(T &) {}

tests/identity.cfa

rb6a8b31	rd95969a
16	16	#include <fstream.hfa>
17	17
18		forall( ~~otype~~ T )
	18	forall( T )
19	19	T identity( T t ) {
20	20	return t;

tests/init1.cfa

-              rb6a8b31
+              rd95969a
+}
 forall (dtype T, dtype S)
+forall (T &, S &)
 T & anycvt( S & s ) {
     return s;               // mismatched referenced type
+}
 forall (dtype T, dtype S)
+forall (T &, S &)
 T * anycvt( S * s ) {
     return s;               // mismatched referenced type

tests/nested-types.cfa

rb6a8b31	rd95969a
16	16	typedef int N;
17	17	struct A {
18		forall(~~otype~~ T)
	18	forall(T)
19	19	struct N {
20	20	T x;

tests/poly-d-cycle.cfa

-              rb6a8b31
+              rd95969a
 // Check that a cycle of polymorphic dtype structures can be instancated.
 forall(dtype T)
+forall(T &)
 struct func_table;
 forall(dtype U)
+forall(U &)
 struct object {
         func_table(U) * virtual_table;
 };
 forall(dtype T)
+forall(T &)
 struct func_table {
         void (*object_func)(object(T) *);

tests/poly-o-cycle.cfa

-              rb6a8b31
+              rd95969a
 // Check that a cycle of polymorphic otype structures can be instancated.
 forall(otype T)
+forall(T)
 struct func_table;
 forall(otype U)
+forall(U)
 struct object {
         func_table(U) * virtual_table;
 };
 forall(otype T)
+forall(T)
 struct func_table {
         void (*object_func)(object(T) *);

tests/polymorphism.cfa

-              rb6a8b31
+              rd95969a
 #include <fstream.hfa>
 forall(otype T)
+forall(T)
 T f(T x, T y) {
         x = y;
 …
+}
 forall(otype T) T ident(T x) {
+forall(T) T ident(T x) {
         return x;
+}
 forall( otype T, otype U )
+forall( T, U )
 size_t struct_size( T i, U j ) {
         struct S { T i; U j; };
 …
+}
 forall( otype T, otype U )
+forall( T, U )
 size_t union_size( T i, U j ) {
         union B { T i; U j; };
 …
 // perform some simple operations on aggregates of T and U
 forall( otype T | { void print(T); int ?==?(T, T); }, otype U | { void print(U); U ?=?(U&, zero_t); } )
+forall( T | { void print(T); int ?==?(T, T); }, U | { void print(U); U ?=?(U&, zero_t); } )
 U foo(T i, U j) {
         struct S { T i; U j; };

tests/raii/ctor-autogen.cfa

-              rb6a8b31
+              rd95969a
 // dtype-static generic type is otype
 forall(dtype T)
+forall(T &)
 struct DtypeStaticStruct {
   T * data;
 …
 };
 forall(dtype T)
+forall(T &)
 union DtypeStaticUnion {
   T * data;
 …
 // dynamic generic type is otype
 forall(otype T)
+forall(T)
 struct DynamicStruct {
         T x;
 };
 forall(otype T)
+forall(T)
 union DynamicUnion {
         T x;
 …
 forall(otype T)
+forall(T)
 T identity(T x) { return x; }

tests/simpleGenericTriple.cfa

-              rb6a8b31
+              rd95969a
 //
 forall(otype T)
+forall(T)
 struct T3 {
         T f0, f1, f2;
 };
 forall(otype T | { T ?+?(T, T); })
+forall(T | { T ?+?(T, T); })
 T3(T) ?+?(T3(T) x, T3(T) y) {
         T3(T) z = { x.f0+y.f0, x.f1+y.f1, x.f2+y.f2 };

tests/sum.cfa

-              rb6a8b31
+              rd95969a
 #include <stdlib.hfa>
 trait sumable( otype T ) {
+trait sumable( T ) {
         void ?{}( T &, zero_t );                                                        // 0 literal constructor
         T ?+?( T, T );                                                                          // assortment of additions
 …
 }; // sumable
 forall( otype T | sumable( T ) )                                                // use trait
+forall( T | sumable( T ) )                                              // use trait
 T sum( size_t size, T a[] ) {
         T total = 0;                                                                            // initialize by 0 constructor
 …
                  | sum( size, (S *)a ) | ", check" | (S)s;
         forall( otype Impl | sumable( Impl ) )
+        forall( Impl | sumable( Impl ) )
         struct GS {
                 Impl * x, * y;
 …
                  sum( size, (S *)a ).[i, j], s.[i, j] );
         forall( otype Impl | sumable( Impl ) )
+        forall( Impl | sumable( Impl ) )
         struct GS {
                 Impl * x, * y;

tests/tuple/tuplePolymorphism.cfa

-              rb6a8b31
+              rd95969a
 // ensure that f is a viable candidate for g, even though its parameter structure does not exactly match
 [A] f([A, B] x, B y) { printf("%g %c %g %lld %c %lld %lld %c %lld\n", x.0.[x,y,z], x.1.[x,y,z], y.[x,y,z]); return x.0; }
 forall(otype T, otype U | { T f(T, U, U); })
+forall(T, U | { T f(T, U, U); })
 void g(T x, U y) { f(x, y, y); }
 // add two triples
 forall(otype T | { T ?+?(T, T); })
+forall(T | { T ?+?(T, T); })
 [T, T, T] ?+?([T, T, T] x, [T, T, T] y) {
         return [x.0+y.0, x.1+y.1, x.2+y.2];
 …
+}
 forall(otype T)
+forall(T)
 [T, T] foo([T, T] y) {
         [T, T] x;

tests/tuple/tupleVariadic.cfa

-              rb6a8b31
+              rd95969a
         printf("called func(void)\n");
+}
 forall(otype T, ttype Params | { void process(T); void func(Params); })
+forall(T, Params... | { void process(T); void func(Params); })
 void func(T arg1, Params p) {
         process(arg1);
 …
+}
 forall(otype T)
+forall(T)
 T * copy(T x) {
         // test calling new inside a polymorphic function
 …
+}
 forall(ttype T | { void foo(T); }) void bar(T x) {}
+forall(T... | { void foo(T); }) void bar(T x) {}
 void foo(int) {}

tests/zombies/ArrayN.c

rb6a8b31	rd95969a
6	6	// }
7	7
8		forall(~~otype~~ index_t)
	8	forall(index_t)
9	9	index_t offset_to_index(unsigned offset, index_t size) {
10	10	return [offset / size.0, offset % size.1];

tests/zombies/Members.c

-              rb6a8b31
+              rd95969a
 int ?=?( int*, int );
 float ?=?( float*, float );
 forall( dtype DT ) DT * ?=?( DT**, DT* );
 forall(otype T) lvalue T *?( T* );
+forall( DT & ) DT * ?=?( DT**, DT* );
+forall(T) lvalue T *?( T* );
 char *__builtin_memcpy();

tests/zombies/Rank2.c

-              rb6a8b31
+              rd95969a
 int ?=?( int &, int );
 forall(dtype DT) DT * ?=?( DT *&, DT * );
+forall(DT &) DT * ?=?( DT *&, DT * );
 void a() {
         forall( otype T ) void f( T );
         void g( forall( otype U ) void p( U ) );
+        forall( T ) void f( T );
+        void g( forall( U ) void p( U ) );
         g( f );
+}
 …
 void g() {
         void h( int *null );
         forall( otype T ) T id( T );
+        forall( T ) T id( T );
 //      forall( dtype T ) T *0;
 //      int 0;

tests/zombies/abstype.c

-              rb6a8b31
+              rd95969a
+}
 forall( otype T ) T *?( T * );
+forall( T ) T *?( T * );
 int ?++( int * );
 int ?=?( int *, int );
 forall( dtype DT ) DT * ?=?( DT **, DT * );
+forall( DT & ) DT * ?=?( DT **, DT * );
 otype U = int *;

tests/zombies/context.cfa

-              rb6a8b31
+              rd95969a
 // trait declaration
 trait has_q( otype T ) {
+trait has_q( T ) {
         T q( T );
 };
 forall( otype z | has_q( z ) ) void f() {
         trait has_r( otype T, otype U ) {
+forall( z | has_q( z ) ) void f() {
+        trait has_r( T, U ) {
                 T r( T, T (T,U) );
         };

tests/zombies/gc_no_raii/bug-repro/blockers/explicit_cast.c

-              rb6a8b31
+              rd95969a
 };
 forall(otype T)
+forall(T)
 struct gcpointer
+{
 …
 };
 forall(otype T)
+forall(T)
 static inline gcpointer(T) gcmalloc()
+{

tests/zombies/gc_no_raii/bug-repro/blockers/recursive_realloc.c

-              rb6a8b31
+              rd95969a
 #include <stdlib.hfa>
 trait allocator_c(otype T, otype allocator_t)
+trait allocator_c(T, allocator_t)
+{
         void realloc(allocator_t* const, size_t);
 };
 forall(otype T)
+forall(T)
 struct heap_allocator
+{
 …
 };
 forall(otype T)
+forall(T)
 inline void realloc(heap_allocator(T) *const this, size_t size)
+{

tests/zombies/gc_no_raii/bug-repro/deref.c

-              rb6a8b31
+              rd95969a
     forall(otype T)
+    forall(T)
     struct wrap
+    {
 …
     };
     forall(otype T)
+    forall(T)
     T *? (wrap(T) rhs)
+    {

tests/zombies/gc_no_raii/bug-repro/field.c

-              rb6a8b31
+              rd95969a
 //------------------------------------------------------------------------------
 //Declaration
 trait allocator_c(otype T, otype allocator_t)
+trait allocator_c(T, allocator_t)
+{
         void ctor(allocator_t* const);
 …
 };
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 struct vector
+{

tests/zombies/gc_no_raii/bug-repro/malloc.c

rb6a8b31	rd95969a
1		forall(~~otype~~ T)
	1	forall(T)
2	2	struct wrapper
3	3	{
…	…
5	5	};
6	6
7		forall(~~otype~~ T)
	7	forall(T)
8	8	void ctor(wrapper(T)* this)
9	9	{
…	…
11	11	}
12	12
13		forall(~~otype~~ T)
	13	forall(T)
14	14	wrapper(T) gcmalloc()
15	15	{
…	…
19	19	}
20	20
21		forall(~~otype~~ T)
	21	forall(T)
22	22	wrapper(T)* ?=? (wrapper(T)* lhs, wrapper(T)* rhs)
23	23	{

tests/zombies/gc_no_raii/bug-repro/oddtype.c

-              rb6a8b31
+              rd95969a
 forall(dtype T)
+forall(T &)
 struct wrap {
         int i;
 };
 forall(otype T) void ?{}(wrap(T)* this) {}
 forall(otype T) void ?=?(wrap(T)* this) {}
 forall(otype T) void ^?{}(wrap(T)* this) {}
+forall(T) void ?{}(wrap(T)* this) {}
+forall(T) void ?=?(wrap(T)* this) {}
+forall(T) void ^?{}(wrap(T)* this) {}
 struct List_t {

tests/zombies/gc_no_raii/bug-repro/push_back.h

-              rb6a8b31
+              rd95969a
 //------------------------------------------------------------------------------
 //Declaration
 trait allocator_c(otype T, otype allocator_t) {
+trait allocator_c(T, allocator_t) {
         void ctor(allocator_t* const);
         void dtor(allocator_t* const);
 …
 };
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 struct vector
+{
 …
 //------------------------------------------------------------------------------
 //Initialization
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void vector_ctor(vector(T, allocator_t) *const this);
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void dtor(vector(T, allocator_t) *const this);
 //------------------------------------------------------------------------------
 //Allocator
 forall(otype T)
+forall(T)
 struct heap_allocator
+{
 …
 };
 forall(otype T)
+forall(T)
 void ctor(heap_allocator(T) *const this);
 forall(otype T)
+forall(T)
 void dtor(heap_allocator(T) *const this);
 forall(otype T)
+forall(T)
 void realloc(heap_allocator(T) *const this, size_t size);
 forall(otype T)
+forall(T)
 inline T* data(heap_allocator(T) *const this)
+{
 …
 //------------------------------------------------------------------------------
 //Capacity
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 inline bool empty(vector(T, allocator_t) *const this)
+{
 …
+}
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 inline bool size(vector(T, allocator_t) *const this)
+{
 …
+}
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 inline void reserve(vector(T, allocator_t) *const this, size_t size)
+{
 …
 //------------------------------------------------------------------------------
 //Modifiers
 forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void push_back(vector(T, allocator_t) *const this, T value);

tests/zombies/gc_no_raii/bug-repro/realloc.c

-              rb6a8b31
+              rd95969a
 void* realloc(void*, unsigned long int);
 forall(otype T)
+forall(T)
 struct wrap
+{
 …
 };
 forall(otype T)
+forall(T)
 static inline void realloc(wrap(T) *const this, unsigned long int size)
+{

tests/zombies/gc_no_raii/bug-repro/return.c

rb6a8b31	rd95969a
1		forall(~~otype~~ T)
	1	forall(T)
2	2	struct wrapper
3	3	{
…	…
5	5	};
6	6
7		forall(~~otype~~ T)
	7	forall(T)
8	8	wrapper(T) create()
9	9	{
…	…
12	12	}
13	13
14		forall(~~otype~~ T)
	14	forall(T)
15	15	wrapper(T)* ?=?(wrapper(T)* lhs, wrapper(T)* rhs)
16	16	{

tests/zombies/gc_no_raii/bug-repro/return_template.c

-              rb6a8b31
+              rd95969a
 forall(otype T)
+forall(T)
 struct wrap
+{
 …
 };
 forall(otype T) void ?{}(wrap(T)* this);
 forall(otype T) void ?{}(wrap(T)* this, wrap(T)* rhs);
 forall(otype T) void ^?{}(wrap(T)* this);
 forall(otype T) void ?=?(wrap(T)* this, wrap(T)* rhs);
+forall(T) void ?{}(wrap(T)* this);
+forall(T) void ?{}(wrap(T)* this, wrap(T)* rhs);
+forall(T) void ^?{}(wrap(T)* this);
+forall(T) void ?=?(wrap(T)* this, wrap(T)* rhs);
 forall(otype T)
+forall(T)
 wrap(T) test()
+{

tests/zombies/gc_no_raii/bug-repro/slow_malloc.c

rb6a8b31	rd95969a
1	1	#include <stdlib.hfa>
2	2
3		forall(~~otype~~ T)
	3	forall(T)
4	4	struct heap_allocator
5	5	{

tests/zombies/gc_no_raii/bug-repro/zero.c

-              rb6a8b31
+              rd95969a
 forall(otype T)
+forall(T)
 struct wrap
+{
 …
 };
 forall(otype T)
+forall(T)
 int ?==? (wrap(T) lhs, wrap(T) rhs)
+{
 …
 struct wrap(int) 0;
 /*/
 forall(otype T)
+forall(T)
 struct wrap(T) 0;
 //*/

tests/zombies/gc_no_raii/src/gc.h

rb6a8b31	rd95969a
13	13	// }
14	14
15		forall(~~otype~~ T)
	15	forall(T)
16	16	static inline void gcmalloc(gcpointer(T)* ptr)
17	17	{

tests/zombies/gc_no_raii/src/gcpointers.c

-              rb6a8b31
+              rd95969a
 #endif
 forall(otype T) void ?{}(gcpointer(T)* this) {
+forall(T) void ?{}(gcpointer(T)* this) {
         (&this->internal) {};
+}
 forall(otype T) void ?{}(gcpointer(T)* this, void* address) {
+forall(T) void ?{}(gcpointer(T)* this, void* address) {
         (&this->internal) { address };
+}
 forall(otype T) void ?{}(gcpointer(T)* this, gcpointer(T) other) {
+forall(T) void ?{}(gcpointer(T)* this, gcpointer(T) other) {
         (&this->internal) { other.internal };
+}
 forall(otype T) void ^?{}(gcpointer(T)* this) {
+forall(T) void ^?{}(gcpointer(T)* this) {
         ^?{}(&this->internal);
+}
 forall(otype T) gcpointer(T) ?=?(gcpointer(T)* this, gcpointer(T) rhs) {
+forall(T) gcpointer(T) ?=?(gcpointer(T)* this, gcpointer(T) rhs) {
         this->internal = rhs.internal;
         return *this;
 …
 // forall(otype T) T *?(gcpointer(T) this);
 forall(otype T) T* get(gcpointer(T)* this) {
+forall(T) T* get(gcpointer(T)* this) {
         return (T*)this->internal.ptr;
+}
 //
 // //Logical operators
 forall(otype T) int ?!=?(gcpointer(T) this, int zero) {
+forall(T) int ?!=?(gcpointer(T) this, int zero) {
         return this.internal.ptr != 0;
+}

tests/zombies/gc_no_raii/src/gcpointers.h

-              rb6a8b31
+              rd95969a
 #include <stdint.h>
 forall(dtype T)
+forall(T &)
 struct gcpointer;
 …
 #endif
 forall(dtype T)
+forall(T &)
 struct gcpointer
+{
 …
 //
 forall(otype T) void ?{}(gcpointer(T)* this);
 forall(otype T) void ?{}(gcpointer(T)* this, void* address);
 forall(otype T) void ?{}(gcpointer(T)* this, gcpointer(T) other);
 forall(otype T) void ^?{}(gcpointer(T)* this);
 forall(otype T) gcpointer(T) ?=?(gcpointer(T)* this, gcpointer(T) rhs);
+forall(T) void ?{}(gcpointer(T)* this);
+forall(T) void ?{}(gcpointer(T)* this, void* address);
+forall(T) void ?{}(gcpointer(T)* this, gcpointer(T) other);
+forall(T) void ^?{}(gcpointer(T)* this);
+forall(T) gcpointer(T) ?=?(gcpointer(T)* this, gcpointer(T) rhs);
 // forall(otype T) T *?(gcpointer(T) this);
 forall(otype T) T* get(gcpointer(T)* this);
+forall(T) T* get(gcpointer(T)* this);
 //Logical operators
 forall(otype T) int ?!=?(gcpointer(T) this, int zero);
 forall(otype T) int ?!=?(gcpointer(T) this, gcpointer(T) rhs);
 forall(otype T) int ?==?(gcpointer(T) this, gcpointer(T) rhs);
+forall(T) int ?!=?(gcpointer(T) this, int zero);
+forall(T) int ?!=?(gcpointer(T) this, gcpointer(T) rhs);
+forall(T) int ?==?(gcpointer(T) this, gcpointer(T) rhs);

tests/zombies/gc_no_raii/src/tools.h

-              rb6a8b31
+              rd95969a
 // }
 trait has_equal(otype T)
+trait has_equal(T)
+{
         signed int ?==?(T a, T b);
 };
 trait InputIterator_t(otype T, otype InputIterator)
+trait InputIterator_t(T, InputIterator)
+{
         signed int ?==?(InputIterator a, InputIterator b);
 …
 };
 forall(otype T | has_equal(T), otype InputIterator | InputIterator_t(T, InputIterator))
+forall(T | has_equal(T), InputIterator | InputIterator_t(T, InputIterator))
 static inline InputIterator find( InputIterator first, const InputIterator* const last, T val)
+{

tests/zombies/hashtable.cfa

-              rb6a8b31
+              rd95969a
 trait has_hash( otype K ) {
+trait has_hash( K ) {
     size_t hash(K);
     int ?==?( K, K );
 };
 trait hkey( otype K, dtype tN | has_hash(K) ) {
+trait hkey( K, tN & | has_hash(K) ) {
     K key(tN &);
 };
 forall( otype K, dtype tN, dtype tE | $dlistable(tN, tE) | hkey(K, tN) ) {
+forall( K, tN &, tE & | $dlistable(tN, tE) | hkey(K, tN) ) {
     struct hashtable {
 …
+}
 forall( otype K, dtype tN, dtype tE | $dlistable(tN, tE) | hkey(K, tN) | { void defaultResumptionHandler(ht_fill_limit_crossed &); } ) {
+forall( K, tN &, tE & | $dlistable(tN, tE) | hkey(K, tN) | { void defaultResumptionHandler(ht_fill_limit_crossed &); } ) {
     void ?{}( hashtable(K, tN, tE) & this, size_t n_buckets, dlist(tN, tE) *buckets ) {
 …
+}
 forall( otype K, dtype tN, dtype tE | $dlistable(tN, tE) | hkey(K, tN) ) {
+forall( K, tN &, tE & | $dlistable(tN, tE) | hkey(K, tN) ) {
     float fill_frac( hashtable(K, tN, tE) & this ) with(this) {
 …
 trait heaped(dtype T) {
+trait heaped(T &) {
     T * alloc( size_t );
     void free( void * );
 …
+}
 forall( otype K, dtype tN, dtype tE | $dlistable(tN, tE) | hkey(K, tN) | heaped( dlist(tN, tE) ) ) {
+forall( K, tN &, tE & | $dlistable(tN, tE) | hkey(K, tN) | heaped( dlist(tN, tE) ) ) {
     struct hashtable_dynamic {

tests/zombies/hashtable2.cfa

-              rb6a8b31
+              rd95969a
 trait pretendsToMatter( dtype TTT ) {
+trait pretendsToMatter( TTT & ) {
     void actsmart(TTT &);
 };
 forall( dtype TTTx )
+forall( TTTx & )
 void actsmart(TTTx &) {}
 …
 //   2. shows up in -CFA output as hashtable_rbs(), which is bad C; expecting hashtable_rbs*
 forall( otype Tt_unused | pretendsToMatter(Tt_unused) ) {
+forall( Tt_unused | pretendsToMatter(Tt_unused) ) {
     // hashtable of request by source
 …
+}
 forall( otype Tt_unused | pretendsToMatter(Tt_unused) | { void defaultResumptionHandler(ht_fill_limit_crossed &); } ) {
+forall( Tt_unused | pretendsToMatter(Tt_unused) | { void defaultResumptionHandler(ht_fill_limit_crossed &); } ) {
     void ?{}( hashtable_rbs(Tt_unused) & this, size_t n_buckets, dlist(request_in_ht_by_src, request) *buckets,
 …
 void defaultResumptionHandler( ht_auto_resize_pending & ex );
 forall( otype Tt_unused | pretendsToMatter(Tt_unused) ) {
+forall( Tt_unused | pretendsToMatter(Tt_unused) ) {
     float fill_frac( hashtable_rbs(Tt_unused) & this ) with(this) {
 …
 trait heaped(dtype T) {
+trait heaped(T &) {
     T * alloc( size_t );
     void free( void * );
 …
 void __dynamic_defaultResumptionHandler(ht_fill_limit_crossed &);
 forall( otype Tt_unused ) {
+forall( Tt_unused ) {
     struct hashtable_rbs_dynamic {
 …
 forall( otype Tt_unused | heaped( dlist(request_in_ht_by_src, request) ) ) {
+forall( Tt_unused | heaped( dlist(request_in_ht_by_src, request) ) ) {
     void ?{}( hashtable_rbs_dynamic(Tt_unused).resize_policy & this, size_t nbuckets_floor ) {
 …
+}
 forall( otype Tt_unused ) {
+forall( Tt_unused ) {
     void rehashToLarger_STEP( hashtable_rbs_dynamic(Tt_unused) & this, size_t new_n_buckets ) with (this) {
         rehashToLarger( this, new_n_buckets );

tests/zombies/huge.c

rb6a8b31	rd95969a
14	14	//
15	15
16		int huge( int n, forall( ~~otype~~ T ) T (*f)( T ) ) {
	16	int huge( int n, forall( T ) T (*f)( T ) ) {
17	17	if ( n <= 0 )
18	18	return f( 0 );

tests/zombies/it_out.c

-              rb6a8b31
+              rd95969a
 typedef unsigned long streamsize_type;
 trait ostream( dtype os_type ) {
+trait ostream( os_type & ) {
         os_type *write( os_type *, const char *, streamsize_type );
         int fail( os_type * );
 };
 trait writeable( otype T ) {
         forall( dtype os_type | ostream( os_type ) ) os_type * ?<<?( os_type *, T );
+trait writeable( T ) {
+        forall( os_type & | ostream( os_type ) ) os_type * ?<<?( os_type *, T );
 };
 forall( dtype os_type | ostream( os_type ) ) os_type * ?<<?( os_type *, char );
 forall( dtype os_type | ostream( os_type ) ) os_type * ?<<?( os_type *, int );
 forall( dtype os_type | ostream( os_type ) ) os_type * ?<<?( os_type *, const char * );
+forall( os_type & | ostream( os_type ) ) os_type * ?<<?( os_type *, char );
+forall( os_type & | ostream( os_type ) ) os_type * ?<<?( os_type *, int );
+forall( os_type & | ostream( os_type ) ) os_type * ?<<?( os_type *, const char * );
 trait istream( dtype is_type ) {
+trait istream( is_type & ) {
         is_type *read( is_type *, char *, streamsize_type );
         is_type *unread( is_type *, char );
 …
 };
 trait readable( otype T ) {
         forall( dtype is_type | istream( is_type ) ) is_type * ?<<?( is_type *, T );
+trait readable( T ) {
+        forall( is_type & | istream( is_type ) ) is_type * ?<<?( is_type *, T );
 };
 forall( dtype is_type | istream( is_type ) ) is_type * ?>>?( is_type *, char* );
 forall( dtype is_type | istream( is_type ) ) is_type * ?>>?( is_type *, int* );
+forall( is_type & | istream( is_type ) ) is_type * ?>>?( is_type *, char* );
+forall( is_type & | istream( is_type ) ) is_type * ?>>?( is_type *, int* );
 trait iterator( otype iterator_type, otype elt_type ) {
+trait iterator( iterator_type, elt_type ) {
         iterator_type ?++( iterator_type* );
         iterator_type ++?( iterator_type* );
 …
 };
 forall( otype elt_type | writeable( elt_type ),
                 otype iterator_type | iterator( iterator_type, elt_type ),
                 dtype os_type | ostream( os_type ) )
+forall( elt_type | writeable( elt_type ),
+                iterator_type | iterator( iterator_type, elt_type ),
+                os_type & | ostream( os_type ) )
 void write_all( iterator_type begin, iterator_type end, os_type *os );
 forall( otype elt_type | writeable( elt_type ),
                 otype iterator_type | iterator( iterator_type, elt_type ),
                 dtype os_type | ostream( os_type ) )
+forall( elt_type | writeable( elt_type ),
+                iterator_type | iterator( iterator_type, elt_type ),
+                os_type & | ostream( os_type ) )
 void write_all( elt_type begin, iterator_type end, os_type *os ) {
         os << begin;

tests/zombies/new.c

rb6a8b31	rd95969a
14	14	//
15	15
16		forall( ~~otype~~ T )
	16	forall( T )
17	17	void f( T *t ) {
18	18	t--;

tests/zombies/occursError.cfa

-              rb6a8b31
+              rd95969a
 forall( otype T ) void f( void (*)( T, T * ) );
 forall( otype U ) void g( U,  U * );
 forall( otype U ) void h( U *, U );
+forall( T ) void f( void (*)( T, T * ) );
+forall( U ) void g( U,  U * );
+forall( U ) void h( U *, U );
 void test() {

tests/zombies/prolog.c

-              rb6a8b31
+              rd95969a
 void is_integer( int x ) {}
 trait ArithmeticType( otype T ) {
+trait ArithmeticType( T ) {
         void is_arithmetic( T );
 };
 trait IntegralType( otype T | ArithmeticType( T ) ) {
+trait IntegralType( T | ArithmeticType( T ) ) {
         void is_integer( T );
 };
 forall( otype T | IntegralType( T ) | { void printResult( T ); } )
+forall( T | IntegralType( T ) | { void printResult( T ); } )
 void hornclause( T param ) {
         printResult( param );

tests/zombies/quad.c

-              rb6a8b31
+              rd95969a
 #include <fstream.hfa>
 forall( otype T | { T ?*?( T, T ); } )
+forall( T | { T ?*?( T, T ); } )
 T square( T t ) {
         return t * t;
+}
 forall( otype U | { U square( U ); } )
+forall( U | { U square( U ); } )
 U quad( U u ) {
         return square( square( u ) );

tests/zombies/scope.cfa

-              rb6a8b31
+              rd95969a
 y p;
 trait has_u( otype z ) {
+trait has_u( z ) {
         z u(z);
 };
 forall( otype t | has_u( t ) )
+forall( t | has_u( t ) )
 y q( t the_t ) {
         t y = u( the_t );

tests/zombies/simplePoly.c

rb6a8b31	rd95969a
14	14	//
15	15
16		forall( ~~otype T, otype~~ U \| { T f( T, U ); } )
	16	forall( T, U \| { T f( T, U ); } )
17	17	T q( T t, U u ) {
18	18	return f( t, u );

tests/zombies/simpler.c

rb6a8b31	rd95969a
14	14	//
15	15
16		forall( ~~otype~~ T ) T id( T, T );
	16	forall( T ) T id( T, T );
17	17
18	18	int main() {

tests/zombies/specialize.c

rb6a8b31	rd95969a
39	39	}
40	40
41		forall( ~~otype~~ T ) T f( T t )
	41	forall( T ) T f( T t )
42	42	{
43	43	printf( "in f; sizeof T is %d\n", sizeof( T ) );

tests/zombies/square.c

rb6a8b31	rd95969a
16	16	#include <fstream.hfa>
17	17
18		forall( ~~otype~~ T \| { T ?*?( T, T ); } )
	18	forall( T \| { T ?*?( T, T ); } )
19	19	T square( T t ) {
20	20	return t * t;

tests/zombies/structMember.cfa

rb6a8b31	rd95969a
66	66	S.T;
67	67	.S.T;
68		forall( ~~otype S, otype~~ T ) struct W {
	68	forall( S, T ) struct W {
69	69	struct X {};
70	70	};

tests/zombies/subrange.cfa

-              rb6a8b31
+              rd95969a
 // A small context defining the notion of an ordered otype.  (The standard
 // library should probably contain a context for this purpose.)
 trait ordered(otype T) {
+trait ordered(T) {
     int ?<?(T, T), ?<=?(T, T);
 };
 …
 // A subrange otype resembling an Ada subotype with a base otype and a range
 // constraint.
 otype subrange(otype base_t | ordered(base_t), base_t low = 0, base_t high = 8) = base_t;
+otype subrange(base_t | ordered(base_t), base_t low = 0, base_t high = 8) = base_t;
 // Note that subrange() can be applied to floating-point and pointer otypes, not
 …
 // Convenient access to subrange bounds, for instance for iteration:
 forall (otype T, T low, T high)
+forall (T, T low, T high)
 T lbound( subrange(T, low, high) v) {
     return low;
+}
 forall (otype T, T low, T high)
+forall (T, T low, T high)
 T hbound( subrange(T, low, high) v) {
     return high;
 …
 // of exception handling here.  Inlining allows the compiler to eliminate
 // bounds checks.
 forall (otype T | ordered(T), T low, T high)
+forall (T | ordered(T), T low, T high)
 inline subrange(T, low, high) ?=?(subrange(T, low, high)* target, T source) {
     if (low <= source && source <= high) *((T*)target) = source;
 …
 // compares range bounds so that the compiler can optimize checks away when the
 // ranges are known to overlap.
 forall (otype T | ordered(T), T t_low, T t_high, T s_low, T s_high)
+forall (T | ordered(T), T t_low, T t_high, T s_low, T s_high)
 inline subrange(T, t_low, t_high) ?=?(subrange(T, t_low, t_high)* target,
                                       subrange(T, s_low, s_high) source) {

tests/zombies/twice.c

rb6a8b31	rd95969a
16	16	#include <fstream.hfa>
17	17
18		forall( ~~otype~~ T \| { T ?+?( T, T ); } )
	18	forall( T \| { T ?+?( T, T ); } )
19	19	T twice( const T t ) {
20	20	return t + t;

tests/zombies/typeGenerator.cfa

-              rb6a8b31
+              rd95969a
 context addable( otype T ) {
+context addable( T ) {
         T ?+?( T,T );
         T ?=?( T*, T);
 };
 otype List1( otype T | addable( T ) ) = struct { T data; List1( T ) *next; } *;
+otype List1( T | addable( T ) ) = struct { T data; List1( T ) *next; } *;
 typedef List1( int ) ListOfIntegers;
 //List1( int ) li;
 …
 [int] h( * List1( int ) p );                                                    // new declaration syntax
 struct( otype T ) S2 { T i; };                                                  // actual definition
+struct( T ) S2 { T i; };                                                        // actual definition
 struct( int ) S3 v1, *p;                                                                // expansion and instantiation
 struct( otype T )( int ) S24 { T i; } v2;                               // actual definition, expansion and instantiation
 struct( otype T )( int ) { T i; } v2;                                   // anonymous actual definition, expansion and instantiation
+struct( T )( int ) S24 { T i; } v2;                             // actual definition, expansion and instantiation
+struct( T )( int ) { T i; } v2;                                 // anonymous actual definition, expansion and instantiation
 struct( otype T | addable( T ) ) node { T data; struct( T ) node *next; };
 otype List( otype T ) = struct( T ) node *;
+struct( T | addable( T ) ) node { T data; struct( T ) node *next; };
+otype List( T ) = struct( T ) node *;
 List( int ) my_list;

tests/zombies/withStatement.cfa

-              rb6a8b31
+              rd95969a
+}
 forall( otype T )
+forall( T )
 struct Box {
         T x;
 };
 forall( otype T )
+forall( T )
 void ?{}( Box(T) & this ) with( this ) { // with clause in polymorphic function
         x{};
 …
 void print( int i ) { sout | i; }
 forall( otype T | { void print( T ); })
+forall( T | { void print( T ); })
 void foo( T t ) {
         Box( T ) b = { t };

tests/zombies/wrapper/src/pointer.h

rb6a8b31	rd95969a
8	8	// type safe malloc / free
9	9
10		forall(~~otype~~ T)
	10	forall(T)
11	11	T* new()
12	12	{
…	…
16	16	}
17	17
18		forall(~~otype~~ T)
	18	forall(T)
19	19	void delete(T* p)
20	20	{

Context Navigation

Legend: