Changes in / [4468a70:342af53]


Files:
39 added
2 deleted
45 edited

  • Jenkins/FullBuild

    r4468a70 r342af53  
    108108                        string(name: 'GitRef', value: commitId),        \
    109109                        string(name: 'Build' , value: buildNum) \
    110                 ]
     110                ],                                                              \
     111                propagate: false
    111112
    112113        echo(result.result)
  • benchmark/creation/node_cor.js

    r4468a70 r342af53  
    66function * coroutine() { yield }
    77
    8 for ( var i = 0; i < times; i += 1 ) { // warm jit
     8for ( var i = 0; i < times; i += 1 ) { // warm JIT
    99        cor = coroutine()
    1010}
  • benchmark/ctxswitch/node_cor.js

    r4468a70 r342af53  
    1111cor = coroutine()
    1212
    13 for ( var i = 0; i < times; i += 1 ) { // warm git
     13for ( var i = 0; i < times; i += 1 ) { // warm JIT
    1414        cor.next();
    1515}
  • benchmark/io/http/Makefile.am

    r4468a70 r342af53  
    2929EXTRA_PROGRAMS = httpforall .dummy_hack
    3030
     31CLEANFILES = httpforall
     32
    3133nodist_httpforall_SOURCES = \
    3234        filecache.cfa \
  • benchmark/io/http/main.cfa

    r4468a70 r342af53  
    4646}
    4747
     48extern void init_protocol(void);
     49extern void deinit_protocol(void);
     50
    4851//=============================================================================================
    4952// Main
     
    6164        //===================
    6265        // Open Socket
    63         printf("Listening on port %d\n", options.socket.port);
     66        printf("%ld : Listening on port %d\n", getpid(), options.socket.port);
    6467        int server_fd = socket(AF_INET, SOCK_STREAM, 0);
    6568        if(server_fd < 0) {
     
    7982                ret = bind( server_fd, (struct sockaddr *)&address, sizeof(address) );
    8083                if(ret < 0) {
    81                         if(errno == 98) {
     84                        if(errno == EADDRINUSE) {
    8285                                if(waited == 0) {
    8386                                        printf("Waiting for port\n");
     
    109112                options.clopts.instance = &cl;
    110113
     114
    111115                int pipe_cnt = options.clopts.nworkers * 2;
    112116                int pipe_off;
     
    124128                {
    125129                        ServerProc procs[options.clopts.nprocs];
     130
     131                        init_protocol();
    126132                        {
    127133                                Worker workers[options.clopts.nworkers];
     
    151157                                        printf("Shutting Down\n");
    152158                                }
     159
     160                                for(i; options.clopts.nworkers) {
     161                                        printf("Cancelling %p\n", (void*)workers[i].cancel.target);
     162                                        workers[i].done = true;
     163                                        cancel(workers[i].cancel);
     164                                }
     165
     166                                printf("Shutting down socket\n");
     167                                int ret = shutdown( server_fd, SHUT_RD );
     168                                if( ret < 0 ) { abort( "shutdown error: (%d) %s\n", (int)errno, strerror(errno) ); }
     169
     170                                //===================
     171                                // Close Socket
     172                                printf("Closing Socket\n");
     173                                ret = close( server_fd );
     174                                if(ret < 0) {
     175                                        abort( "close socket error: (%d) %s\n", (int)errno, strerror(errno) );
     176                                }
    153177                        }
    154178                        printf("Workers Closed\n");
     179
     180                        deinit_protocol();
    155181                }
    156182
     
    162188                }
    163189                free(fds);
    164         }
    165190
    166         //===================
    167         // Close Socket
    168         printf("Closing Socket\n");
    169         ret = close( server_fd );
    170         if(ret < 0) {
    171                 abort( "close socket error: (%d) %s\n", (int)errno, strerror(errno) );
    172191        }
    173192
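
    The hunk above replaces the magic errno value 98 with the named constant EADDRINUSE and keeps retrying until a previous instance releases the port. A minimal plain-C sketch of that bind-and-retry pattern (the helper name listen_on and the fixed backlog of 128 are illustrative, not taken from main.cfa):

        #include <errno.h>
        #include <netinet/in.h>
        #include <stdio.h>
        #include <stdlib.h>
        #include <sys/socket.h>
        #include <unistd.h>

        // Open a listening TCP socket, waiting out EADDRINUSE (e.g. TIME_WAIT).
        static int listen_on(int port) {
                int fd = socket(AF_INET, SOCK_STREAM, 0);
                if (fd < 0) { perror("socket"); exit(1); }

                struct sockaddr_in addr = {0};
                addr.sin_family = AF_INET;
                addr.sin_addr.s_addr = htonl(INADDR_ANY);
                addr.sin_port = htons(port);

                // Retry while a previous instance still owns the port.
                while (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
                        if (errno != EADDRINUSE) { perror("bind"); exit(1); }
                        printf("Waiting for port\n");
                        sleep(1);
                }
                if (listen(fd, 128) < 0) { perror("listen"); exit(1); }
                return fd;
        }

    The same hunk also reorders teardown: shutdown(server_fd, SHUT_RD) is issued before the workers finish so blocked accepts fail promptly, and close() now happens while the socket is still in scope.
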
  • benchmark/io/http/options.cfa

    r4468a70 r342af53  
    1212#include <parseargs.hfa>
    1313
     14#include <string.h>
     15
    1416Options options @= {
     17        false, // log
     18
    1519        { // file_cache
    1620                0,     // open_flags;
     
    4852                {'p', "port",           "Port the server will listen on", options.socket.port},
    4953                {'c', "cpus",           "Number of processors to use", options.clopts.nprocs},
     54                {'L', "log",            "Enable logs", options.log, parse_settrue},
    5055                {'t', "threads",        "Number of worker threads to use", options.clopts.nworkers},
    5156                {'b', "accept-backlog", "Maximum number of pending accepts", options.socket.backlog},
  • benchmark/io/http/options.hfa

    r4468a70 r342af53  
    88
    99struct Options {
     10        bool log;
     11
    1012        struct {
    1113                int open_flags;
  • benchmark/io/http/protocol.cfa

    r4468a70 r342af53  
    1818#include "options.hfa"
    1919
     20const char * volatile date = 0p;
     21
    2022const char * http_msgs[] = {
    21         "HTTP/1.1 200 OK\nContent-Type: text/plain\nContent-Length: %zu\n\n",
    22         "HTTP/1.1 400 Bad Request\nContent-Type: text/plain\nContent-Length: 0\n\n",
    23         "HTTP/1.1 404 Not Found\nContent-Type: text/plain\nContent-Length: 0\n\n",
    24         "HTTP/1.1 413 Payload Too Large\nContent-Type: text/plain\nContent-Length: 0\n\n",
    25         "HTTP/1.1 414 URI Too Long\nContent-Type: text/plain\nContent-Length: 0\n\n",
     23        "HTTP/1.1 200 OK\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: %zu \n\n",
     24        "HTTP/1.1 400 Bad Request\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: 0 \n\n",
     25        "HTTP/1.1 404 Not Found\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: 0 \n\n",
     26        "HTTP/1.1 413 Payload Too Large\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: 0 \n\n",
     27        "HTTP/1.1 414 URI Too Long\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: 0 \n\n",
    2628};
    2729
     
    4547        while(len > 0) {
    4648                // Call write
    47                 int ret = write(fd, it, len);
     49                int ret = cfa_write(fd, it, len, 0, -1`s, 0p, 0p);
     50                // int ret = write(fd, it, len);
    4851                if( ret < 0 ) { if( errno != EAGAIN && errno != EWOULDBLOCK) abort( "'answer error' error: (%d) %s\n", (int)errno, strerror(errno) ); }
    4952
     
    6366int answer_header( int fd, size_t size ) {
    6467        const char * fmt = http_msgs[OK200];
    65         int len = 100;
     68        int len = 200;
    6669        char buffer[len];
    67         len = snprintf(buffer, len, fmt, size);
     70        len = snprintf(buffer, len, fmt, date, size);
    6871        return answer( fd, buffer, len );
    6972}
    7073
    71 [HttpCode code, bool closed, * const char file, size_t len] http_read(int fd, []char buffer, size_t len) {
     74int answer_plain( int fd, char buffer[], size_t size ) {
     75        int ret = answer_header(fd, size);
     76        if( ret < 0 ) return ret;
     77        return answer(fd, buffer, size);
     78}
     79
     80int answer_empty( int fd ) {
     81        return answer_header(fd, 0);
     82}
     83
     84
     85[HttpCode code, bool closed, * const char file, size_t len] http_read(int fd, []char buffer, size_t len, io_cancellation * cancel) {
    7286        char * it = buffer;
    7387        size_t count = len - 1;
     
    7589        READ:
    7690        for() {
    77                 int ret = cfa_read(fd, (void*)it, count, 0, -1`s, 0p, 0p);
     91                int ret = cfa_read(fd, (void*)it, count, 0, -1`s, cancel, 0p);
     92                // int ret = read(fd, (void*)it, count);
    7893                if(ret == 0 ) return [OK200, true, 0, 0];
    7994                if(ret < 0 ) {
    8095                        if( errno == EAGAIN || errno == EWOULDBLOCK) continue READ;
     96                        // if( errno == EINVAL ) return [E400, true, 0, 0];
    8197                        abort( "read error: (%d) %s\n", (int)errno, strerror(errno) );
    8298                }
     
    92108        }
    93109
    94         printf("%.*s\n", rlen, buffer);
     110        if( options.log ) printf("%.*s\n", rlen, buffer);
    95111
    96112        it = buffer;
     
    104120
    105121void sendfile( int pipe[2], int fd, int ans_fd, size_t count ) {
     122        unsigned sflags = SPLICE_F_MOVE; // | SPLICE_F_MORE;
    106123        off_t offset = 0;
    107124        ssize_t ret;
    108125        SPLICE1: while(count > 0) {
    109                 ret = cfa_splice(ans_fd, &offset, pipe[1], 0p, count, SPLICE_F_MOVE | SPLICE_F_MORE, 0, -1`s, 0p, 0p);
     126                ret = cfa_splice(ans_fd, &offset, pipe[1], 0p, count, sflags, 0, -1`s, 0p, 0p);
     127                // ret = splice(ans_fd, &offset, pipe[1], 0p, count, sflags);
    110128                if( ret < 0 ) {
    111129                        if( errno != EAGAIN && errno != EWOULDBLOCK) continue SPLICE1;
     
    117135                size_t in_pipe = ret;
    118136                SPLICE2: while(in_pipe > 0) {
    119                         ret = cfa_splice(pipe[0], 0p, fd, 0p, in_pipe, SPLICE_F_MOVE | SPLICE_F_MORE, 0, -1`s, 0p, 0p);
     137                        ret = cfa_splice(pipe[0], 0p, fd, 0p, in_pipe, sflags, 0, -1`s, 0p, 0p);
     138                        // ret = splice(pipe[0], 0p, fd, 0p, in_pipe, sflags);
    120139                        if( ret < 0 ) {
    121140                                if( errno != EAGAIN && errno != EWOULDBLOCK) continue SPLICE2;
     
    127146        }
    128147}
     148
     149//=============================================================================================
     150
     151#include <clock.hfa>
     152#include <time.hfa>
     153#include <thread.hfa>
     154
     155struct date_buffer {
     156        char buff[100];
     157};
     158
     159thread DateFormater {
     160        int idx;
     161        date_buffer buffers[2];
     162};
     163
     164void ?{}( DateFormater & this ) {
     165        ((thread&)this){ "Server Date Thread", *options.clopts.instance };
     166        this.idx = 0;
     167        memset( this.buffers[0].buff, 0, sizeof(this.buffers[0]) );
     168        memset( this.buffers[1].buff, 0, sizeof(this.buffers[1]) );
     169}
     170
     171void main(DateFormater & this) {
     172        LOOP: for() {
     173                waitfor( ^?{} : this) {
     174                        break LOOP;
     175                }
     176                or else {}
     177
     178                Time now = getTimeNsec();
     179
     180                strftime( this.buffers[this.idx].buff, 100, "%a, %d %b %Y %H:%M:%S %Z", now );
     181
     182                char * next = this.buffers[this.idx].buff;
     183                __atomic_exchange_n((char * volatile *)&date, next, __ATOMIC_SEQ_CST);
     184                this.idx = (this.idx + 1) % 2;
     185
     186                sleep(1`s);
     187        }
     188}
     189
     190//=============================================================================================
     191DateFormater * the_date_formatter;
     192
     193void init_protocol(void) {
     194        the_date_formatter = alloc();
     195        (*the_date_formatter){};
     196}
     197
     198void deinit_protocol(void) {
     199        ^(*the_date_formatter){};
     200        free( the_date_formatter );
     201}
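
    The new DateFormater thread above formats the date once per second into one of two buffers and publishes it with an atomic pointer swap, so the per-request answer_header only reads the date pointer and never calls strftime. A rough plain-C equivalent of that double-buffer publication (a pthread-style entry point stands in for the CFA thread; names are illustrative):

        #include <time.h>
        #include <unistd.h>

        static char buffers[2][100];
        static const char * volatile date;   // read by request handlers

        // Body of the date thread: format into the idle buffer, then swap.
        static void *date_main(void *unused) {
                (void)unused;
                for (int idx = 0;; idx = (idx + 1) % 2) {
                        time_t now = time(0);
                        struct tm tm;
                        gmtime_r(&now, &tm);
                        strftime(buffers[idx], sizeof(buffers[idx]),
                                 "%a, %d %b %Y %H:%M:%S %Z", &tm);
                        // Publish: readers see either the old or the new string.
                        __atomic_store_n(&date, buffers[idx], __ATOMIC_SEQ_CST);
                        sleep(1);
                }
                return 0;
        }

    Double buffering gives a reader a full second to finish with the old pointer before that buffer is rewritten; a reader that holds the pointer longer can observe a mixed string, a benign race the original code also accepts.
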
  • benchmark/io/http/protocol.hfa

    r4468a70 r342af53  
    11#pragma once
     2
     3struct io_cancellation;
    24
    35enum HttpCode {
     
    1416int answer_error( int fd, HttpCode code );
    1517int answer_header( int fd, size_t size );
     18int answer_plain( int fd, char buffer [], size_t size );
     19int answer_empty( int fd );
    1620
    17 [HttpCode code, bool closed, * const char file, size_t len] http_read(int fd, []char buffer, size_t len);
     21[HttpCode code, bool closed, * const char file, size_t len] http_read(int fd, []char buffer, size_t len, io_cancellation *);
    1822
    1923void sendfile( int pipe[2], int fd, int ans_fd, size_t count );
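
    The sendfile declared above (implemented in protocol.cfa with cfa_splice) moves file bytes to the socket through a kernel pipe, avoiding user-space copies. A stripped-down plain-C sketch of that two-stage splice loop, with the EAGAIN retries and EOF handling of the real code reduced to aborts:

        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <stdlib.h>
        #include <unistd.h>

        // Move `count` bytes from file `ans_fd` to socket `fd` via `pipefd`.
        static void sendfile_splice(int pipefd[2], int fd, int ans_fd, size_t count) {
                off_t offset = 0;
                while (count > 0) {
                        // Stage 1: file -> pipe.
                        ssize_t in = splice(ans_fd, &offset, pipefd[1], 0, count, SPLICE_F_MOVE);
                        if (in <= 0) abort();   // real code retries EAGAIN and handles EOF
                        count -= (size_t)in;
                        // Stage 2: pipe -> socket, draining what stage 1 buffered.
                        while (in > 0) {
                                ssize_t out = splice(pipefd[0], 0, fd, 0, (size_t)in, SPLICE_F_MOVE);
                                if (out < 0) abort();
                                in -= out;
                        }
                }
        }
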
  • benchmark/io/http/worker.cfa

    r4468a70 r342af53  
    1919        this.pipe[0] = -1;
    2020        this.pipe[1] = -1;
     21        this.done = false;
     22}
     23
     24extern "C" {
     25extern int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
    2126}
    2227
     
    2833        CONNECTION:
    2934        for() {
    30                 int fd = cfa_accept4( this.[sockfd, addr, addrlen, flags], 0, -1`s, 0p, 0p );
     35                if( options.log ) printf("=== Accepting connection ===\n");
     36                int fd = cfa_accept4( this.[sockfd, addr, addrlen, flags], 0, -1`s, &this.cancel, 0p );
     37                // int fd = accept4( this.[sockfd, addr, addrlen, flags] );
    3138                if(fd < 0) {
    3239                        if( errno == ECONNABORTED ) break;
     40                        if( errno == EINVAL && this.done ) break;
    3341                        abort( "accept error: (%d) %s\n", (int)errno, strerror(errno) );
    3442                }
    3543
    36                 printf("New connection %d, waiting for requests\n", fd);
     44                if( options.log ) printf("=== New connection %d, waiting for requests ===\n", fd);
    3745                REQUEST:
    3846                for() {
     
    4553                        size_t len = options.socket.buflen;
    4654                        char buffer[len];
    47                         printf("Reading request\n");
    48                         [code, closed, file, name_size] = http_read(fd, buffer, len);
     55                        if( options.log ) printf("=== Reading request ===\n");
     56                        [code, closed, file, name_size] = http_read(fd, buffer, len, &this.cancel);
    4957
    5058                        // if we are done, break out of the loop
    5159                        if( closed ) {
    52                                 printf("Connection closed\n");
     60                                if( options.log ) printf("=== Connection closed ===\n");
     61                                close(fd);
    5362                                continue CONNECTION;
    5463                        }
     
    5665                        // If this wasn't a request, return 400
    5766                        if( code != OK200 ) {
    58                                 printf("Invalid Request : %d\n", code_val(code));
     67                                printf("=== Invalid Request : %d ===\n", code_val(code));
    5968                                answer_error(fd, code);
    6069                                continue REQUEST;
    6170                        }
    6271
    63                         printf("Request for file %.*s\n", (int)name_size, file);
     72                        if(0 == strncmp(file, "plaintext", min(name_size, sizeof("plaintext") ))) {
     73                                if( options.log ) printf("=== Request for /plaintext ===\n");
     74
     75                                char text[] = "Hello, World!\n";
     76
     77                                // Send the header
     78                                answer_plain(fd, text, sizeof(text));
     79
     80                                if( options.log ) printf("=== Answer sent ===\n");
     81                                continue REQUEST;
     82                        }
     83
     84                        if(0 == strncmp(file, "ping", min(name_size, sizeof("ping") ))) {
     85                                if( options.log ) printf("=== Request for /ping ===\n");
     86
     87                                // Send the header
     88                                answer_empty(fd);
     89
     90                                if( options.log ) printf("=== Answer sent ===\n");
     91                                continue REQUEST;
     92                        }
     93
     94                        if( options.log ) printf("=== Request for file %.*s ===\n", (int)name_size, file);
    6495
    6596                        // Get the fd from the file cache
     
    70101                        // If we can't find the file, return 404
    71102                        if( ans_fd < 0 ) {
    72                                 printf("File Not Found\n");
     103                                printf("=== File Not Found ===\n");
    73104                                answer_error(fd, E404);
    74105                                continue REQUEST;
     
    81112                        sendfile( this.pipe, fd, ans_fd, count);
    82113
    83                         printf("File sent\n");
     114                        if( options.log ) printf("=== Answer sent ===\n");
    84115                }
    85116        }
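
    The routing added above special-cases /plaintext and /ping by comparing a prefix of the parsed path before falling back to the file cache. A condensed plain-C sketch of that dispatch (the enum and helper names are illustrative; the original compares against sizeof, which also matches the terminating NUL):

        #include <string.h>

        enum route { ROUTE_PLAINTEXT, ROUTE_PING, ROUTE_FILE };

        static size_t min_sz(size_t a, size_t b) { return a < b ? a : b; }

        // `file` points into the request buffer; `name_size` is the path length.
        static enum route match_route(const char *file, size_t name_size) {
                if (0 == strncmp(file, "plaintext", min_sz(name_size, strlen("plaintext"))))
                        return ROUTE_PLAINTEXT;
                if (0 == strncmp(file, "ping", min_sz(name_size, strlen("ping"))))
                        return ROUTE_PING;
                return ROUTE_FILE;   // everything else goes through the file cache
        }

    With the new log flag guarding every printf, the hot path stays quiet; the same hunk also threads the worker's io_cancellation through cfa_accept4 and http_read so a worker blocked in accept can be cancelled at shutdown.
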
  • benchmark/io/http/worker.hfa

    r4468a70 r342af53  
    1717        socklen_t * addrlen;
    1818        int flags;
     19        io_cancellation cancel;
     20        volatile bool done;
    1921};
    2022void ?{}( Worker & this);
  • benchmark/io/readv.cfa

    r4468a70 r342af53  
    9696
    9797        char **left;
    98         parse_args( opt, opt_cnt, "[OPTIONS]...\ncforall yield benchmark", left );
     98        parse_args( opt, opt_cnt, "[OPTIONS]...\ncforall readv benchmark", left );
    9999
    100100        if(kpollcp || odirect) {
    101101                if( (buflen % 512) != 0 ) {
    102102                        fprintf(stderr, "Buffer length must be a multiple of 512 when using O_DIRECT, was %lu\n\n", buflen);
    103                         print_args_usage(opt, opt_cnt, "[OPTIONS]...\ncforall yield benchmark", true);
     103                        print_args_usage(opt, opt_cnt, "[OPTIONS]...\ncforall readv benchmark", true);
    104104                }
    105105        }
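
    The check above enforces the O_DIRECT constraint that transfer sizes be multiples of the logical block size (512 bytes here). O_DIRECT also requires the buffer itself to be aligned, which a plain malloc does not guarantee; a small hedged sketch of a compliant allocation:

        #include <stdio.h>
        #include <stdlib.h>

        // Allocate a buffer usable with O_DIRECT; returns NULL on failure.
        static void *alloc_direct(size_t buflen) {
                if (buflen % 512 != 0) {
                        fprintf(stderr, "Buffer length must be a multiple of 512 when using O_DIRECT, was %zu\n", buflen);
                        return NULL;
                }
                void *buf = NULL;
                // 512-byte alignment suits most devices; some require 4096.
                if (posix_memalign(&buf, 512, buflen) != 0) return NULL;
                return buf;
        }
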
  • doc/bibliography/pl.bib

    r4468a70 r342af53  
    990990}
    991991
     992@techreport{cfa-cc,
     993    keywords    = {Cforall, cfa-cc, transpiler},
     994    contributer = {pabuhr@plg},
     995    title       = {{\textsf{cfa-cc}} Developer's Reference Manual},
     996    author      = {Fangren Yu},
     997    institution = {School of Computer Science},
     998    address     = {University of Waterloo, Waterloo, Ontario, Canada},
     999    month       = aug,
     1000    year        = {2020},
     1001    note        = {\href{https://cforall.uwaterloo.ca/doc/Fangren_Yu_Report_S20.pdf}{https://\-cforall.uwaterloo.ca/\-doc/\-Fangren\_Yu\_Report\_S20.pdf}},
     1002}
     1003
    9921004@article{Moss18,
    9931005    keywords    = {type systems, polymorphism, tuples, Cforall},
     
    10401052    keywords    = {type system, generic type, resolution algorithm, type environment, Cforall},
    10411053    author      = {Aaron Moss},
    1042     title       = {\textsf{C}$\mathbf{\forall}$ Type System Implementation},
     1054    title       = {\textsf{C}\,$\mathbf{\forall}$ Type System Implementation},
    10431055    school      = {School of Computer Science, University of Waterloo},
    10441056    year        = 2019,
     
    11611173    keywords    = {ctrie, concurrent map},
    11621174    contributer = {a3moss@uwaterloo.ca},
    1163     title       ={Cache-aware lock-free concurrent hash tries},
    1164     author      ={Prokopec, Aleksandar and Bagwell, Phil and Odersky, Martin},
    1165     institution ={EPFL},
    1166     year        ={2011}
     1175    title       = {Cache-aware lock-free concurrent hash tries},
     1176    author      = {Prokopec, Aleksandar and Bagwell, Phil and Odersky, Martin},
     1177    institution = {EPFL},
     1178    year        = {2011}
    11671179}
    11681180
     
    39283940    school      = {School of Computer Science, University of Waterloo},
    39293941    year        = 2003,
    3930     address     = {Waterloo, Ontario, Canada, N2L 3G1},
     3942    optaddress  = {Waterloo, Ontario, Canada, N2L 3G1},
    39313943    note        = {\href{http://plg.uwaterloo.ca/theses/BilsonThesis.pdf}{http://\-plg.uwaterloo.ca/\-theses/\-BilsonThesis.pdf}},
    39323944}
     
    52105222}
    52115223
     5224@manual{gcc-nested-func,
     5225    keywords    = {gcc nested functions},
     5226    contributer = {pabuhr@plg},
     5227    key         = {gcc nested functions},
     5228    title       = {Nested Functions},
     5229    organization= {{gcc} 9.3 Manual},
     5230    year        = 2019,
     5231    note        = {\href{https://gcc.gnu.org/onlinedocs/gcc-9.3.0/gcc/Nested-Functions.html}{https://\-gcc.gnu.org/\-onlinedocs/\-gcc-9.3.0/\-gcc/\-Nested-Functions.html}},
     5232}
     5233
    52125234@article{Haddon77,
    52135235    keywords    = {monitors, nested monitor calls},
     
    74137435}
    74147436
     7437@manual{POSIX17,
     7438    keywords    = {POSIX, Standard},
     7439    contributer = {pabuhr@plg},
     7440    key         = {POSIX},
     7441    title       = {1003.1 Standard for Information Technology -- Portable Operating System Interface (POSIX), Base Specifications, Issue 7},
     7442    organization= {IEEE and The Open Group},
     7443    year        = 2017,
     7444    note        = {\href{https://pubs.opengroup.org/onlinepubs/9699919799}{https://\-pubs.opengroup.org/\-onlinepubs/\-9699919799}},
     7445}
     7446
    74157447@inproceedings{ML:NJ,
    74167448    keywords    = {continuations, ML},
  • doc/theses/andrew_beach_MMath/existing.tex

    r4468a70 r342af53  
    8383the call site.
    8484
    85 As an example, even if no function named \codeCFA{do\_once} is not defined
    86 near the definition of \codeCFA{do\_twice} the following code will work.
     85As an example, even if no function named \codeCFA{do_once} is defined
     86near the definition of \codeCFA{do_twice}, the following code will work.
    8787\begin{lstlisting}
    8888int quadruple(int x) {
     
    9595\end{lstlisting}
    9696This is not the recommended way to implement a quadruple function but it
    97 does work. The complier will deduce that \codeCFA{do\_twice}'s T is an
     97does work. The compiler will deduce that \codeCFA{do_twice}'s T is an
    9898integer from the argument. It will then look for a definition matching the
    99 assertion which is the \codeCFA{do\_once} defined within the function. That
    100 function will be passed in as a function pointer to \codeCFA{do\_twice} and
     99assertion which is the \codeCFA{do_once} defined within the function. That
     100function will be passed in as a function pointer to \codeCFA{do_twice} and
    101101called within it.
    102102
     
    156156In \CFA coroutines are created using the \codeCFA{coroutine} keyword which
    157157works just like \codeCFA{struct} except that the created structure will be
    158 modified by the compiler to satify the \codeCFA{is\_coroutine} trait.
     158modified by the compiler to satisfy the \codeCFA{is_coroutine} trait.
    159159
    160160These structures act as the interface between callers and the coroutine,
  • doc/theses/fangren_yu_COOP_S20/Report.tex

    r4468a70 r342af53  
    5656
    5757\title{\Huge
    58 cfa-cc Developer's Reference
     58\lstinline|cfa-cc| Developer's Reference
    5959}% title
    6060
     
    728728The \CFA compiler sets a limit on assertion depth and reports an error if assertion resolution does not terminate within the limit (as for \lstinline[language=C++]@templates@ in \CC).
    729729
     730\addcontentsline{toc}{section}{\refname}
    730731\bibliographystyle{plain}
    731732\bibliography{pl}
  • doc/theses/thierry_delisle_PhD/thesis/Makefile

    r4468a70 r342af53  
    88BibTeX = BIBINPUTS=${TeXLIB} && export BIBINPUTS && bibtex
    99
    10 MAKEFLAGS = --no-print-directory --silent #
     10MAKEFLAGS = --no-print-directory --silent
    1111VPATH = ${Build} ${Figures}
    1212
     
    6666
    6767build/%.dvi : %.tex Makefile | ${Build}
     68        # Conditionally create an empty *.ind (index) file for inclusion until makeindex is run.
     69        if [ ! -r ${basename $@}.ind ] ; then touch ${basename $@}.ind ; fi
    6870        # Must have *.aux file containing citations for bibtex
    6971        if [ ! -r ${basename $@}.aux ] ; then ${LaTeX} $< ; fi
     
    7476        # Make index from *.aux entries and input index at end of document
    7577        -makeglossaries -q -s ${basename $@}.ist ${basename $@}
     78        # Make index from *.aux entries and input index at end of document
     79        -makeindex ${basename $@}.idx
    7680        # Run again to finish citations
    7781        ${LaTeX} $<
  • doc/theses/thierry_delisle_PhD/thesis/text/core.tex

    r4468a70 r342af53  
    11\chapter{Scheduling Core}\label{core}
    22
    3 Before discussing scheduling in general, where it is important to address systems that are changing states, this document discusses scheduling in a somewhat ideal scenerio, where the system has reached a steady state. For this purpose, a steady state is loosely defined as a state where there are always \glspl{thrd} ready to run and the system has the ressources necessary to accomplish the work, \eg, enough workers. In short, the system is neither overloaded nor underloaded.
     3Before discussing scheduling in general, where it is important to address systems that are changing states, this document discusses scheduling in a somewhat ideal scenario, where the system has reached a steady state. For this purpose, a steady state is loosely defined as a state where there are always \glspl{thrd} ready to run and the system has the resources necessary to accomplish the work, \eg, enough workers. In short, the system is neither overloaded nor underloaded.
    44
    5 I believe it is important to discuss the steady state first because it is the easiest case to handle and, relatedly, the case in which the best performance is to be expected. As such, when the system is either overloaded or underloaded, a common approach is to try to adapt the system to the new load and return to the steady state, \eg, adding or removing workers. Flaws in the scheduling when the system is in the steady state can therefore to be pervasive in all states.
     5I believe it is important to discuss the steady state first because it is the easiest case to handle and, relatedly, the case in which the best performance is to be expected. As such, when the system is either overloaded or underloaded, a common approach is to try to adapt the system to this new load and return to the steady state, \eg, by adding or removing workers. Therefore, flaws in scheduling the steady state can be pervasive in all states.
    66
    77\section{Design Goals}
    8 As with most of the design decisions behind \CFA, an important goal is to match the expectation of the programmer, according to their probable mental model. To match these expectations, the design must offer the programmers sufficient guarantees so that, as long as they respect the mental model, the system will also respect this model.
     8As with most of the design decisions behind \CFA, an important goal is to match the expectation of the programmer according to their execution mental-model. To match expectations, the design must offer the programmer sufficient guarantees so that, as long as they respect the execution mental-model, the system also respects this model.
    99
    10 For threading, a simple and common mental model is the ``Ideal multi-tasking CPU'' :
     10For threading, a simple and common execution mental-model is the ``Ideal multi-tasking CPU'':
    1111
    1212\begin{displayquote}[Linux CFS\cit{https://www.kernel.org/doc/Documentation/scheduler/sched-design-CFS.txt}]
    1313        {[The]} ``Ideal multi-tasking CPU'' is a (non-existent  :-)) CPU that has 100\% physical power and which can run each task at precise equal speed, in parallel, each at [an equal fraction of the] speed.  For example: if there are 2 tasks running, then it runs each at 50\% physical power --- i.e., actually in parallel.
     14        \label{q:LinuxCFS}
    1415\end{displayquote}
    1516
    1617Applied to threads, this model states that every ready \gls{thrd} immediately runs in parallel with all other ready \glspl{thrd}. While a strict implementation of this model is not feasible, programmers still have expectations about scheduling that come from this model.
    1718
    18 In general, the expectation at the center of this model is that ready \glspl{thrd} do not interfere with eachother but simply share the hardware. This makes it easier to reason about threading because ready \glspl{thrd} can be taken in isolation and the effect of the scheduler can be virtually ignored. This expectation of \gls{thrd} independence means the scheduler is expected to offer two guarantees:
     19In general, the expectation at the center of this model is that ready \glspl{thrd} do not interfere with each other but simply share the hardware. This assumption makes it easier to reason about threading because ready \glspl{thrd} can be thought of in isolation and the effect of the scheduler can be virtually ignored. This expectation of \gls{thrd} independence means the scheduler is expected to offer two guarantees:
    1920\begin{enumerate}
    20         \item A fairness guarantee: a \gls{thrd} that is ready to run will not be prevented to do so by another thread.
    21         \item A performance guarantee: a \gls{thrd} that wants to start or stop running will not be slowed down by other threads wanting to do the same.
     21        \item A fairness guarantee: a \gls{thrd} that is ready to run is not prevented from doing so by another thread.
     22        \item A performance guarantee: a \gls{thrd} that wants to start or stop running is not prevented from doing so by other threads wanting to do the same.
    2223\end{enumerate}
    2324
    24 It is important to note that these guarantees are expected only up to a point. \Glspl{thrd} that are ready to run should not be prevented to do so, but they still need to share a limited amount of hardware. Therefore, the guarantee is considered respected if a \gls{thrd} gets access to a \emph{fair share} of the hardware, even if that share is very small.
     25It is important to note that these guarantees are expected only up to a point. \Glspl{thrd} that are ready to run should not be prevented from doing so, but they still share the limited hardware resources. Therefore, the guarantee is considered respected if a \gls{thrd} gets access to a \emph{fair share} of the hardware resources, even if that share is very small.
    2526
    26 Similarly the performance guarantee, the lack of interferance between threads, is only relevant up to a point. Ideally the cost of running and blocking would be constant regardless of contention, but the guarantee is considered satisfied if the cost is not \emph{too high} with or without contention. How much is an acceptable cost is obviously highly variable. For this document the performance experimentation will attempt to show that the cost of scheduling is at worst equivalent to existing algorithms used in popular languages. This demonstration can be made by comparing application built in \CFA to applications built with other languages or other models. Recall from a few paragraphs ago that the expectation of programmers is that the impact of the scheduler can be ignored. Therefore, if the cost of scheduling is equivalent or lower to other popular languages, I will consider the guarantee achieved.
     27Similarly, the performance guarantee, the lack of interference among threads, is only relevant up to a point. Ideally, the cost of running and blocking should be constant regardless of contention, but the guarantee is considered satisfied if the cost is not \emph{too high} with or without contention. How much is an acceptable cost is obviously highly variable. For this document, the performance experimentation attempts to show the cost of scheduling is at worst equivalent to existing algorithms used in popular languages. This demonstration can be made by comparing applications built in \CFA to applications built with other languages or other models. Recall that the programmer's expectation is that the impact of the scheduler can be ignored. Therefore, if the cost of scheduling is equivalent to or lower than other popular languages, I consider the guarantee achieved.
    2728
    2829More precisely the scheduler should be:
    2930\begin{itemize}
    3031        \item As fast as other schedulers that are less fair.
    31         \item Faster than other scheduler that have equal or better fairness.
     32        \item Faster than other schedulers that have equal or better fairness.
    3233\end{itemize}
    3334
    3435\subsection{Fairness vs Scheduler Locality}
    35 An important performance factor in modern architectures is cache locality. Waiting for data not present in the cache can have a major impact on performance, and having multiple \glspl{hthrd} writing to the same cache lines can lead to cache lines that need to be waited on again. It is therefore preferable to divide the data among each \gls{hthrd}\footnote{This can be an explicit division up front or using data structures where different \glspl{hthrd} are naturally routed to different cache lines.}.
     36An important performance factor in modern architectures is cache locality. Waiting for data that sits in lower cache levels, or is not present in the cache at all, can have a major impact on performance. Having multiple \glspl{hthrd} writing to the same cache lines also leads to cache lines that must be waited on. It is therefore preferable to divide data among each \gls{hthrd}\footnote{This partitioning can be an explicit division up front or using data structures where different \glspl{hthrd} are naturally routed to different cache lines.}.
    3637
    37 For a scheduler, having good locality\footnote{This section discusses \emph{internal} locality, \ie, the locality of the data used by the scheduler. \emph{External locality}, \ie, how the data used by the application is affected by scheduling, is a much more complicated subject and will be discussed in the chapters on evaluation.}, \ie, having the data be local to each \gls{hthrd}, generally conflicts with fairness. Indeed, good locality often requires avoiding the movement of cache lines, while fairness requires dynamically moving \gls{thrd}, and as a consequence cache lines, to \glspl{hthrd} that are currently more appropriate.
     38For a scheduler, having good locality\footnote{This section discusses \emph{internal locality}, \ie, the locality of the data used by the scheduler versus \emph{external locality}, \ie, how the data used by the application is affected by scheduling. External locality is a much more complicated subject and is discussed in part~\ref{Evaluation} on evaluation.}, \ie, having the data local to each \gls{hthrd}, generally conflicts with fairness. Indeed, good locality often requires avoiding the movement of cache lines, while fairness requires dynamically moving a \gls{thrd}, and as a consequence cache lines, to a \gls{hthrd} that is currently available.
    3839
    39 However, I claim that in practice it is possible to strike a balance between fairness and performance because the need for these do not necessarily overlap temporaly. Figure~\ref{fig:fair} shows an visual representation of this behaviour. As mentionned, a little bit of unfairness can be acceptable, therefore it can be desirable to have an algorithm that prioritizes cache locality as long as no threads is left behind for too long.
     40However, I claim that in practice it is possible to strike a balance between fairness and performance because these goals do not necessarily overlap temporally, where Figure~\ref{fig:fair} shows a visual representation of this behaviour. As mentioned, some unfairness is acceptable; therefore it is desirable to have an algorithm that prioritizes cache locality as long as thread delay does not exceed the execution mental-model.
    4041
    4142\begin{figure}
    42         \begin{center}
    43                 \input{fairness.pstex_t}
    44         \end{center}
    45         \caption{Fairness vs Locality}
     43        \centering
     44        \input{fairness.pstex_t}
     45        \vspace*{-10pt}
     46        \caption[Fairness vs Locality graph]{Rule of thumb Fairness vs Locality graph \smallskip\newline The importance of Fairness and Locality while a ready \gls{thrd} awaits running is shown: as the time the ready \gls{thrd} waits (Ready Time) increases, the chance that its data is still in cache (Locality) decreases. At the same time, the need for fairness increases since other \glspl{thrd} may have the chance to run many times, breaking the fairness model. Since the actual values and curves of this graph can be highly variable, the graph is an idealized representation of the two opposing goals.}
    4647        \label{fig:fair}
    47         Rule of thumb graph: Importance of Fairness and Locality while a ready \gls{thrd} waits run.
    48         As the time a ready \gls{thrd} waits increases, ``Ready Time'', the chances that its data is still in cache decreases. At the same time, the need for fairness increases since other \glspl{thrd} may have the chance to run many times, breaking the fairness model mentionned above. Since the actual values and curves of this graph can be highly variable, the graph is left intentionally fuzzy and innacurate.
    4948\end{figure}
    5049
    5150\section{Design}
    52 A naive strictly \glsxtrshort{fifo} ready-queue does not offer sufficient performance. As shown in the evaluation sections, most production schedulers scale when adding multiple \glspl{hthrd} and that is not possible with a single point of contention. Therefore it is vital to shard the ready-queue so that multiple \glspl{hthrd} can access the ready-queue without performance degradation.
     51In general, a na\"{i}ve \glsxtrshort{fifo} ready-queue does not scale with increased parallelism from \glspl{hthrd}, resulting in decreased performance. The problem is that adding/removing \glspl{thrd} is a single point of contention. As shown in the evaluation sections, most production schedulers do scale when adding \glspl{hthrd}. The common solution to the single point of contention is to shard the ready-queue so each \gls{hthrd} can access the ready-queue without contention, increasing performance through lack of contention.
    5352
    5453\subsection{Sharding} \label{sec:sharding}
    55 An interesting approach to sharding a queue is presented in \cit{Trevors paper}. This algorithm represents a queue with relaxed \glsxtrshort{fifo} guarantee using an array of strictly \glsxtrshort{fifo} sublists as shown in Figure~\ref{fig:base}. Each cell of the array contains a linked-list with a lock and each node in these list is marked with a timestamp indicating when they were added to the list. Push operations are done by picking a random cell and attempting to push to its list. If the cell is already locked, the operation is simply retried on a new cell until a lock is acquired. Pop operations are done in a similar fashion except two random cells are picked. If both cells are not already locked and both cells contain non-empty lists, the operation pops the node with the oldest timestamp. If only one of the cell is unlocked and non-empty, the operation pops from that cell. If both cells are either locked or empty, the operation picks two new cells and tries again.
     54An interesting approach to sharding a queue is presented in \cit{Trevors paper}. This algorithm presents a queue with a relaxed \glsxtrshort{fifo} guarantee using an array of strictly \glsxtrshort{fifo} sublists as shown in Figure~\ref{fig:base}. Each \emph{cell} of the array has a timestamp for the last operation and a pointer to a linked-list with a lock and each node in the list is marked with a timestamp indicating when it is added to the list. A push operation is done by picking a random cell, acquiring the list lock, and pushing to the list. If the cell is locked, the operation is simply retried on another random cell until a lock is acquired. A pop operation is done in a similar fashion except two random cells are picked. If both cells are unlocked with non-empty lists, the operation pops the node with the oldest cell timestamp. If one of the cells is unlocked and non-empty, the operation pops from that cell. If both cells are either locked or empty, the operation picks two new random cells and tries again.
    5655
    5756\begin{figure}
    58         \begin{center}
    59                 \input{base.pstex_t}
    60         \end{center}
    61         \caption{Relaxed FIFO list}
     57        \centering
     58        \input{base.pstex_t}
     59        \caption[Relaxed FIFO list]{Relaxed FIFO list \smallskip\newline List at the base of the scheduler: an array of strictly FIFO lists. The timestamp is in all nodes and cell arrays.}
    6260        \label{fig:base}
    63         List at the base of the scheduler: an array of strictly FIFO lists.
    64         The timestamp is in all nodes and cell arrays.
    6561\end{figure}
    6662
    6763\subsection{Finding threads}
    68 Once threads have been distributed onto multiple queues, indentifying which queues are empty and which are not can become a problem. Indeed, if the number of \glspl{thrd} does not far exceed the number of queues, it is probable that several of these queues are empty. Figure~\ref{fig:empty} shows an example with 2 \glspl{thrd} running on 8 queues, where the chances of getting an empty queue is 75\% per pick, meaning two random picks yield a \gls{thrd} only half the time.
    69 
     64Once threads have been distributed onto multiple queues, identifying empty queues becomes a problem. Indeed, if the number of \glspl{thrd} does not far exceed the number of queues, it is probable that several of the cell queues are empty. Figure~\ref{fig:empty} shows an example with 2 \glspl{thrd} running on 8 queues, where the chances of getting an empty queue is 75\% per pick, meaning two random picks yield a \gls{thrd} only half the time. This scenario leads to performance problems since picks that do not yield a \gls{thrd} are not useful and do not necessarily help make more informed guesses.
    7065
    7166\begin{figure}
    72         \begin{center}
    73                 \input{empty.pstex_t}
    74         \end{center}
    75         \caption{``More empty'' Relaxed FIFO list}
     67        \centering
     68        \input{empty.pstex_t}
     69        \caption[``More empty'' Relaxed FIFO list]{``More empty'' Relaxed FIFO list \smallskip\newline Emptier state of the queue: the array contains many empty cells, that is strictly FIFO lists containing no elements.}
    7670        \label{fig:empty}
    77         Emptier state of the queue: the array contains many empty cells, that is strictly FIFO lists containing no elements.
    7871\end{figure}
    7972
    80 This can lead to performance problems since picks that do not yield a \gls{thrd} are not useful and do not necessarily help make more informed guesses.
     73There are several solutions to this problem, but they ultimately all have to encode whether a cell has an empty list. My results show the density and locality of this encoding is generally the dominating factor in these schemes. Classic solutions to this problem use one of three techniques to encode the information:
    8174
    82 Solutions to this problem can take many forms, but they ultimately all have to encode where the threads are in some form. My results show that the density and locality of this encoding is generally the dominating factor in these scheme. Classic solutions to this problem use one of three techniques to encode the information:
     75\paragraph{Dense Information} Figure~\ref{fig:emptybit} shows a dense bitmask to identify the cell queues currently in use. This approach means processors can often find \glspl{thrd} in constant time, regardless of how many underlying queues are empty. Furthermore, modern x86 CPUs have extended bit manipulation instructions (BMI2) that allow searching the bitmask with very little overhead compared to the randomized selection approach for a filled ready queue, offering good performance even in cases with many empty inner queues. However, this technique has its limits: with a single word\footnote{Word refers here to however many bits can be written atomically.} bitmask, the total amount of ready-queue sharding is limited to the number of bits in the word. With a multi-word bitmask, this maximum limit can be increased arbitrarily, but the look-up time increases. Finally, a dense bitmap, either single or multi-word, causes additional contention problems that reduce performance because of cache misses after updates. This central update bottleneck also means the information in the bitmask is more often stale before a processor can use it to find an item, \ie the mask read says there are available \glspl{thrd} but none are on the queue when the subsequent atomic check is done.
    8376
    8477\begin{figure}
    85         \begin{center}
    86                 {\resizebox{0.73\textwidth}{!}{\input{emptybit.pstex_t}}}
    87         \end{center}
     78        \centering
    8879        \vspace*{-5pt}
    89         \caption{Underloaded queue with added bitmask to indicate which array cells have items.}
     80        {\resizebox{0.75\textwidth}{!}{\input{emptybit.pstex_t}}}
     81        \vspace*{-5pt}
     82        \caption[Underloaded queue with bitmask]{Underloaded queue with bitmask indicating array cells with items.}
    9083        \label{fig:emptybit}
    91         \begin{center}
    92                 {\resizebox{0.73\textwidth}{!}{\input{emptytree.pstex_t}}}
    93         \end{center}
     84
     85        \vspace*{10pt}
     86        {\resizebox{0.75\textwidth}{!}{\input{emptytree.pstex_t}}}
    9487        \vspace*{-5pt}
    95         \caption{Underloaded queue with added binary search tree indicate which array cells have items.}
     88        \caption[Underloaded queue with binary search-tree]{Underloaded queue with binary search-tree indicating array cells with items.}
    9689        \label{fig:emptytree}
    97         \begin{center}
    98                 {\resizebox{0.9\textwidth}{!}{\input{emptytls.pstex_t}}}
    99         \end{center}
     90
     91        \vspace*{10pt}
     92        {\resizebox{0.95\textwidth}{!}{\input{emptytls.pstex_t}}}
    10093        \vspace*{-5pt}
    101         \caption{Underloaded queue with added per processor bitmask to indicate which array cells have items.}
     94        \caption[Underloaded queue with per processor bitmask]{Underloaded queue with per processor bitmask indicating array cells with items.}
    10295        \label{fig:emptytls}
    10396\end{figure}
    10497
    105 \paragraph{Dense Information} Figure~\ref{fig:emptybit} shows a dense bitmask to identify which inner queues are currently in use. This approach means processors can often find \glspl{thrd} in constant time, regardless of how many underlying queues are empty. Furthermore, modern x86 CPUs have extended bit manipulation instructions (BMI2) that allow using the bitmask with very little overhead compared to the randomized selection approach for a filled ready queue, offering good performance even in cases with many empty inner queues. However, this technique has its limits: with a single word\footnote{Word refers here to however many bits can be written atomically.} bitmask, the total number of underlying queues in the ready queue is limited to the number of bits in the word. With a multi-word bitmask, this maximum limit can be increased arbitrarily, but the look-up will nolonger be constant time. Finally, a dense bitmap, either single or multi-word, causes additional contention problems which reduces performance because of cache misses after updates. This central update bottleneck also means the information in the bitmask is more often stale before a processor can use it to find an item, \ie mask read says there are available \glspl{thrd} but none on queue.
     98\paragraph{Sparse Information} Figure~\ref{fig:emptytree} shows an approach using a hierarchical tree data-structure to reduce contention and has been shown to work in similar cases~\cite{ellen2007snzi}. However, this approach may lead to poorer performance due to the inherent pointer chasing cost while still allowing significant contention on the nodes of the tree if the tree is shallow.
    10699
    107 \paragraph{Sparse Information} Figure~\ref{fig:emptytree} shows an approach using a hierarchical tree data-structure to reduce contention and has been shown to work in similar cases~\cite{ellen2007snzi}. However, this approach may lead to poorer performance due to the inherent pointer chasing cost while still allowing more contention on the nodes of the tree if the tree is not deep enough.
     100\paragraph{Local Information} Figure~\ref{fig:emptytls} shows an approach using dense information, similar to the bitmap, but each \gls{hthrd} keeps its own independent copy. While this approach can offer good scalability \emph{and} low latency, the liveliness and discovery of the information can become a problem. This case is made worse in systems with few processors where even blind random picks can find \glspl{thrd} in a few tries.
    108101
    109 \paragraph{Local Information} Figure~\ref{fig:emptytls} shows an approach using dense information, similar to the bitmap, but have each thread keep its own independent copy of it. While this approach can offer good scalability \emph{and} low latency, the liveliness and discovery of the information can become a problem. This case is made worst in systems with few processors where even blind random picks can find \glspl{thrd} in few tries.
    110 
    111 I built a prototype of these approach and none of these techniques offer satisfying performance when few threads are present. All of these approach hit the same 2 problems. First, blindly picking two sub-queues is very fast which means that any improvement to the hit rate can easily be countered by a slow-down in look-up speed. Second, the array is already as sharded as is needed to avoid contention bottlenecks, so any denser data structure will tend to become a bottleneck. In all cases, these factors meant that the best cases scenerio, many threads, would get worst throughput and the worst case scenario, few threads, would get a better hit rate, but an equivalent throughput. As a result I tried an entirely different approach.
     102I built a prototype of these approaches and none of these techniques offer satisfying performance when few threads are present. All of these approaches hit the same two problems. First, randomly picking sub-queues is very fast, which means any improvement to the hit rate can easily be countered by a slow-down in look-up speed when there are empty lists. Second, the array is already sharded enough to avoid contention bottlenecks, so any denser data structure tends to become a bottleneck. In all cases, these factors meant the best-case scenario, \ie many threads, would get worse throughput, and the worst-case scenario, few threads, would get a better hit rate but an equivalently poor throughput. As a result I tried an entirely different approach.
    112103
    113104\subsection{Dynamic Entropy}\cit{https://xkcd.com/2318/}
    114 In the worst case scenario there are few \glspl{thrd} ready to run, or more accuratly given $P$ \glspl{proc}, $T$ \glspl{thrd} and $\epsilon$, as usual, a very small number, in this case $\epsilon \ll P$, we have $T = P + \epsilon$. An important thing to note is that in this case, fairness is effectively irrelevant. Indeed, this case is close to \emph{actually matching} the model of the ``Ideal multi-tasking CPU'' presented in this chapter\footnote{For simplicity, this assumes there is a one-to-one match between \glspl{proc} and \glspl{hthrd}.}. Therefore, in this context it is possible to use a purely internal locality based approach and still meet the fairness requirements. This approach would simply have each \gls{proc} running a single \gls{thrd} repeatedly. Or from the shared ready-queue viewpoint, each \gls{proc} would push to a given sub-queue and then pop from the \emph{same} subqueue. Ideally, the scheduler would achieve this without affecting the fairness guarantees in cases where $T \gg P$.
     105In the worst-case scenario there are only a few \glspl{thrd} ready to run, or more precisely, given $P$ \glspl{proc}\footnote{For simplicity, this assumes there is a one-to-one match between \glspl{proc} and \glspl{hthrd}.}, $T$ \glspl{thrd} and $\epsilon$ a very small number, the worst-case scenario can be represented by $\epsilon \ll P$ and $T = P + \epsilon$. It is important to note in this case that fairness is effectively irrelevant. Indeed, this case is close to \emph{actually matching} the model of the ``Ideal multi-tasking CPU'' on page \pageref{q:LinuxCFS}. In this context, it is possible to use a purely internal-locality based approach and still meet the fairness requirements. This approach simply has each \gls{proc} running a single \gls{thrd} repeatedly. Or from the shared ready-queue viewpoint, each \gls{proc} pushes to a given sub-queue and then pops from the \emph{same} sub-queue. In cases where $T \gg P$, the scheduler should also achieve similar performance without affecting the fairness guarantees.
    115106
    116 To achieve this, I use a communication channel I have not mentionned yet and which I believe I use in a novel way : the \glsxtrshort{prng}. If the scheduler has a \glsxtrshort{prng} instance per \gls{proc} exclusively used for scheduling, its seed effectively encodes a list of all the accessed subqueues, from the latest to the oldest. The only requirement to achieve this is to be able to ``replay'' the \glsxtrshort{prng} backwards. As it turns out, this is an entirely reasonnable requirement and there already exist \glsxtrshort{prng}s that are fast, compact \emph{and} can be run forward and backwards. Linear congruential generators\cite{wiki:lcg} are an example of \glsxtrshort{prng}s that match these requirements.
     107To handle this case, I use a pseudo-random-number generator, \glsxtrshort{prng}, in a novel way. When the scheduler uses a \glsxtrshort{prng} instance per \gls{proc} exclusively, the random-number seed effectively encodes a list of all accessed sub-queues, from latest to oldest. The novel approach is the ability to ``replay'' the \glsxtrshort{prng} backwards, and there exist \glsxtrshort{prng}s that are fast, compact \emph{and} can be run forwards and backwards. Linear congruential generators~\cite{wiki:lcg} are an example of \glsxtrshort{prng}s that match these requirements.
    117108
    118 The algorithm works as follows :
     109The algorithm works as follows:
    119110\begin{itemize}
    120111        \item Each \gls{proc} has two \glsxtrshort{prng} instances, $F$ and $B$.
    121         \item Push and Pop operations happen as mentionned in Section~\ref{sec:sharding} with the following exceptions:
     112        \item Push and Pop operations occur as discussed in Section~\ref{sec:sharding} with the following exceptions:
    122113        \begin{itemize}
    123114                \item Push operations use $F$ going forward on each try and on success $F$ is copied into $B$.
     
    126117\end{itemize}
    127118
    128 The main benefit of this technique is that it basically repects the desired properties of Figure~\ref{fig:fair}. When looking for work, \glspl{proc} will look first at the last cells they pushed to, if any, and then move backwards through the cells. As the \glspl{proc} continue looking for work, $F$ moves back and $B$ stays in place. As a result the relation between the two becomes weaker, which means that the probablisitic fairness of the algorithm reverts to normal. Chapter~\ref{proofs} discusses more formally the fairness guarantees of this algorithm.
     119The main benefit of this technique is that it basically respects the desired properties of Figure~\ref{fig:fair}. When looking for work, a \gls{proc} first looks at the last cell it pushed to, if any, and then moves backwards through its accessed cells. As the \gls{proc} continues looking for work, $F$ moves backwards and $B$ stays in place. As a result, the relation between the two becomes weaker, which means that the probabilistic fairness of the algorithm reverts to normal. Chapter~\ref{proofs} discusses more formally the fairness guarantees of this algorithm.
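As a concrete illustration of the $F$/$B$ scheme, the following minimal sketch shows a reversible \glsxtrshort{prng} (a hypothetical example built on Knuth's MMIX LCG constants, not the runtime's actual generator): stepping the generator forward models pushes, and replaying it backwards models pops.

	#include <assert.h>
	#include <stdint.h>

	// Knuth's MMIX LCG constants; the modulus 2^64 comes from unsigned overflow.
	static const uint64_t A = 6364136223846793005ULL;
	static const uint64_t C = 1442695040888963407ULL;

	// Multiplicative inverse of an odd multiplier modulo 2^64, by Newton's
	// iteration; each step doubles the number of correct low-order bits.
	static uint64_t inv64( uint64_t a ) {
		uint64_t x = a;                              // correct to 3 bits for odd a
		for ( int i = 0; i < 5; i += 1 ) x *= 2 - a * x;
		return x;
	}

	static uint64_t fwd( uint64_t * s ) {            // next subqueue index
		return *s = A * *s + C;
	}
	static uint64_t bck( uint64_t * s ) {            // previous subqueue index
		uint64_t r = *s;
		*s = inv64( A ) * (*s - C);
		return r;
	}

	int main() {
		uint64_t F = 42, B = 42;                     // B is the copy taken on a successful push
		for ( int i = 0; i < 10; i += 1 ) fwd( &F ); // pushes walk F forward
		for ( int i = 0; i < 10; i += 1 ) bck( &F ); // pops replay the same cells backwards
		assert( F == B );                            // the access history is recovered exactly
	}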
    129120
    130121\section{Details}
  • doc/theses/thierry_delisle_PhD/thesis/text/io.tex

    r4468a70 r342af53  
    55Since \glsxtrshort{io} operations are generally handled by the
    66
    7 \subsection{\texttt{epoll}, \texttt{poll} and \texttt{select}}
     7\subsection{\lstinline|epoll|, \lstinline|poll| and \lstinline|select|}
    88
    99\subsection{Linux's AIO}
     
    3333
    3434\subsection{\texttt{io\_uring}}
    35 A very recent addition to Linux, \texttt{io\_uring}\cit{io\_uring} is a framework that aims to solve many of the problems listed with the above mentioned solutions.
     35A very recent addition to Linux, @io_uring@\cit{io\_uring} is a framework that aims to solve many of the problems of the above-mentioned solutions.
    3636
    3737\subsection{Extra Kernel Threads}\label{io:morethreads}
  • doc/theses/thierry_delisle_PhD/thesis/text/runtime.tex

    r4468a70 r342af53  
    11\chapter{\CFA Runtime}
    2 This chapter offers an overview of the capabilities of the \CFA runtime prior to this work.
     2This chapter presents an overview of the capabilities of the \CFA runtime prior to this thesis work.
    33
    4 Threading in \CFA offers is based on \Gls{uthrding}, where \glspl{thrd} are the representation of a unit of work. As such, \CFA programmers should expect these units to be fairly inexpensive, that is: programmers should be able to create a large number of \glspl{thrd} and switch between \glspl{thrd} liberally without many concerns for performance.
     4\Celeven introduced threading features, such as the @_Thread_local@ storage class, and the libraries @stdatomic.h@ and @threads.h@. Interestingly, almost a decade after the \Celeven standard, the most recent versions of gcc, clang, and msvc do not support the \Celeven include @threads.h@, indicating no interest in the C11 concurrency approach (possibly because of the recent effort to add concurrency to \CC). While the \Celeven standard does not state a threading model, the historical association with pthreads suggests implementations would adopt kernel-level threading (1:1)~\cite{ThreadModel}, as for \CC. This model uses \glspl{kthrd} to achieve parallelism and concurrency. In this model, every thread of computation maps to an object in the kernel. The kernel then has the responsibility of managing these threads, \eg creating, scheduling, blocking. This also entails that the kernel has a perfect view of every thread executing in the system\footnote{This is not completely true due to primitives like \lstinline|futex|es, which have a significant portion of their logic in user space.}.
    55
    66\section{M:N Threading}\label{prev:model}
    77
    8 C traditionnally uses a 1:1 threading model. This model uses \glspl{kthrd} to achive parallelism and concurrency. In this model, every thread of computation maps to an object in the kernel. The kernel then has the responsibility of managing these threads, \eg creating, scheduling, blocking. This also entails that the kernel has a perfect view of every thread executing in the system\footnote{This is not completly true due to primitives like \texttt{futex}es, which have a significant portion of their logic in user space.}.
     8Threading in \CFA is based on \Gls{uthrding}, where \glspl{thrd} are the representation of a unit of work. As such, \CFA programmers should expect these units to be fairly inexpensive, \ie programmers should be able to create a large number of \glspl{thrd} and switch among \glspl{thrd} liberally without many concerns for performance.
    99
    10 By contrast \CFA uses an M:N threading models, where concurrency is achieved using many user-level threads mapped onto fewer \glspl{kthrd}. The user-level threads have the same semantic meaning as a \glspl{kthrd} in the 1:1 model, they represent an independant thread of execution with it's on stack. The difference is that user-level threads do not have a corresponding object in the kernel, they are handled by the runtime in user space and scheduled onto \glspl{kthrd}, referred to as \glspl{proc} in this document. \Glspl{proc} run a \gls{thrd} until it context switches out, it then choses a different \gls{thrd} to run.
     10The \CFA M:N threading model is implemented using many user-level threads mapped onto fewer \glspl{kthrd}. The user-level threads have the same semantic meaning as \glspl{kthrd} in the 1:1 model: they represent an independent thread of execution with its own stack. The difference is that user-level threads do not have a corresponding object in the kernel; they are handled by the runtime in user space and scheduled onto \glspl{kthrd}, referred to as \glspl{proc} in this document. A \gls{proc} runs a \gls{thrd} until it context switches out, and then chooses a different \gls{thrd} to run.
    1111
    1212\section{Clusters}
     13\CFA provides the option to group user-level threading in the form of clusters. Both \glspl{thrd} and \glspl{proc} belong to a specific cluster. \Glspl{thrd} are only scheduled onto \glspl{proc} in the same cluster, and scheduling is done independently of other clusters. Figure~\ref{fig:system} shows an overview of the \CFA runtime, which allows programmers to tightly control parallelism. It also opens the door to handling effects like NUMA, by pinning clusters to a specific NUMA node\footnote{This is not currently implemented in \CFA, but the only hurdle left is creating a generic interface for cpu masks.}.
     14
    1315\begin{figure}
    1416        \begin{center}
    1517                \input{system.pstex_t}
    1618        \end{center}
    17         \caption{Overview of the \CFA runtime}
     19        \caption[Overview of the \CFA runtime]{Overview of the \CFA runtime \newline \Glspl{thrd} are scheduled inside a particular cluster and run only on the \glspl{proc} belonging to that cluster. The discrete-event manager, which handles preemption and timeout, is a \gls{kthrd} that lives outside any cluster and does not run \glspl{thrd}.}
    1820        \label{fig:system}
    19         \Glspl{thrd} are scheduled inside a particular cluster, where it only runs on the \glspl{proc} which belong to the cluster. The discrete-event manager, which handles preemption and timeout, is a \gls{kthrd} which lives outside any cluster and does not run \glspl{thrd}.
    2021\end{figure}
    21 \CFA allows the option to group user-level threading, in the form of clusters. Both \glspl{thrd} and \glspl{proc} belong to a specific cluster. \Glspl{thrd} will only be scheduled onto \glspl{proc} in the same cluster and scheduling is done independantly of other clusters. Figure~\ref{fig:system} shows an overview if this system. This allows programmers to control more tightly parallelism. It also opens the door to handling effects like NUMA, by pining clusters to specific NUMA node\footnote{This is not currently implemented in \CFA, but the only hurdle left is creating a generic interface for cpu masks.}.
    2222
    2323\section{Scheduling}
    24 The \CFA runtime was previously using a strictly \glsxtrshort{fifo} ready queue with a single lock. This setup offers perfect fairness in terms of opportunities to run/ However, it offers poor scalability, since the performance of the ready queue can never be improved by adding more \glspl{hthrd}, but the contention can cause significant performance degradation.
     24The \CFA runtime previously used a \glsxtrshort{fifo} ready-queue with a single lock. This setup offers perfect fairness in terms of opportunities to run. However, it offers poor scalability, since the performance of the ready queue can never be improved by adding more \glspl{hthrd} and contention can cause significant performance degradation.
    2525
    2626\section{\glsxtrshort{io}}\label{prev:io}
    27 Prior to this work, the \CFA runtime did not add any particular support for \glsxtrshort{io} operations. \CFA being built on C, this means that, while all the operations available in C are available in \CFA, \glsxtrshort{io} operations are designed for the POSIX threading model\cit{pthreads}. Using these operations in a M:N threading model, when they are built for 1:1 threading, means that operations block \glspl{proc} instead of \glspl{thrd}. While this can work in certain cases, it limits the number of concurrent operations to the number of \glspl{proc} rather than \glspl{thrd}. This also means that deadlocks can occur because all \glspl{proc} are blocked even if at least one \gls{thrd} is ready to run. A simple example of this type of deadlock would be as follows:
     27Prior to this work, the \CFA runtime did not add any particular support for \glsxtrshort{io} operations. %\CFA being built on C, this means that,
     28While all I/O operations available in C are available in \CFA, \glsxtrshort{io} operations are designed for the POSIX threading model~\cite{pthreads}. Using these 1:1 threading operations in an M:N threading model means I/O operations block \glspl{proc} instead of \glspl{thrd}. While this can work in certain cases, it limits the number of concurrent operations to the number of \glspl{proc} rather than \glspl{thrd}. It also means deadlock can occur because all \glspl{proc} are blocked even if at least one \gls{thrd} is ready to run. A simple example of this type of deadlock would be as follows:
     29\begin{quote}
     30Given a simple network program with 2 \glspl{thrd} and a single \gls{proc}, one \gls{thrd} sends network requests to a server and the other \gls{thrd} waits for a response from the server. If the second \gls{thrd} races ahead, it may wait for responses to requests that have not been sent yet. In theory, this should not be a problem, even if the second \gls{thrd} waits, because the first \gls{thrd} is still ready to run and should be able to get CPU time to send the request. With M:N threading, while the first \gls{thrd} is ready, the lone \gls{proc} \emph{cannot} run the first \gls{thrd} if it is blocked in the \glsxtrshort{io} operation of the second \gls{thrd}. If this happens, the system is in a synchronization deadlock\footnote{In this example, the deadlock could be resolved if the server sends unprompted messages to the client. However, this solution is not general and may not be appropriate even in this simple case.}.
     31\end{quote}
     32Therefore, one of the objectives of this work is to introduce \emph{User-Level \glsxtrshort{io}}, which, like \glslink{uthrding}{User-Level \emph{Threading}}, blocks \glspl{thrd} rather than \glspl{proc} when doing \glsxtrshort{io} operations. This entails multiplexing the \glsxtrshort{io} operations of many \glspl{thrd} onto fewer \glspl{proc}. This multiplexing requires that a single \gls{proc} be able to execute multiple I/O operations in parallel. This requirement cannot be met with operations that block \glspl{proc}, \ie \glspl{kthrd}, since the first operation would prevent starting new operations for its blocking duration. Executing I/O operations in parallel requires \emph{asynchronous} \glsxtrshort{io}, sometimes referred to as \emph{non-blocking}, since the \gls{kthrd} does not block.
    2833
    29 Given a simple network program with 2 \glspl{thrd} and a single \gls{proc}, one \gls{thrd} sends network requests to a server and the other \gls{thrd} waits for response from the server. If the second \gls{thrd} races ahead, it may wait for responses to requests that have not been sent yet. In theory, this should not be a problem, even if the second \gls{thrd} waits, the first \gls{thrd} is still ready to run and should just be able to get CPU time and send the request. In practice with M:N threading, while the first \gls{thrd} is ready, the lone \gls{proc} in this example will \emph{not} try to run the first \gls{thrd} if it is blocked in the \glsxtrshort{io} operation of the second \gls{thrd}. If this happen, the system is effectively deadlocked\footnote{In this example, the deadlocked could be resolved if the server sends unprompted messages to the client. However, this solution is not general and may not be appropriate even in this simple case.}.
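The scenario can be made concrete with a small sketch (hypothetical code, using a pipe in place of the server connection; with the default single \gls{proc}, the blocking @read@ starves the sender):

	#include <thread.hfa>
	#include <unistd.h>

	int fds[2];                                // a pipe stands in for the server connection

	thread Waiter {};
	void main( Waiter & this ) {
		char buf[8];
		read( fds[0], buf, 8 );                // blocks the lone kernel thread
	}

	thread Sender {};
	void main( Sender & this ) {
		write( fds[1], "request", 8 );         // ready to run, but no proc is free to run it
	}

	int main() {
		pipe( fds );
		{
			Waiter w;                          // if the waiter blocks first,
			Sender s;                          // the sender is starved
		}                                      // implicit joins here never complete: deadlock
	}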
     34\section{Interoperating with C}
     35While \glsxtrshort{io} operations are the classical example of operations that block \glspl{kthrd}, the non-blocking challenge extends to all blocking system-calls. The POSIX standard states~\cite[\S~2.9.1]{POSIX17}:
     36\begin{quote}
     37All functions defined by this volume of POSIX.1-2017 shall be thread-safe, except that the following functions need not be thread-safe. ... (list of 70+ potentially excluded functions)
     38\end{quote}
     39Only UNIX @man@ pages identify whether or not a library function is thread safe, and hence whether it may block on a pthread lock or system call; as a result, interoperability with UNIX library functions is a challenge for an M:N threading model.
    3040
    31 One of the objective of this work, is to introduce \emph{User-Level \glsxtrshort{io}} which, as a parallel to \glslink{uthrding}{User-Level \emph{Threading}}, blocks \glspl{thrd} rather than \glspl{proc} when doing \glsxtrshort{io} operations. This entails multiplexing the \glsxtrshort{io} operations of many \glspl{thrd} onto fewer \glspl{proc}. This multiplexing requires that a single \gls{proc} be able to execute multiple operations in parallel. This cannot be done with operations that block \glspl{proc}, \ie \glspl{kthrd}, since the first operation would prevent starting new operations for its duration. Executing operations in parallel requires \emph{asynchronous} \glsxtrshort{io}, sometimes referred to as \emph{non-blocking}, since the \gls{kthrd} is not blocked.
     41Languages like Go and Java, which have strict interoperability with C\cit{JNI, GoLang with C}, can control operations in C by ``sandboxing'' them, \eg a blocking function may be delegated to a \gls{kthrd}. Sandboxing may help towards guaranteeing that the kind of deadlock mentioned above does not occur.
    3242
    33 \section{Interoperating with \texttt{C}}
    34 While \glsxtrshort{io} operations are the classical example of operations that block \glspl{kthrd}, the challenges mentioned in the previous section do not require \glsxtrshort{io} to be involved. These challenges are a product of blocking system calls rather than \glsxtrshort{io}. C offers no tools to identify whether or not a librairy function will lead to a blocking system call. This fact means interoperatability with C becomes a challenge in a M:N threading model.
    35 
    36 Languages like Go and Java, which have strict interoperatability with C\cit{JNI, GoLang with C}, can control operations in C by ``sandboxing'' them. They can, for example, delegate C operations to \glspl{kthrd} that are not \glspl{proc}. Sandboxing may help towards guaranteeing that the deadlocks mentioned in the previous section do not occur.
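A sketch of this delegation idea (hypothetical plain-C code using pthreads, not how Go or Java actually implement it): the blocking call runs on a dedicated \gls{kthrd} while the caller waits for the result.

	#include <pthread.h>
	#include <unistd.h>

	// Hypothetical sandbox: run a blocking read() on a dedicated kernel thread,
	// so an M:N runtime could keep the calling proc free to schedule other thrds.
	struct blocking_read { int fd; void * buf; size_t len; ssize_t ret; };

	static void * sandbox_read( void * arg ) {
		struct blocking_read * op = arg;
		op->ret = read( op->fd, op->buf, op->len );  // only this worker thread blocks
		return 0;
	}

	static ssize_t delegated_read( int fd, void * buf, size_t len ) {
		struct blocking_read op = { fd, buf, len, 0 };
		pthread_t worker;
		pthread_create( &worker, 0, sandbox_read, &op );
		// A real runtime would park the user thread here instead of joining,
		// letting the proc run other user threads until the worker finishes.
		pthread_join( worker, 0 );
		return op.ret;
	}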
    37 
    38 As mentioned in Section~\cit{\CFA intro}, \CFA is binary compatible with C and, as such, trivially supports calls to and from C librairies. Furthermore, interoperatability can happen within a single library, through inline code or simply C and \CFA translation units archived together. The fine-grained interoperatability between C and \CFA has two consequences:
     43As mentioned in Section~\ref{intro}, \CFA is binary compatible with C and, as such, must support all C library functions. Furthermore, interoperability can happen at the function-call level, through inline code, or through C and \CFA translation units linked together. This fine-grained interoperability between C and \CFA has two consequences:
    3944\begin{enumerate}
    40         \item Precisely identifying C calls that could block is difficult.
    41         \item Introducing code where interoperatability occurs could have a significant impact on general performance.
     45        \item Precisely identifying blocking C calls is difficult.
     46        \item Introducing new code can have a significant impact on general performance.
    4247\end{enumerate}
    43 
    44 Because of these consequences, this work does not attempt to ``sandbox'' calls to C. It is possible that conflicting calls to C could lead to deadlocks on \CFA's M:N threading model where they would not in the traditionnal 1:1 threading model. However, I judge that solving this problem in general, in a way that is composable and flexible, is too complex in itself and would add too much work to this thesis. Therefore it is outside the scope of this thesis.
     48Because of these consequences, this work does not attempt to ``sandbox'' calls to C. Therefore, it is possible for an unidentified library call to block a \gls{kthrd}, leading to deadlocks in \CFA's M:N threading model that would not occur in a traditional 1:1 threading model. Currently, all M:N thread systems interacting with UNIX without sandboxing suffer from this problem but manage to work very well in the majority of applications. Therefore, a complete solution to this problem is outside the scope of this thesis.
  • doc/theses/thierry_delisle_PhD/thesis/thesis.tex

    r4468a70 r342af53  
    120120% although it's supposed to be in both the TeX Live and MikTeX distributions. There are also documentation and
    121121% installation instructions there.
     122\renewcommand*{\glstextformat}[1]{\textsf{#1}}
    122123
    123124\usepackage{csquotes}
     
    125126
    126127% Setting up the page margins...
     128\setlength{\textheight}{9in}\setlength{\topmargin}{-0.45in}\setlength{\headsep}{0.25in}
    127129% uWaterloo thesis requirements specify a minimum of 1 inch (72pt) margin at the
    128130% top, bottom, and outside page edges and a 1.125 in. (81pt) gutter
     
    191193% cfa macros used in the document
    192194\input{common}
     195\CFAStyle                                               % CFA code-style for all languages
     196\lstset{basicstyle=\linespread{0.9}\tt}
    193197
    194198% glossary of terms to use
    195199\input{glossary}
     200\makeindex
    196201
    197202%======================================================================
     
    230235\input{text/io.tex}
    231236\part{Evaluation}
     237\label{Evaluation}
    232238\chapter{Theoretical and Existance Proofs}
    233239\chapter{Micro-Benchmarks}
     
    256262\addcontentsline{toc}{chapter}{\textbf{References}}
    257263
    258 \bibliography{local}
     264\bibliography{local,pl}
    259265% Tip 5: You can create multiple .bib files to organize your references.
    260266% Just list them all in the \bibliography command, separated by commas (no spaces).
     
    301307\printglossary
    302308\cleardoublepage
     309
     310% Index
     311% -----------------------------
     312%\input{thesis.ind}                             % index
     313
    303314\phantomsection
    304315
  • example/io/simple/server_epoll.c

    r4468a70 r342af53  
    8888      }
    8989
    90       ev.events = EPOLLIN | EPOLLONESHOT;
     90      ev.events = EPOLLOUT | EPOLLIN | EPOLLONESHOT;
    9191      ev.data.u64 = (uint64_t)&ring;
    9292      if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ring.ring_fd, &ev) == -1) {
     
    9999
    100100        while(1) {
    101             BLOCK:
     101            BLOCK:;
    102102            int nfds = epoll_wait(epollfd, events, MAX_EVENTS, -1);
    103103            if (nfds == -1) {
  • libcfa/src/Makefile.am

    r4468a70 r342af53  
    5858        concurrency/iofwd.hfa \
    5959        containers/list.hfa \
    60         containers/stackLockFree.hfa
     60        containers/stackLockFree.hfa \
     61        vec/vec.hfa \
     62        vec/vec2.hfa \
     63        vec/vec3.hfa \
     64        vec/vec4.hfa
    6165
    6266inst_headers_src = \
     
    9498        concurrency/clib/cfathread.h \
    9599        concurrency/invoke.h \
     100        concurrency/future.hfa \
    96101        concurrency/kernel/fwd.hfa
    97102
  • libcfa/src/bits/locks.hfa

    r4468a70 r342af53  
    283283                void ^?{}(future_t &) {}
    284284
     285                void reset(future_t & this) {
      286                        // the future must be in state 0p or 1p
     287                        __atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);
     288                }
     289
    285290                // check if the future is available
    286291                bool available( future_t & this ) {
     
    340345
    341346                // Mark the future as abandoned, meaning it will be deleted by the server
    342                 void abandon( future_t & this ) {
     347                bool abandon( future_t & this ) {
     348                        /* paranoid */ verify( this.ptr != 3p );
     349
      350                        // Mark the future as abandoned
    343351                        struct oneshot * got = __atomic_exchange_n( &this.ptr, 3p, __ATOMIC_SEQ_CST);
     352
     353                        // If the future isn't already fulfilled, let the server delete it
     354                        if( got == 0p ) return false;
    344355
    345356                        // got == 2p: the future is ready but the context hasn't fully been consumed
     
    347358                        if( got == 2p ) {
    348359                                while( this.ptr != 1p ) Pause();
    349                         }
    350                         return;
     360                                got = 1p;
     361                        }
     362
     363                        // The future is completed delete it now
     364                        /* paranoid */ verify( this.ptr != 1p );
     365                        free( &this );
     366                        return true;
    351367                }
    352368
  • libcfa/src/concurrency/io.cfa

    r4468a70 r342af53  
    3131
    3232        extern "C" {
    33                 #include <sys/epoll.h>
    3433                #include <sys/syscall.h>
    3534
     
    4140        #include "kernel/fwd.hfa"
    4241        #include "io/types.hfa"
     42
     43        static const char * opcodes[] = {
     44                "OP_NOP",
     45                "OP_READV",
     46                "OP_WRITEV",
     47                "OP_FSYNC",
     48                "OP_READ_FIXED",
     49                "OP_WRITE_FIXED",
     50                "OP_POLL_ADD",
     51                "OP_POLL_REMOVE",
     52                "OP_SYNC_FILE_RANGE",
     53                "OP_SENDMSG",
     54                "OP_RECVMSG",
     55                "OP_TIMEOUT",
     56                "OP_TIMEOUT_REMOVE",
     57                "OP_ACCEPT",
     58                "OP_ASYNC_CANCEL",
     59                "OP_LINK_TIMEOUT",
     60                "OP_CONNECT",
     61                "OP_FALLOCATE",
     62                "OP_OPENAT",
     63                "OP_CLOSE",
     64                "OP_FILES_UPDATE",
     65                "OP_STATX",
     66                "OP_READ",
     67                "OP_WRITE",
     68                "OP_FADVISE",
     69                "OP_MADVISE",
     70                "OP_SEND",
     71                "OP_RECV",
     72                "OP_OPENAT2",
     73                "OP_EPOLL_CTL",
     74                "OP_SPLICE",
     75                "OP_PROVIDE_BUFFERS",
     76                "OP_REMOVE_BUFFERS",
     77                "OP_TEE",
     78                "INVALID_OP"
     79        };
    4380
    4481        // returns true if acquired as leader or second leader
     
    134171                int ret = 0;
    135172                if( need_sys_to_submit || need_sys_to_complete ) {
     173                        __cfadbg_print_safe(io_core, "Kernel I/O : IO_URING enter %d %u %u\n", ring.fd, to_submit, flags);
    136174                        ret = syscall( __NR_io_uring_enter, ring.fd, to_submit, 0, flags, (sigset_t *)0p, _NSIG / 8);
    137175                        if( ret < 0 ) {
     
    157195        static unsigned __collect_submitions( struct __io_data & ring );
    158196        static __u32 __release_consumed_submission( struct __io_data & ring );
    159 
    160         static inline void process(struct io_uring_cqe & cqe ) {
     197        static inline void __clean( volatile struct io_uring_sqe * sqe );
     198
     199        // Process a single completion message from the io_uring
     200        // This is NOT thread-safe
     201        static inline void process( volatile struct io_uring_cqe & cqe ) {
    161202                struct io_future_t * future = (struct io_future_t *)(uintptr_t)cqe.user_data;
    162203                __cfadbg_print_safe( io, "Kernel I/O : Syscall completed : cqe %p, result %d for %p\n", &cqe, cqe.res, future );
     
    165206        }
    166207
    167         // Process a single completion message from the io_uring
    168         // This is NOT thread-safe
    169208        static [int, bool] __drain_io( & struct __io_data ring ) {
    170209                /* paranoid */ verify( ! __preemption_enabled() );
     
    192231                }
    193232
     233                __atomic_thread_fence( __ATOMIC_SEQ_CST );
     234
    194235                // Release the consumed SQEs
    195236                __release_consumed_submission( ring );
     
    209250                for(i; count) {
    210251                        unsigned idx = (head + i) & mask;
    211                         struct io_uring_cqe & cqe = ring.completion_q.cqes[idx];
     252                        volatile struct io_uring_cqe & cqe = ring.completion_q.cqes[idx];
    212253
    213254                        /* paranoid */ verify(&cqe);
     
    218259                // Mark to the kernel that the cqe has been seen
    219260                // Ensure that the kernel only sees the new value of the head index after the CQEs have been read.
    220                 __atomic_thread_fence( __ATOMIC_SEQ_CST );
    221                 __atomic_fetch_add( ring.completion_q.head, count, __ATOMIC_RELAXED );
     261                __atomic_fetch_add( ring.completion_q.head, count, __ATOMIC_SEQ_CST );
    222262
    223263                return [count, count > 0 || to_submit > 0];
     
    225265
    226266        void main( $io_ctx_thread & this ) {
    227                 epoll_event ev;
    228                 __ioctx_register( this, ev );
    229 
    230                 __cfadbg_print_safe(io_core, "Kernel I/O : IO poller %p for ring %p ready\n", &this, &this.ring);
    231 
    232                 int reset = 0;
     267                __ioctx_register( this );
     268
     269                __cfadbg_print_safe(io_core, "Kernel I/O : IO poller %d (%p) ready\n", this.ring->fd, &this);
     270
     271                const int reset_cnt = 5;
     272                int reset = reset_cnt;
    233273                // Then loop until we need to start
     274                LOOP:
    234275                while(!__atomic_load_n(&this.done, __ATOMIC_SEQ_CST)) {
    235276                        // Drain the io
     
    239280                                [count, again] = __drain_io( *this.ring );
    240281
    241                                 if(!again) reset++;
     282                                if(!again) reset--;
    242283
    243284                                // Update statistics
     
    249290
    250291                        // If we got something, just yield and check again
    251                         if(reset < 5) {
     292                        if(reset > 1) {
    252293                                yield();
    253                         }
    254                         // We didn't get anything baton pass to the slow poller
    255                         else {
     294                                continue LOOP;
     295                        }
     296
      297                        // We already failed to find completed entries a few times.
     298                        if(reset == 1) {
     299                                // Rearm the context so it can block
     300                                // but don't block right away
     301                                // we need to retry one last time in case
     302                                // something completed *just now*
     303                                __ioctx_prepare_block( this );
     304                                continue LOOP;
     305                        }
     306
    256307                                __STATS__( false,
    257308                                        io.complete_q.blocks += 1;
    258309                                )
    259                                 __cfadbg_print_safe(io_core, "Kernel I/O : Parking io poller %p\n", &this.self);
    260                                 reset = 0;
     310                                __cfadbg_print_safe(io_core, "Kernel I/O : Parking io poller %d (%p)\n", this.ring->fd, &this);
    261311
    262312                                // block this thread
    263                                 __ioctx_prepare_block( this, ev );
    264313                                wait( this.sem );
    265                         }
    266                 }
    267 
    268                 __cfadbg_print_safe(io_core, "Kernel I/O : Fast poller for ring %p stopping\n", &this.ring);
     314
     315                        // restore counter
     316                        reset = reset_cnt;
     317                }
     318
     319                __cfadbg_print_safe(io_core, "Kernel I/O : Fast poller %d (%p) stopping\n", this.ring->fd, &this);
    269320        }
    270321
     
    289340//
    290341
    291         [* struct io_uring_sqe, __u32] __submit_alloc( struct __io_data & ring, __u64 data ) {
      342        // Allocate a submit queue entry.
      343        // The kernel cannot see these entries until they are submitted, but other threads must be
      344        // able to see which entries can be used and which are already in use by another thread.
      345        // For convenience, return both the index and the pointer to the sqe
     346        // sqe == &sqes[idx]
     347        [* volatile struct io_uring_sqe, __u32] __submit_alloc( struct __io_data & ring, __u64 data ) {
    292348                /* paranoid */ verify( data != 0 );
    293349
     
    304360                        // Look through the list starting at some offset
    305361                        for(i; cnt) {
    306                                 __u64 expected = 0;
    307                                 __u32 idx = (i + off) & mask;
    308                                 struct io_uring_sqe * sqe = &ring.submit_q.sqes[idx];
     362                                __u64 expected = 3;
      363                                __u32 idx = (i + off) & mask; // Get an index starting from the random offset
     364                                volatile struct io_uring_sqe * sqe = &ring.submit_q.sqes[idx];
    309365                                volatile __u64 * udata = &sqe->user_data;
    310366
      367                                // Allocate the entry by CASing the user_data field from 3 (the unused marker) to the future address
    311368                                if( *udata == expected &&
    312369                                        __atomic_compare_exchange_n( udata, &expected, data, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED ) )
     
    319376                                        )
    320377
     378                                        // debug log
     379                                        __cfadbg_print_safe( io, "Kernel I/O : allocated [%p, %u] for %p (%p)\n", sqe, idx, active_thread(), (void*)data );
    321380
    322381                                        // Success return the data
     
    325384                                verify(expected != data);
    326385
     386                                // This one was used
    327387                                len ++;
    328388                        }
    329389
    330390                        block++;
     391
     392                        abort( "Kernel I/O : all submit queue entries used, yielding\n" );
     393
    331394                        yield();
    332395                }
     
    377440        void __submit( struct io_context * ctx, __u32 idx ) __attribute__((nonnull (1))) {
    378441                __io_data & ring = *ctx->thrd.ring;
     442
     443                {
     444                        __attribute__((unused)) volatile struct io_uring_sqe * sqe = &ring.submit_q.sqes[idx];
     445                        __cfadbg_print_safe( io,
     446                                "Kernel I/O : submitting %u (%p) for %p\n"
     447                                "    data: %p\n"
     448                                "    opcode: %s\n"
     449                                "    fd: %d\n"
     450                                "    flags: %d\n"
     451                                "    prio: %d\n"
     452                                "    off: %p\n"
     453                                "    addr: %p\n"
     454                                "    len: %d\n"
     455                                "    other flags: %d\n"
     456                                "    splice fd: %d\n"
     457                                "    pad[0]: %llu\n"
     458                                "    pad[1]: %llu\n"
     459                                "    pad[2]: %llu\n",
     460                                idx, sqe,
     461                                active_thread(),
     462                                (void*)sqe->user_data,
     463                                opcodes[sqe->opcode],
     464                                sqe->fd,
     465                                sqe->flags,
     466                                sqe->ioprio,
     467                                sqe->off,
     468                                sqe->addr,
     469                                sqe->len,
     470                                sqe->accept_flags,
     471                                sqe->splice_fd_in,
     472                                sqe->__pad2[0],
     473                                sqe->__pad2[1],
     474                                sqe->__pad2[2]
     475                        );
     476                }
     477
     478
    379479                // Now get the data we definitely need
    380480                volatile __u32 * const tail = ring.submit_q.tail;
     
    443543                                unlock(ring.submit_q.submit_lock);
    444544                        #endif
    445                         if( ret < 0 ) return;
     545                        if( ret < 0 ) {
     546                                return;
     547                        }
    446548
    447549                        // Release the consumed SQEs
     
    454556                                io.submit_q.submit_avg.cnt += 1;
    455557                        )
    456                 }
    457                 else {
     558
     559                        __cfadbg_print_safe( io, "Kernel I/O : submitted %u (among %u) for %p\n", idx, ret, active_thread() );
     560                }
     561                else
     562                {
    458563                        // get mutual exclusion
    459564                        #if defined(LEADER_LOCK)
     
    463568                        #endif
    464569
    465                         /* paranoid */ verifyf( ring.submit_q.sqes[ idx ].user_data != 0,
     570                        /* paranoid */ verifyf( ring.submit_q.sqes[ idx ].user_data != 3ul64,
    466571                        /* paranoid */  "index %u already reclaimed\n"
    467572                        /* paranoid */  "head %u, prev %u, tail %u\n"
     
    490595                        }
    491596
     597                        /* paranoid */ verify(ret == 1);
     598
    492599                        // update statistics
    493600                        __STATS__( false,
     
    496603                        )
    497604
     605                        {
     606                                __attribute__((unused)) volatile __u32 * const head = ring.submit_q.head;
     607                                __attribute__((unused)) __u32 last_idx = ring.submit_q.array[ ((*head) - 1) & mask ];
     608                                __attribute__((unused)) volatile struct io_uring_sqe * sqe = &ring.submit_q.sqes[last_idx];
     609
     610                                __cfadbg_print_safe( io,
     611                                        "Kernel I/O : last submitted is %u (%p)\n"
     612                                        "    data: %p\n"
     613                                        "    opcode: %s\n"
     614                                        "    fd: %d\n"
     615                                        "    flags: %d\n"
     616                                        "    prio: %d\n"
     617                                        "    off: %p\n"
     618                                        "    addr: %p\n"
     619                                        "    len: %d\n"
     620                                        "    other flags: %d\n"
     621                                        "    splice fd: %d\n"
     622                                        "    pad[0]: %llu\n"
     623                                        "    pad[1]: %llu\n"
     624                                        "    pad[2]: %llu\n",
     625                                        last_idx, sqe,
     626                                        (void*)sqe->user_data,
     627                                        opcodes[sqe->opcode],
     628                                        sqe->fd,
     629                                        sqe->flags,
     630                                        sqe->ioprio,
     631                                        sqe->off,
     632                                        sqe->addr,
     633                                        sqe->len,
     634                                        sqe->accept_flags,
     635                                        sqe->splice_fd_in,
     636                                        sqe->__pad2[0],
     637                                        sqe->__pad2[1],
     638                                        sqe->__pad2[2]
     639                                );
     640                        }
     641
     642                        __atomic_thread_fence( __ATOMIC_SEQ_CST );
    498643                        // Release the consumed SQEs
    499644                        __release_consumed_submission( ring );
     645                        // ring.submit_q.sqes[idx].user_data = 3ul64;
    500646
    501647                        #if defined(LEADER_LOCK)
     
    505651                        #endif
    506652
    507                         __cfadbg_print_safe( io, "Kernel I/O : Performed io_submit for %p, returned %d\n", active_thread(), ret );
     653                        __cfadbg_print_safe( io, "Kernel I/O : submitted %u for %p\n", idx, active_thread() );
    508654                }
    509655        }
    510656
    511657        // #define PARTIAL_SUBMIT 32
     658
      659        // go through the list of submissions in the ready array and move them into
     660        // the ring's submit queue
    512661        static unsigned __collect_submitions( struct __io_data & ring ) {
    513662                /* paranoid */ verify( ring.submit_q.ready != 0p );
     
    550699        }
    551700
     701        // Go through the ring's submit queue and release everything that has already been consumed
     702        // by io_uring
    552703        static __u32 __release_consumed_submission( struct __io_data & ring ) {
    553704                const __u32 smask = *ring.submit_q.mask;
    554705
     706                // We need to get the lock to copy the old head and new head
    555707                if( !try_lock(ring.submit_q.release_lock __cfaabi_dbg_ctx2) ) return 0;
    556                 __u32 chead = *ring.submit_q.head;
    557                 __u32 phead = ring.submit_q.prev_head;
    558                 ring.submit_q.prev_head = chead;
     708                __attribute__((unused))
     709                __u32 ctail = *ring.submit_q.tail;        // get the current tail of the queue
     710                __u32 chead = *ring.submit_q.head;              // get the current head of the queue
     711                __u32 phead = ring.submit_q.prev_head;  // get the head the last time we were here
     712                ring.submit_q.prev_head = chead;                // note up to were we processed
    559713                unlock(ring.submit_q.release_lock);
    560714
      715                // the 3 fields are organized as in this diagram,
      716                // except it is a ring
     717                // ---+--------+--------+----
     718                // ---+--------+--------+----
     719                //    ^        ^        ^
     720                // phead    chead    ctail
     721
     722                // make sure ctail doesn't wrap around and reach phead
     723                /* paranoid */ verify(
     724                           (ctail >= chead && chead >= phead)
     725                        || (chead >= phead && phead >= ctail)
     726                        || (phead >= ctail && ctail >= chead)
     727                );
     728
     729                // find the range we need to clear
    561730                __u32 count = chead - phead;
     731
      732                // We acquired a previous-head/current-head range
     733                // go through the range and release the sqes
    562734                for( i; count ) {
    563735                        __u32 idx = ring.submit_q.array[ (phead + i) & smask ];
    564                         ring.submit_q.sqes[ idx ].user_data = 0;
     736
     737                        /* paranoid */ verify( 0 != ring.submit_q.sqes[ idx ].user_data );
     738                        __clean( &ring.submit_q.sqes[ idx ] );
    565739                }
    566740                return count;
    567741        }
     742
     743        void __sqe_clean( volatile struct io_uring_sqe * sqe ) {
     744                __clean( sqe );
     745        }
     746
     747        static inline void __clean( volatile struct io_uring_sqe * sqe ) {
     748                // If we are in debug mode, thrash the fields to make sure we catch reclamation errors
      749                // If we are in debug mode, trash the fields to make sure we catch reclamation errors
     750                        memset(sqe, 0xde, sizeof(*sqe));
     751                        sqe->opcode = (sizeof(opcodes) / sizeof(const char *)) - 1;
     752                );
     753
     754                // Mark the entry as unused
     755                __atomic_store_n(&sqe->user_data, 3ul64, __ATOMIC_SEQ_CST);
     756        }
    568757#endif
  • libcfa/src/concurrency/io/call.cfa.in

    r4468a70 r342af53  
    7474        ;
    7575
    76         extern [* struct io_uring_sqe, __u32] __submit_alloc( struct __io_data & ring, __u64 data );
     76        extern [* volatile struct io_uring_sqe, __u32] __submit_alloc( struct __io_data & ring, __u64 data );
    7777        extern void __submit( struct io_context * ctx, __u32 idx ) __attribute__((nonnull (1)));
    7878
     
    222222                __u32 idx;
    223223                struct io_uring_sqe * sqe;
    224                 [sqe, idx] = __submit_alloc( ring, (__u64)(uintptr_t)&future );
    225 
    226                 sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
     224                [(volatile struct io_uring_sqe *) sqe, idx] = __submit_alloc( ring, (__u64)(uintptr_t)&future );
     225
    227226                sqe->opcode = IORING_OP_{op};
    228                 sqe->flags = sflags;{body}
     227                sqe->flags = sflags;
     228                sqe->ioprio = 0;
     229                sqe->fd = 0;
     230                sqe->off = 0;
     231                sqe->addr = 0;
     232                sqe->len = 0;
     233                sqe->fsync_flags = 0;
     234                sqe->__pad2[0] = 0;
     235                sqe->__pad2[1] = 0;
     236                sqe->__pad2[2] = 0;{body}
     237
     238                asm volatile("": : :"memory");
    229239
    230240                verify( sqe->user_data == (__u64)(uintptr_t)&future );
     
    312322        }),
    313323        # CFA_HAVE_IORING_OP_ACCEPT
    314         Call('ACCEPT4', 'int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags)', {
     324        Call('ACCEPT', 'int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags)', {
    315325                'fd': 'sockfd',
    316                 'addr': 'addr',
    317                 'addr2': 'addrlen',
     326                'addr': '(__u64)addr',
     327                'addr2': '(__u64)addrlen',
    318328                'accept_flags': 'flags'
    319329        }),
     
    464474
    465475print("""
     476//-----------------------------------------------------------------------------
     477bool cancel(io_cancellation & this) {
     478        #if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_ASYNC_CANCEL)
     479                return false;
     480        #else
     481                io_future_t future;
     482
     483                io_context * context = __get_io_context();
     484
     485                __u8 sflags = 0;
     486                struct __io_data & ring = *context->thrd.ring;
     487
     488                __u32 idx;
     489                volatile struct io_uring_sqe * sqe;
     490                [sqe, idx] = __submit_alloc( ring, (__u64)(uintptr_t)&future );
     491
     492                sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
     493                sqe->opcode = IORING_OP_ASYNC_CANCEL;
     494                sqe->flags = sflags;
     495                sqe->addr = this.target;
     496
     497                verify( sqe->user_data == (__u64)(uintptr_t)&future );
     498                __submit( context, idx );
     499
     500                wait(future);
     501
     502                if( future.result == 0 ) return true; // Entry found
     503                if( future.result == -EALREADY) return true; // Entry found but in progress
     504                if( future.result == -ENOENT ) return false; // Entry not found
     505                return false;
     506        #endif
     507}
     508
    466509//-----------------------------------------------------------------------------
    467510// Check if a function has asynchronous
  • libcfa/src/concurrency/io/setup.cfa

    r4468a70 r342af53  
    5252                #include <pthread.h>
    5353                #include <sys/epoll.h>
     54                #include <sys/eventfd.h>
    5455                #include <sys/mman.h>
    5556                #include <sys/syscall.h>
     
    169170                // Main loop
    170171                while( iopoll.run ) {
     172                        __cfadbg_print_safe(io_core, "Kernel I/O - epoll : waiting on io_uring contexts\n");
     173
    171174                        // Wait for events
    172175                        int nfds = epoll_pwait( iopoll.epollfd, events, 10, -1, &mask );
     176
     177                        __cfadbg_print_safe(io_core, "Kernel I/O - epoll : %d io contexts events, waking up\n", nfds);
    173178
    174179                        // Check if an error occured
     
    181186                                $io_ctx_thread * io_ctx = ($io_ctx_thread *)(uintptr_t)events[i].data.u64;
    182187                                /* paranoid */ verify( io_ctx );
    183                                 __cfadbg_print_safe(io_core, "Kernel I/O : Unparking io poller %p\n", io_ctx);
     188                                __cfadbg_print_safe(io_core, "Kernel I/O - epoll : Unparking io poller %d (%p)\n", io_ctx->ring->fd, io_ctx);
    184189                                #if !defined( __CFA_NO_STATISTICS__ )
    185190                                        __cfaabi_tls.this_stats = io_ctx->self.curr_cluster->stats;
    186191                                #endif
     192
     193                                eventfd_t v;
     194                                eventfd_read(io_ctx->ring->efd, &v);
     195
    187196                                post( io_ctx->sem );
    188197                        }
     
    233242                $thread & thrd = this.thrd.self;
    234243                if( cluster_context ) {
     244                        // We are about to do weird things with the threads
     245                        // we don't need interrupts to complicate everything
     246                        disable_interrupts();
     247
     248                        // Get cluster info
    235249                        cluster & cltr = *thrd.curr_cluster;
    236250                        /* paranoid */ verify( cltr.idles.total == 0 || &cltr == mainCluster );
     
    239253                        // We need to adjust the clean-up based on where the thread is
    240254                        if( thrd.state == Ready || thrd.preempted != __NO_PREEMPTION ) {
     255                                // This is the tricky case
     256                                // The thread was preempted or ready to run and now it is on the ready queue
     257                                // but the cluster is shutting down, so there aren't any processors to run the ready queue
     258                                // the solution is to steal the thread from the ready-queue and pretend it was blocked all along
    241259
    242260                                ready_schedule_lock();
    243 
    244                                         // This is the tricky case
    245                                         // The thread was preempted and now it is on the ready queue
      261                                        // The thread should be on the list
     262                                        /* paranoid */ verify( thrd.link.next != 0p );
     263
     264                                        // Remove the thread from the ready queue of this cluster
    246265                                        // The thread should be the last on the list
    247                                         /* paranoid */ verify( thrd.link.next != 0p );
    248 
    249                                         // Remove the thread from the ready queue of this cluster
    250266                                        __attribute__((unused)) bool removed = remove_head( &cltr, &thrd );
    251267                                        /* paranoid */ verify( removed );
     
    263279                        }
    264280                        // !!! This is not an else if !!!
     281                        // Ok, now the thread is blocked (whether we cheated to get here or not)
    265282                        if( thrd.state == Blocked ) {
    266 
    267283                                // This is the "easy case"
    268284                                // The thread is parked and can easily be moved to active cluster
     
    274290                        }
    275291                        else {
    276 
    277292                                // The thread is in a weird state
    278293                                // I don't know what to do here
    279294                                abort("io_context poller thread is in unexpected state, cannot clean-up correctly\n");
    280295                        }
     296
     297                        // The weird thread kidnapping stuff is over, restore interrupts.
     298                        enable_interrupts( __cfaabi_dbg_ctx );
    281299                } else {
    282300                        post( this.thrd.sem );
     
    365383                }
    366384
     385                // Step 3 : Initialize the data structure
    367386                // Get the pointers from the kernel to fill the structure
    368387                // submit queue
     
    379398                        const __u32 num = *sq.num;
    380399                        for( i; num ) {
    381                                 sq.sqes[i].user_data = 0ul64;
     400                                __sqe_clean( &sq.sqes[i] );
    382401                        }
    383402                }
     
    409428                cq.cqes = (struct io_uring_cqe *)(((intptr_t)cq.ring_ptr) + params.cq_off.cqes);
    410429
     430                // Step 4 : eventfd
     431                int efd;
     432                for() {
     433                        efd = eventfd(0, 0);
     434                        if (efd < 0) {
     435                                if (errno == EINTR) continue;
     436                                abort("KERNEL ERROR: IO_URING EVENTFD - %s\n", strerror(errno));
     437                        }
     438                        break;
     439                }
     440
     441                int ret;
     442                for() {
     443                        ret = syscall( __NR_io_uring_register, fd, IORING_REGISTER_EVENTFD, &efd, 1);
     444                        if (ret < 0) {
     445                                if (errno == EINTR) continue;
     446                                abort("KERNEL ERROR: IO_URING EVENTFD REGISTER - %s\n", strerror(errno));
     447                        }
     448                        break;
     449                }
     450
    411451                // some paranoid checks
    412452                /* paranoid */ verifyf( (*cq.mask) == ((*cq.num) - 1ul32), "IO_URING Expected mask to be %u (%u entries), was %u", (*cq.num) - 1ul32, *cq.num, *cq.mask  );
     
    423463                this.ring_flags = params.flags;
    424464                this.fd         = fd;
     465                this.efd        = efd;
    425466                this.eager_submits  = params_in.eager_submits;
    426467                this.poller_submits = params_in.poller_submits;
     
    445486                // close the file descriptor
    446487                close(this.fd);
     488                close(this.efd);
    447489
    448490                free( this.submit_q.ready ); // Maybe null, doesn't matter
     
    452494// I/O Context Sleep
    453495//=============================================================================================
    454 
    455         void __ioctx_register($io_ctx_thread & ctx, struct epoll_event & ev) {
    456                 ev.events = EPOLLIN | EPOLLONESHOT;
     496        #define IOEVENTS EPOLLIN | EPOLLONESHOT
     497
     498        static inline void __ioctx_epoll_ctl($io_ctx_thread & ctx, int op, const char * error) {
     499                struct epoll_event ev;
     500                ev.events = IOEVENTS;
    457501                ev.data.u64 = (__u64)&ctx;
    458                 int ret = epoll_ctl(iopoll.epollfd, EPOLL_CTL_ADD, ctx.ring->fd, &ev);
     502                int ret = epoll_ctl(iopoll.epollfd, op, ctx.ring->efd, &ev);
    459503                if (ret < 0) {
    460                         abort( "KERNEL ERROR: EPOLL ADD - (%d) %s\n", (int)errno, strerror(errno) );
    461                 }
    462         }
    463 
    464         void __ioctx_prepare_block($io_ctx_thread & ctx, struct epoll_event & ev) {
    465                 int ret = epoll_ctl(iopoll.epollfd, EPOLL_CTL_MOD, ctx.ring->fd, &ev);
    466                 if (ret < 0) {
    467                         abort( "KERNEL ERROR: EPOLL REARM - (%d) %s\n", (int)errno, strerror(errno) );
    468                 }
     504                        abort( "KERNEL ERROR: EPOLL %s - (%d) %s\n", error, (int)errno, strerror(errno) );
     505                }
     506        }
     507
     508        void __ioctx_register($io_ctx_thread & ctx) {
     509                __ioctx_epoll_ctl(ctx, EPOLL_CTL_ADD, "ADD");
     510        }
     511
     512        void __ioctx_prepare_block($io_ctx_thread & ctx) {
     513                __cfadbg_print_safe(io_core, "Kernel I/O - epoll : Re-arming io poller %d (%p)\n", ctx.ring->fd, &ctx);
     514                __ioctx_epoll_ctl(ctx, EPOLL_CTL_MOD, "REARM");
    469515        }
    470516
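__ioctx_register and __ioctx_prepare_block now funnel through one helper and, crucially, poll the new eventfd (ctx.ring->efd) instead of the ring fd itself. Because the interest is registered EPOLLONESHOT, each wakeup disarms the event, and the poller must re-arm with EPOLL_CTL_MOD before blocking again. A hedged sketch of that arm/re-arm cycle; epfd, efd, and ctx are placeholders for the poller's state:

	#include <stdint.h>
	#include <sys/epoll.h>

	#define IOEVENTS (EPOLLIN | EPOLLONESHOT)

	// Initial registration: one-shot, so the event fires once, then disarms.
	static int arm( int epfd, int efd, void * ctx ) {
		struct epoll_event ev = { .events = IOEVENTS,
		                          .data.u64 = (uint64_t)(uintptr_t)ctx };
		return epoll_ctl( epfd, EPOLL_CTL_ADD, efd, &ev );
	}

	// Re-arm before sleeping again: EPOLL_CTL_MOD resets the one-shot latch.
	static int rearm( int epfd, int efd, void * ctx ) {
		struct epoll_event ev = { .events = IOEVENTS,
		                          .data.u64 = (uint64_t)(uintptr_t)ctx };
		return epoll_ctl( epfd, EPOLL_CTL_MOD, efd, &ev );
	}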
  • libcfa/src/concurrency/io/types.hfa

    r4468a70 r342af53  
    6565
    6666                // A buffer of sqes (not the actual ring)
    67                 struct io_uring_sqe * sqes;
     67                volatile struct io_uring_sqe * sqes;
    6868
    6969                // The location and size of the mmaped area
     
    8585
    8686                // the kernel ring
    87                 struct io_uring_cqe * cqes;
     87                volatile struct io_uring_cqe * cqes;
    8888
    8989                // The location and size of the mmaped area
     
    9797                __u32 ring_flags;
    9898                int fd;
     99                int efd;
    99100                bool eager_submits:1;
    100101                bool poller_submits:1;
     
    130131        #endif
    131132
    132         struct epoll_event;
    133133        struct $io_ctx_thread;
    134         void __ioctx_register($io_ctx_thread & ctx, struct epoll_event & ev);
    135         void __ioctx_prepare_block($io_ctx_thread & ctx, struct epoll_event & ev);
     134        void __ioctx_register($io_ctx_thread & ctx);
     135        void __ioctx_prepare_block($io_ctx_thread & ctx);
     136        void __sqe_clean( volatile struct io_uring_sqe * sqe );
    136137#endif
    137138
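The sqes and cqes arrays gain volatile because they live in memory mmap'ed from the kernel, which fills completion entries asynchronously; without the qualifier the compiler may cache a stale entry across the head/tail check. A toy sketch of the resulting read pattern, with khead and mask as stand-ins for the ring's shared indices:

	#include <stdatomic.h>
	#include <stdint.h>

	struct cqe { uint64_t user_data; int32_t res; uint32_t flags; };

	struct cq_ring {
		_Atomic uint32_t * khead;       // consumer index, shared with the kernel
		uint32_t mask;                  // entry count - 1 (power of two)
		volatile struct cqe * cqes;     // kernel stores completions here
	};

	// Copy out the completion at head; the caller already checked head != tail.
	static void pop( struct cq_ring * cq, struct cqe * out ) {
		uint32_t head = atomic_load_explicit( cq->khead, memory_order_acquire );
		*out = cq->cqes[ head & cq->mask ];   // volatile read: never cached
		atomic_store_explicit( cq->khead, head + 1, memory_order_release );
	}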
  • libcfa/src/concurrency/monitor.hfa

    r4468a70 r342af53  
    132132
    133133              void wait        ( condition & this, uintptr_t user_info = 0 );
     134static inline bool is_empty    ( condition & this ) { return this.blocked.head == 1p; }
    134135              bool signal      ( condition & this );
    135136              bool signal_block( condition & this );
    136 static inline bool is_empty    ( condition & this ) { return this.blocked.head == 1p; }
     137static inline bool signal_all  ( condition & this ) { bool ret = false; while(!is_empty(this)) { ret = signal(this) || ret; } return ret; }
    137138         uintptr_t front       ( condition & this );
    138139
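signal_all drains a condition by signalling until is_empty holds, OR-ing the results so the caller learns whether anything was actually woken. The same idiom, distilled into a stand-alone C mock (the counter and signal_one are toy stand-ins, not the monitor implementation):

	#include <stdbool.h>
	#include <stdio.h>

	static int blocked = 3;                          // pretend three threads wait
	static bool is_empty( void ) { return blocked == 0; }
	static bool signal_one( void ) {                 // wake a single waiter
		if ( blocked == 0 ) return false;
		blocked -= 1;
		return true;
	}

	// Same shape as the new signal_all: true iff at least one waiter was woken.
	static bool signal_all( void ) {
		bool ret = false;
		while ( ! is_empty() ) ret = signal_one() || ret;
		return ret;
	}

	int main( void ) {
		printf( "woke anyone? %s\n", signal_all() ? "yes" : "no" ); // yes
	}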
  • libcfa/src/heap.cfa

    r4468a70 r342af53  
    1010// Created On       : Tue Dec 19 21:58:35 2017
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Wed Dec 16 12:28:25 2020
    13 // Update Count     : 1023
     12// Last Modified On : Sun Jan 10 11:20:49 2021
     13// Update Count     : 1031
    1414//
    1515
     
    262262#ifdef __STATISTICS__
    263263// Heap statistics counters.
    264 static unsigned int malloc_calls;
     264static unsigned int malloc_zero_calls, malloc_calls;
    265265static unsigned long long int malloc_storage;
    266 static unsigned int aalloc_calls;
     266static unsigned int aalloc_zero_calls, aalloc_calls;
    267267static unsigned long long int aalloc_storage;
    268 static unsigned int calloc_calls;
     268static unsigned int calloc_zero_calls, calloc_calls;
    269269static unsigned long long int calloc_storage;
    270 static unsigned int memalign_calls;
     270static unsigned int memalign_zero_calls, memalign_calls;
    271271static unsigned long long int memalign_storage;
    272 static unsigned int amemalign_calls;
     272static unsigned int amemalign_zero_calls, amemalign_calls;
    273273static unsigned long long int amemalign_storage;
    274 static unsigned int cmemalign_calls;
     274static unsigned int cmemalign_zero_calls, cmemalign_calls;
    275275static unsigned long long int cmemalign_storage;
    276 static unsigned int resize_calls;
     276static unsigned int resize_zero_calls, resize_calls;
    277277static unsigned long long int resize_storage;
    278 static unsigned int realloc_calls;
     278static unsigned int realloc_zero_calls, realloc_calls;
    279279static unsigned long long int realloc_storage;
    280 static unsigned int free_calls;
     280static unsigned int free_zero_calls, free_calls;
    281281static unsigned long long int free_storage;
    282282static unsigned int mmap_calls;
     
    287287static unsigned long long int sbrk_storage;
    288288// Statistics file descriptor (changed by malloc_stats_fd).
    289 static int stat_fd = STDERR_FILENO;                                             // default stderr
     289static int stats_fd = STDERR_FILENO;                                    // default stderr
    290290
    291291// Use "write" because streams may be shutdown when calls are made.
     
    293293        char helpText[1024];
    294294        __cfaabi_bits_print_buffer( STDERR_FILENO, helpText, sizeof(helpText),
    295                                                                         "\nHeap statistics:\n"
    296                                                                         "  malloc: calls %u / storage %llu\n"
    297                                                                         "  aalloc: calls %u / storage %llu\n"
    298                                                                         "  calloc: calls %u / storage %llu\n"
    299                                                                         "  memalign: calls %u / storage %llu\n"
    300                                                                         "  amemalign: calls %u / storage %llu\n"
    301                                                                         "  cmemalign: calls %u / storage %llu\n"
    302                                                                         "  resize: calls %u / storage %llu\n"
    303                                                                         "  realloc: calls %u / storage %llu\n"
    304                                                                         "  free: calls %u / storage %llu\n"
    305                                                                         "  mmap: calls %u / storage %llu\n"
    306                                                                         "  munmap: calls %u / storage %llu\n"
    307                                                                         "  sbrk: calls %u / storage %llu\n",
    308                                                                         malloc_calls, malloc_storage,
    309                                                                         aalloc_calls, aalloc_storage,
    310                                                                         calloc_calls, calloc_storage,
    311                                                                         memalign_calls, memalign_storage,
    312                                                                         amemalign_calls, amemalign_storage,
    313                                                                         cmemalign_calls, cmemalign_storage,
    314                                                                         resize_calls, resize_storage,
    315                                                                         realloc_calls, realloc_storage,
    316                                                                         free_calls, free_storage,
    317                                                                         mmap_calls, mmap_storage,
    318                                                                         munmap_calls, munmap_storage,
    319                                                                         sbrk_calls, sbrk_storage
     295                                                                "\nHeap statistics:\n"
     296                                                                "  malloc    0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
     297                                                                "  aalloc    0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
     298                                                                "  calloc    0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
     299                                                                "  memalign  0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
     300                                                                "  amemalign 0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
     301                                                                "  cmemalign 0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
     302                                                                "  resize    0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
     303                                                                "  realloc   0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
     304                                                                "  free      0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
     305                                                                "  mmap      calls %'u; storage %'llu bytes\n"
     306                                                                "  munmap    calls %'u; storage %'llu bytes\n"
     307                                                                "  sbrk      calls %'u; storage %'llu bytes\n",
     308                                                                malloc_zero_calls, malloc_calls, malloc_storage,
     309                                                                aalloc_zero_calls, aalloc_calls, aalloc_storage,
     310                                                                calloc_zero_calls, calloc_calls, calloc_storage,
     311                                                                memalign_zero_calls, memalign_calls, memalign_storage,
     312                                                                amemalign_zero_calls, amemalign_calls, amemalign_storage,
     313                                                                cmemalign_zero_calls, cmemalign_calls, cmemalign_storage,
     314                                                                resize_zero_calls, resize_calls, resize_storage,
     315                                                                realloc_zero_calls, realloc_calls, realloc_storage,
     316                                                                free_zero_calls, free_calls, free_storage,
     317                                                                mmap_calls, mmap_storage,
     318                                                                munmap_calls, munmap_storage,
     319                                                                sbrk_calls, sbrk_storage
    320320                );
    321321} // printStats
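The reworked format strings use the POSIX %' flag, which adds locale-specific digit grouping to the counters; it only has a visible effect once LC_NUMERIC names a locale with a grouping rule (see the startup.cfa change below). A quick probe, assuming an installed en_US.UTF-8 locale:

	#include <locale.h>
	#include <stdio.h>

	int main( void ) {
		printf( "%'u\n", 1234567u );                  // "C" locale: 1234567
		if ( setlocale( LC_NUMERIC, "en_US.UTF-8" ) ) // assumed installed
			printf( "%'u\n", 1234567u );              // grouped: 1,234,567
	}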
     
    328328                                                "<sizes>\n"
    329329                                                "</sizes>\n"
    330                                                 "<total type=\"malloc\" count=\"%u\" size=\"%llu\"/>\n"
    331                                                 "<total type=\"aalloc\" count=\"%u\" size=\"%llu\"/>\n"
    332                                                 "<total type=\"calloc\" count=\"%u\" size=\"%llu\"/>\n"
    333                                                 "<total type=\"memalign\" count=\"%u\" size=\"%llu\"/>\n"
    334                                                 "<total type=\"amemalign\" count=\"%u\" size=\"%llu\"/>\n"
    335                                                 "<total type=\"cmemalign\" count=\"%u\" size=\"%llu\"/>\n"
    336                                                 "<total type=\"resize\" count=\"%u\" size=\"%llu\"/>\n"
    337                                                 "<total type=\"realloc\" count=\"%u\" size=\"%llu\"/>\n"
    338                                                 "<total type=\"free\" count=\"%u\" size=\"%llu\"/>\n"
    339                                                 "<total type=\"mmap\" count=\"%u\" size=\"%llu\"/>\n"
    340                                                 "<total type=\"munmap\" count=\"%u\" size=\"%llu\"/>\n"
    341                                                 "<total type=\"sbrk\" count=\"%u\" size=\"%llu\"/>\n"
     330                                                "<total type=\"malloc\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
     331                                                "<total type=\"aalloc\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
     332                                                "<total type=\"calloc\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
     333                                                "<total type=\"memalign\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
     334                                                "<total type=\"amemalign\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
     335                                                "<total type=\"cmemalign\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
     336                                                "<total type=\"resize\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
     337                                                "<total type=\"realloc\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
     338                                                "<total type=\"free\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
     339                                                "<total type=\"mmap\" count=\"%'u;\" size=\"%'llu\"/> bytes\n"
     340                                                "<total type=\"munmap\" count=\"%'u;\" size=\"%'llu\"/> bytes\n"
     341                                                "<total type=\"sbrk\" count=\"%'u;\" size=\"%'llu\"/> bytes\n"
    342342                                                "</malloc>",
    343                                                 malloc_calls, malloc_storage,
    344                                                 aalloc_calls, aalloc_storage,
    345                                                 calloc_calls, calloc_storage,
    346                                                 memalign_calls, memalign_storage,
    347                                                 amemalign_calls, amemalign_storage,
    348                                                 cmemalign_calls, cmemalign_storage,
    349                                                 resize_calls, resize_storage,
    350                                                 realloc_calls, realloc_storage,
    351                                                 free_calls, free_storage,
     343                                                malloc_zero_calls, malloc_calls, malloc_storage,
     344                                                aalloc_zero_calls, aalloc_calls, aalloc_storage,
     345                                                calloc_zero_calls, calloc_calls, calloc_storage,
     346                                                memalign_zero_calls, memalign_calls, memalign_storage,
     347                                                amemalign_zero_calls, amemalign_calls, amemalign_storage,
     348                                                cmemalign_zero_calls, cmemalign_calls, cmemalign_storage,
     349                                                resize_zero_calls, resize_calls, resize_storage,
     350                                                realloc_zero_calls, realloc_calls, realloc_storage,
     351                                                free_zero_calls, free_calls, free_storage,
    352352                                                mmap_calls, mmap_storage,
    353353                                                munmap_calls, munmap_storage,
     
    466466} // headers
    467467
    468 #ifdef __CFA_DEBUG__
    469 #if __SIZEOF_POINTER__ == 4
    470 #define MASK 0xdeadbeef
    471 #else
    472 #define MASK 0xdeadbeefdeadbeef
    473 #endif
    474 #define STRIDE size_t
    475 
    476 static void * Memset( void * addr, STRIDE size ) {              // debug only
    477         if ( size % sizeof(STRIDE) != 0 ) abort( "Memset() : internal error, size %zd not multiple of %zd.", size, sizeof(STRIDE) );
    478         if ( (STRIDE)addr % sizeof(STRIDE) != 0 ) abort( "Memset() : internal error, addr %p not multiple of %zd.", addr, sizeof(STRIDE) );
    479 
    480         STRIDE * end = (STRIDE *)addr + size / sizeof(STRIDE);
    481         for ( STRIDE * p = (STRIDE *)addr; p < end; p += 1 ) *p = MASK;
    482         return addr;
    483 } // Memset
    484 #endif // __CFA_DEBUG__
     468// #ifdef __CFA_DEBUG__
     469// #if __SIZEOF_POINTER__ == 4
     470// #define MASK 0xdeadbeef
     471// #else
     472// #define MASK 0xdeadbeefdeadbeef
     473// #endif
     474// #define STRIDE size_t
     475
     476// static void * Memset( void * addr, STRIDE size ) {           // debug only
     477//      if ( size % sizeof(STRIDE) != 0 ) abort( "Memset() : internal error, size %zd not multiple of %zd.", size, sizeof(STRIDE) );
     478//      if ( (STRIDE)addr % sizeof(STRIDE) != 0 ) abort( "Memset() : internal error, addr %p not multiple of %zd.", addr, sizeof(STRIDE) );
     479
     480//      STRIDE * end = (STRIDE *)addr + size / sizeof(STRIDE);
     481//      for ( STRIDE * p = (STRIDE *)addr; p < end; p += 1 ) *p = MASK;
     482//      return addr;
     483// } // Memset
     484// #endif // __CFA_DEBUG__
    485485
    486486
     
    498498                        unlock( extlock );
    499499                        __cfaabi_bits_print_nolock( STDERR_FILENO, NO_MEMORY_MSG, size );
    500                         _exit( EXIT_FAILURE );
    501                 } // if
     500                        _exit( EXIT_FAILURE );                                          // give up
     501                } // if
     502                // Make storage executable for thunks.
    502503                if ( mprotect( (char *)heapEnd + heapRemaining, increase, __map_prot ) ) {
    503504                        unlock( extlock );
     
    770771
    771772
    772 static inline void * callocNoStats( size_t dim, size_t elemSize ) {
    773         size_t size = dim * elemSize;
    774   if ( unlikely( size ) == 0 ) return 0p;                               // 0 BYTE ALLOCATION RETURNS NULL POINTER
    775         char * addr = (char *)mallocNoStats( size );
    776 
    777         HeapManager.Storage.Header * header;
    778         HeapManager.FreeHeader * freeElem;
    779         size_t bsize, alignment;
    780         #ifndef __CFA_DEBUG__
    781         bool mapped =
    782         #endif // __CFA_DEBUG__
    783                 headers( "calloc", addr, header, freeElem, bsize, alignment );
    784         #ifndef __CFA_DEBUG__
    785 
    786         // Mapped storage is zero filled, but in debug mode mapped memory is scrubbed in doMalloc, so it has to be reset to zero.
    787         if ( ! mapped )
    788         #endif // __CFA_DEBUG__
    789                 // <-------0000000000000000000000000000UUUUUUUUUUUUUUUUUUUUUUUUU> bsize (bucket size) U => undefined
    790                 // `-header`-addr                      `-size
    791                 memset( addr, '\0', size );                                             // set to zeros
    792 
    793         header->kind.real.blockSize |= 2;                                       // mark as zero filled
    794         return addr;
    795 } // callocNoStats
    796 
    797 
    798773static inline void * memalignNoStats( size_t alignment, size_t size ) {
    799774  if ( unlikely( size ) == 0 ) return 0p;                               // 0 BYTE ALLOCATION RETURNS NULL POINTER
     
    834809
    835810
    836 static inline void * cmemalignNoStats( size_t alignment, size_t dim, size_t elemSize ) {
    837         size_t size = dim * elemSize;
    838   if ( unlikely( size ) == 0 ) return 0p;                               // 0 BYTE ALLOCATION RETURNS NULL POINTER
    839         char * addr = (char *)memalignNoStats( alignment, size );
    840 
    841         HeapManager.Storage.Header * header;
    842         HeapManager.FreeHeader * freeElem;
    843         size_t bsize;
    844         #ifndef __CFA_DEBUG__
    845         bool mapped =
    846         #endif // __CFA_DEBUG__
    847                 headers( "cmemalign", addr, header, freeElem, bsize, alignment );
    848 
    849         // Mapped storage is zero filled, but in debug mode mapped memory is scrubbed in doMalloc, so it has to be reset to zero.
    850         #ifndef __CFA_DEBUG__
    851         if ( ! mapped )
    852         #endif // __CFA_DEBUG__
    853                 // <-------0000000000000000000000000000UUUUUUUUUUUUUUUUUUUUUUUUU> bsize (bucket size) U => undefined
    854                 // `-header`-addr                      `-size
    855                 memset( addr, '\0', size );                                             // set to zeros
    856 
    857         header->kind.real.blockSize |= 2;                                       // mark as zero filled
    858         return addr;
    859 } // cmemalignNoStats
    860 
    861 
    862811extern "C" {
    863812        // Allocates size bytes and returns a pointer to the allocated memory.  The contents are undefined. If size is 0,
     
    865814        void * malloc( size_t size ) {
    866815                #ifdef __STATISTICS__
    867                 __atomic_add_fetch( &malloc_calls, 1, __ATOMIC_SEQ_CST );
    868                 __atomic_add_fetch( &malloc_storage, size, __ATOMIC_SEQ_CST );
     816                if ( likely( size > 0 ) ) {
     817                        __atomic_add_fetch( &malloc_calls, 1, __ATOMIC_SEQ_CST );
     818                        __atomic_add_fetch( &malloc_storage, size, __ATOMIC_SEQ_CST );
     819                } else {
     820                        __atomic_add_fetch( &malloc_zero_calls, 1, __ATOMIC_SEQ_CST );
     821                } // if
    869822                #endif // __STATISTICS__
    870823
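Every allocator entry point now splits its bookkeeping the same way: real allocations bump the call and storage counters, zero-byte requests bump a dedicated *_zero_calls counter, and all updates are atomic so concurrent threads cannot lose increments. The pattern in miniature, using C11 atomics in place of the GCC __atomic builtins (counted_malloc is a hypothetical wrapper):

	#include <stdatomic.h>
	#include <stdlib.h>

	static _Atomic unsigned int malloc_calls, malloc_zero_calls;
	static _Atomic unsigned long long malloc_storage;

	void * counted_malloc( size_t size ) {
		if ( size > 0 ) {                     // likely path
			atomic_fetch_add( &malloc_calls, 1 );
			atomic_fetch_add( &malloc_storage, size );
		} else {                              // zero-byte requests counted apart
			atomic_fetch_add( &malloc_zero_calls, 1 );
		}
		return malloc( size );
	}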
     
    877830                size_t size = dim * elemSize;
    878831                #ifdef __STATISTICS__
    879                 __atomic_add_fetch( &aalloc_calls, 1, __ATOMIC_SEQ_CST );
    880                 __atomic_add_fetch( &aalloc_storage, size, __ATOMIC_SEQ_CST );
     832                if ( likely( size > 0 ) ) {
     833                        __atomic_add_fetch( &aalloc_calls, 1, __ATOMIC_SEQ_CST );
     834                        __atomic_add_fetch( &aalloc_storage, size, __ATOMIC_SEQ_CST );
     835                } else {
     836                        __atomic_add_fetch( &aalloc_zero_calls, 1, __ATOMIC_SEQ_CST );
     837                } // if
    881838                #endif // __STATISTICS__
    882839
     
    887844        // Same as aalloc() with memory set to zero.
    888845        void * calloc( size_t dim, size_t elemSize ) {
     846                size_t size = dim * elemSize;
      847          if ( unlikely( size == 0 ) ) {                        // 0 BYTE ALLOCATION RETURNS NULL POINTER
     848                        #ifdef __STATISTICS__
     849                        __atomic_add_fetch( &calloc_zero_calls, 1, __ATOMIC_SEQ_CST );
     850                        #endif // __STATISTICS__
     851                        return 0p;
     852                } // if
    889853                #ifdef __STATISTICS__
    890854                __atomic_add_fetch( &calloc_calls, 1, __ATOMIC_SEQ_CST );
     
    892856                #endif // __STATISTICS__
    893857
    894                 return callocNoStats( dim, elemSize );
     858                char * addr = (char *)mallocNoStats( size );
     859
     860                HeapManager.Storage.Header * header;
     861                HeapManager.FreeHeader * freeElem;
     862                size_t bsize, alignment;
     863
     864                #ifndef __CFA_DEBUG__
     865                bool mapped =
     866                        #endif // __CFA_DEBUG__
     867                        headers( "calloc", addr, header, freeElem, bsize, alignment );
     868
     869                #ifndef __CFA_DEBUG__
     870                // Mapped storage is zero filled, but in debug mode mapped memory is scrubbed in doMalloc, so it has to be reset to zero.
     871                if ( ! mapped )
     872                #endif // __CFA_DEBUG__
     873                        // <-------0000000000000000000000000000UUUUUUUUUUUUUUUUUUUUUUUUU> bsize (bucket size) U => undefined
     874                        // `-header`-addr                      `-size
     875                        memset( addr, '\0', size );                                     // set to zeros
     876
     877                header->kind.real.blockSize |= 2;                               // mark as zero filled
     878                return addr;
    895879        } // calloc
    896880
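callocNoStats is gone; its body now lives directly in calloc. The retained optimization is the interesting part: storage obtained from mmap is skipped by the memset, because the kernel hands out anonymous mappings already zero-filled (debug builds scrub all memory in doMalloc, so there the memset always runs). A stand-alone check of the property being relied on:

	#include <assert.h>
	#include <stdio.h>
	#include <sys/mman.h>

	int main( void ) {
		// Fresh anonymous mappings are zero-filled by the kernel, so a
		// calloc built on mmap may skip its memset for large requests.
		size_t len = 1 << 20;
		unsigned char * p = mmap( 0, len, PROT_READ | PROT_WRITE,
		                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0 );
		assert( p != MAP_FAILED );
		for ( size_t i = 0; i < len; i += 4096 ) assert( p[i] == 0 );
		puts( "all zero" );
		munmap( p, len );
	}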
     
    901885        // call to malloc(), alloc(), calloc() or realloc(). If the area pointed to was moved, a free(oaddr) is done.
    902886        void * resize( void * oaddr, size_t size ) {
     887                // If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
     888          if ( unlikely( size == 0 ) ) {                                        // special cases
     889                        #ifdef __STATISTICS__
     890                        __atomic_add_fetch( &resize_zero_calls, 1, __ATOMIC_SEQ_CST );
     891                        #endif // __STATISTICS__
     892                        free( oaddr );
     893                        return 0p;
     894                } // if
    903895                #ifdef __STATISTICS__
    904896                __atomic_add_fetch( &resize_calls, 1, __ATOMIC_SEQ_CST );
    905897                #endif // __STATISTICS__
    906898
    907                 // If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
    908           if ( unlikely( size == 0 ) ) { free( oaddr ); return 0p; } // special cases
    909899          if ( unlikely( oaddr == 0p ) ) {
    910900                        #ifdef __STATISTICS__
     
    918908                size_t bsize, oalign;
    919909                headers( "resize", oaddr, header, freeElem, bsize, oalign );
     910
    920911                size_t odsize = dataStorage( bsize, oaddr, header ); // data storage available in bucket
    921 
    922912                // same size, DO NOT preserve STICKY PROPERTIES.
    923913                if ( oalign == libAlign() && size <= odsize && odsize <= size * 2 ) { // allow 50% wasted storage for smaller size
     
    940930        // the old and new sizes.
    941931        void * realloc( void * oaddr, size_t size ) {
     932                // If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
     933          if ( unlikely( size == 0 ) ) {                                        // special cases
     934                        #ifdef __STATISTICS__
     935                        __atomic_add_fetch( &realloc_zero_calls, 1, __ATOMIC_SEQ_CST );
     936                        #endif // __STATISTICS__
     937                        free( oaddr );
     938                        return 0p;
     939                } // if
    942940                #ifdef __STATISTICS__
    943941                __atomic_add_fetch( &realloc_calls, 1, __ATOMIC_SEQ_CST );
    944942                #endif // __STATISTICS__
    945943
    946                 // If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
    947           if ( unlikely( size == 0 ) ) { free( oaddr ); return 0p; } // special cases
    948944          if ( unlikely( oaddr == 0p ) ) {
    949945                        #ifdef __STATISTICS__
     
    999995        void * memalign( size_t alignment, size_t size ) {
    1000996                #ifdef __STATISTICS__
    1001                 __atomic_add_fetch( &memalign_calls, 1, __ATOMIC_SEQ_CST );
    1002                 __atomic_add_fetch( &memalign_storage, size, __ATOMIC_SEQ_CST );
     997                if ( likely( size > 0 ) ) {
     998                        __atomic_add_fetch( &memalign_calls, 1, __ATOMIC_SEQ_CST );
     999                        __atomic_add_fetch( &memalign_storage, size, __ATOMIC_SEQ_CST );
     1000                } else {
     1001                        __atomic_add_fetch( &memalign_zero_calls, 1, __ATOMIC_SEQ_CST );
     1002                } // if
    10031003                #endif // __STATISTICS__
    10041004
     
    10111011                size_t size = dim * elemSize;
    10121012                #ifdef __STATISTICS__
    1013                 __atomic_add_fetch( &cmemalign_calls, 1, __ATOMIC_SEQ_CST );
    1014                 __atomic_add_fetch( &cmemalign_storage, size, __ATOMIC_SEQ_CST );
     1013                if ( likely( size > 0 ) ) {
     1014                        __atomic_add_fetch( &cmemalign_calls, 1, __ATOMIC_SEQ_CST );
     1015                        __atomic_add_fetch( &cmemalign_storage, size, __ATOMIC_SEQ_CST );
     1016                } else {
     1017                        __atomic_add_fetch( &cmemalign_zero_calls, 1, __ATOMIC_SEQ_CST );
     1018                } // if
    10151019                #endif // __STATISTICS__
    10161020
     
    10211025        // Same as calloc() with memory alignment.
    10221026        void * cmemalign( size_t alignment, size_t dim, size_t elemSize ) {
     1027                size_t size = dim * elemSize;
      1028          if ( unlikely( size == 0 ) ) {                                        // 0 BYTE ALLOCATION RETURNS NULL POINTER
     1029                        #ifdef __STATISTICS__
     1030                        __atomic_add_fetch( &cmemalign_zero_calls, 1, __ATOMIC_SEQ_CST );
     1031                        #endif // __STATISTICS__
     1032                        return 0p;
     1033                } // if
    10231034                #ifdef __STATISTICS__
    10241035                __atomic_add_fetch( &cmemalign_calls, 1, __ATOMIC_SEQ_CST );
     
    10261037                #endif // __STATISTICS__
    10271038
    1028                 return cmemalignNoStats( alignment, dim, elemSize );
     1039                char * addr = (char *)memalignNoStats( alignment, size );
     1040
     1041                HeapManager.Storage.Header * header;
     1042                HeapManager.FreeHeader * freeElem;
     1043                size_t bsize;
     1044
     1045                #ifndef __CFA_DEBUG__
     1046                bool mapped =
     1047                        #endif // __CFA_DEBUG__
     1048                        headers( "cmemalign", addr, header, freeElem, bsize, alignment );
     1049
     1050                // Mapped storage is zero filled, but in debug mode mapped memory is scrubbed in doMalloc, so it has to be reset to zero.
     1051                #ifndef __CFA_DEBUG__
     1052                if ( ! mapped )
     1053                #endif // __CFA_DEBUG__
     1054                        // <-------0000000000000000000000000000UUUUUUUUUUUUUUUUUUUUUUUUU> bsize (bucket size) U => undefined
     1055                        // `-header`-addr                      `-size
     1056                        memset( addr, '\0', size );                                     // set to zeros
     1057
     1058                header->kind.real.blockSize |= 2;                               // mark as zero filled
     1059                return addr;
    10291060        } // cmemalign
    10301061
     
    10651096        // 0p, no operation is performed.
    10661097        void free( void * addr ) {
    1067                 #ifdef __STATISTICS__
    1068                 __atomic_add_fetch( &free_calls, 1, __ATOMIC_SEQ_CST );
    1069                 #endif // __STATISTICS__
    1070 
    10711098          if ( unlikely( addr == 0p ) ) {                                       // special case
     1099                        #ifdef __STATISTICS__
     1100                        __atomic_add_fetch( &free_zero_calls, 1, __ATOMIC_SEQ_CST );
     1101                        #endif // __STATISTICS__
     1102
    10721103                        // #ifdef __CFA_DEBUG__
    10731104                        // if ( traceHeap() ) {
     
    11821213        int malloc_stats_fd( int fd __attribute__(( unused )) ) {
    11831214                #ifdef __STATISTICS__
    1184                 int temp = stat_fd;
    1185                 stat_fd = fd;
     1215                int temp = stats_fd;
     1216                stats_fd = fd;
    11861217                return temp;
    11871218                #else
     
    12141245        // The string is printed on the file stream stream.  The exported string includes information about all arenas (see
    12151246        // malloc).
    1216         int malloc_info( int options, FILE * stream ) {
     1247        int malloc_info( int options, FILE * stream __attribute__(( unused )) ) {
    12171248          if ( options != 0 ) { errno = EINVAL; return -1; }
    12181249                #ifdef __STATISTICS__
     
    12431274// Must have CFA linkage to overload with C linkage realloc.
    12441275void * resize( void * oaddr, size_t nalign, size_t size ) {
    1245         #ifdef __STATISTICS__
    1246         __atomic_add_fetch( &resize_calls, 1, __ATOMIC_SEQ_CST );
    1247         #endif // __STATISTICS__
     1276        // If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
     1277  if ( unlikely( size == 0 ) ) {                                                // special cases
     1278                #ifdef __STATISTICS__
     1279                __atomic_add_fetch( &resize_zero_calls, 1, __ATOMIC_SEQ_CST );
     1280                #endif // __STATISTICS__
     1281                free( oaddr );
     1282                return 0p;
     1283        } // if
    12481284
    12491285        if ( unlikely( nalign < libAlign() ) ) nalign = libAlign(); // reset alignment to minimum
    12501286        #ifdef __CFA_DEBUG__
    1251         else
    1252                 checkAlign( nalign );                                                   // check alignment
     1287        else checkAlign( nalign );                                                      // check alignment
    12531288        #endif // __CFA_DEBUG__
    12541289
    1255         // If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
    1256   if ( unlikely( size == 0 ) ) { free( oaddr ); return 0p; } // special cases
    12571290  if ( unlikely( oaddr == 0p ) ) {
    12581291                #ifdef __STATISTICS__
     1292                __atomic_add_fetch( &resize_calls, 1, __ATOMIC_SEQ_CST );
    12591293                __atomic_add_fetch( &resize_storage, size, __ATOMIC_SEQ_CST );
    12601294                #endif // __STATISTICS__
     
    13021336
    13031337void * realloc( void * oaddr, size_t nalign, size_t size ) {
     1338        // If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
     1339  if ( unlikely( size == 0 ) ) {                                                // special cases
     1340                #ifdef __STATISTICS__
     1341                __atomic_add_fetch( &realloc_zero_calls, 1, __ATOMIC_SEQ_CST );
     1342                #endif // __STATISTICS__
     1343                free( oaddr );
     1344                return 0p;
     1345        } // if
     1346
    13041347        if ( unlikely( nalign < libAlign() ) ) nalign = libAlign(); // reset alignment to minimum
    13051348        #ifdef __CFA_DEBUG__
    1306         else
    1307                 checkAlign( nalign );                                                   // check alignment
     1349        else checkAlign( nalign );                                                      // check alignment
    13081350        #endif // __CFA_DEBUG__
    13091351
    1310         // If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
    1311   if ( unlikely( size == 0 ) ) { free( oaddr ); return 0p; } // special cases
    13121352  if ( unlikely( oaddr == 0p ) ) {
    13131353                #ifdef __STATISTICS__
  • libcfa/src/startup.cfa

    r4468a70 r342af53  
    1010// Created On       : Tue Jul 24 16:21:57 2018
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Tue Feb  4 13:03:18 2020
    13 // Update Count     : 30
     12// Last Modified On : Sat Jan  9 23:18:23 2021
     13// Update Count     : 34
    1414//
    1515
    16 #include <time.h>                // tzset
    17 #include <locale.h>        // setlocale
     16#include <time.h>                                                                               // tzset
     17#include <locale.h>                                                                             // setlocale
     18#include <stdlib.h>                                                                             // getenv
    1819#include "startup.hfa"
    1920
     
    2223    void __cfaabi_appready_startup( void ) {
    2324                tzset();                                                                                // initialize time global variables
    24                 setlocale(LC_NUMERIC, "");
     25                setlocale( LC_NUMERIC, getenv("LANG") );
    2526                #ifdef __CFA_DEBUG__
    2627                extern void heapAppStart();
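Seeding LC_NUMERIC from LANG rather than "" changes behaviour in one corner case worth noting: getenv returns NULL when LANG is unset, and setlocale with a NULL locale argument only queries the current setting, whereas "" installs the locale implied by the environment. A small probe of the difference:

	#include <locale.h>
	#include <stdio.h>
	#include <stdlib.h>

	int main( void ) {
		const char * lang = getenv( "LANG" );        // may be NULL
		// NULL: query only; "": derive the locale from LANG/LC_* itself.
		const char * cur = setlocale( LC_NUMERIC, lang );
		printf( "LC_NUMERIC is now: %s\n", cur ? cur : "(query failed)" );
	}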
  • src/AST/Decl.cpp

    r4468a70 r342af53  
    1010// Created On       : Thu May 9 10:00:00 2019
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Fri Dec 13 16:23:15 2019
    13 // Update Count     : 20
     12// Last Modified On : Tue Jan 12 16:54:55 2021
     13// Update Count     : 23
    1414//
    1515
     
    7878
    7979const char * TypeDecl::typeString() const {
    80         static const char * kindNames[] = { "sized data type", "sized object type", "sized function type", "sized tuple type" };
    81         static_assert( sizeof(kindNames)/sizeof(kindNames[0]) == TypeDecl::NUMBER_OF_KINDS, "typeString: kindNames is out of sync." );
     80        static const char * kindNames[] = { "sized data type", "sized data type", "sized object type", "sized function type", "sized tuple type", "sized array length type" };
     81        static_assert( sizeof(kindNames) / sizeof(kindNames[0]) == TypeDecl::NUMBER_OF_KINDS, "typeString: kindNames is out of sync." );
    8282        assertf( kind < TypeDecl::NUMBER_OF_KINDS, "TypeDecl kind is out of bounds." );
    8383        return sized ? kindNames[ kind ] : &kindNames[ kind ][ sizeof("sized") ]; // sizeof includes '\0'
     
    8585
    8686const char * TypeDecl::genTypeString() const {
    87         static const char * kindNames[] = { "dtype", "otype", "ftype", "ttype" };
    88         static_assert( sizeof(kindNames)/sizeof(kindNames[0]) == TypeDecl::NUMBER_OF_KINDS, "genTypeString: kindNames is out of sync." );
     87        static const char * kindNames[] = { "T &", "T *", "T", "(*)", "T ...", "[T]" };
     88        static_assert( sizeof(kindNames) / sizeof(kindNames[0]) == TypeDecl::NUMBER_OF_KINDS, "genTypeString: kindNames is out of sync." );
    8989        assertf( kind < TypeDecl::NUMBER_OF_KINDS, "TypeDecl kind is out of bounds." );
    9090        return kindNames[ kind ];
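Both name tables grow in lock-step with the enum (see Decl.hpp below) and keep the guard that catches exactly this kind of drift: a static_assert comparing the table length against NUMBER_OF_KINDS, which sits last in the enum and therefore tracks its size automatically. The pattern in miniature, with illustrative names:

	#include <assert.h>

	enum Kind { Dtype, DStype, Otype, Ftype, Ttype, ALtype, NUMBER_OF_KINDS };

	// Parallel table of printable names: adding a Kind without a name (or a
	// name without a Kind) now fails at compile time instead of at run time.
	static const char * kindNames[] = {
		"dtype", "sized dtype", "otype", "ftype", "ttype", "array length type",
	};
	static_assert( sizeof(kindNames) / sizeof(kindNames[0]) == NUMBER_OF_KINDS,
	               "kindNames is out of sync" );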
  • src/AST/Decl.hpp

    r4468a70 r342af53  
    1010// Created On       : Thu May 9 10:00:00 2019
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Fri Dec 13 17:38:33 2019
    13 // Update Count     : 29
     12// Last Modified On : Mon Jan 11 20:48:38 2021
     13// Update Count     : 30
    1414//
    1515
     
    175175class TypeDecl final : public NamedTypeDecl {
    176176  public:
    177         enum Kind { Dtype, Otype, Ftype, Ttype, NUMBER_OF_KINDS };
     177        enum Kind { Dtype, DStype, Otype, Ftype, Ttype, ALtype, NUMBER_OF_KINDS };
    178178
    179179        Kind kind;
  • src/Parser/DeclarationNode.cc

    r4468a70 r342af53  
    1010// Created On       : Sat May 16 12:34:05 2015
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Thu Oct  8 08:03:38 2020
    13 // Update Count     : 1135
     12// Last Modified On : Mon Jan 11 20:58:07 2021
     13// Update Count     : 1137
    1414//
    1515
     
    10751075        if ( variable.tyClass != TypeDecl::NUMBER_OF_KINDS ) {
    10761076                // otype is internally converted to dtype + otype parameters
    1077                 static const TypeDecl::Kind kindMap[] = { TypeDecl::Dtype, TypeDecl::Dtype, TypeDecl::Ftype, TypeDecl::Ttype };
    1078                 static_assert( sizeof(kindMap)/sizeof(kindMap[0]) == TypeDecl::NUMBER_OF_KINDS, "DeclarationNode::build: kindMap is out of sync." );
     1077                static const TypeDecl::Kind kindMap[] = { TypeDecl::Dtype, TypeDecl::DStype, TypeDecl::Dtype, TypeDecl::Ftype, TypeDecl::Ttype, TypeDecl::ALtype };
     1078                static_assert( sizeof(kindMap) / sizeof(kindMap[0]) == TypeDecl::NUMBER_OF_KINDS, "DeclarationNode::build: kindMap is out of sync." );
    10791079                assertf( variable.tyClass < sizeof(kindMap)/sizeof(kindMap[0]), "Variable's tyClass is out of bounds." );
    10801080                TypeDecl * ret = new TypeDecl( *name, Type::StorageClasses(), nullptr, kindMap[ variable.tyClass ], variable.tyClass == TypeDecl::Otype, variable.initializer ? variable.initializer->buildType() : nullptr );
  • src/Parser/ParseNode.h

    r4468a70 r342af53  
    1010// Created On       : Sat May 16 13:28:16 2015
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Sat Oct 24 03:53:54 2020
    13 // Update Count     : 895
     12// Last Modified On : Sun Jan  3 18:23:01 2021
     13// Update Count     : 896
    1414//
    1515
     
    3939struct DeclarationNode;
    4040class DeclarationWithType;
     41class Initializer;
    4142class ExpressionNode;
    42 class Initializer;
    4343struct StatementNode;
    4444
  • src/Parser/parser.yy

    r4468a70 r342af53  
    1010// Created On       : Sat Sep  1 20:22:55 2001
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Sat Oct 24 08:21:14 2020
    13 // Update Count     : 4624
     12// Last Modified On : Mon Jan 11 21:32:10 2021
     13// Update Count     : 4633
    1414//
    1515
     
    329329%type<en> conditional_expression                constant_expression                     assignment_expression           assignment_expression_opt
    330330%type<en> comma_expression                              comma_expression_opt
    331 %type<en> argument_expression_list_opt  argument_expression                     default_initialize_opt
     331%type<en> argument_expression_list_opt  argument_expression                     default_initializer_opt
    332332%type<ifctl> if_control_expression
    333333%type<fctl> for_control_expression              for_control_expression_list
     
    424424%type<decl> sue_declaration_specifier sue_declaration_specifier_nobody sue_type_specifier sue_type_specifier_nobody
    425425
    426 %type<tclass> type_class
     426%type<tclass> type_class new_type_class
    427427%type<decl> type_declarator type_declarator_name type_declaring_list
    428428
     
    15451545        | cfa_function_declaration
    15461546        | type_declaring_list
     1547                { SemanticError( yylloc, "otype declaration is currently unimplemented." ); $$ = nullptr; }
    15471548        | trait_specifier
    15481549        ;
     
    22232224        ;
    22242225
    2225 cfa_parameter_ellipsis_list_opt:                                                        // CFA, abstract + real
     2226cfa_parameter_ellipsis_list_opt:                                                // CFA, abstract + real
    22262227        // empty
    22272228                { $$ = DeclarationNode::newBasicType( DeclarationNode::Void ); }
     
    22802281cfa_parameter_declaration:                                                              // CFA, new & old style parameter declaration
    22812282        parameter_declaration
    2282         | cfa_identifier_parameter_declarator_no_tuple identifier_or_type_name default_initialize_opt
     2283        | cfa_identifier_parameter_declarator_no_tuple identifier_or_type_name default_initializer_opt
    22832284                { $$ = $1->addName( $2 ); }
    2284         | cfa_abstract_tuple identifier_or_type_name default_initialize_opt
     2285        | cfa_abstract_tuple identifier_or_type_name default_initializer_opt
    22852286                // To obtain LR(1), these rules must be duplicated here (see cfa_abstract_declarator).
    22862287                { $$ = $1->addName( $2 ); }
    2287         | type_qualifier_list cfa_abstract_tuple identifier_or_type_name default_initialize_opt
     2288        | type_qualifier_list cfa_abstract_tuple identifier_or_type_name default_initializer_opt
    22882289                { $$ = $2->addName( $3 )->addQualifiers( $1 ); }
    22892290        | cfa_function_specifier
     
    23022303parameter_declaration:
    23032304                // No SUE declaration in parameter list.
    2304         declaration_specifier_nobody identifier_parameter_declarator default_initialize_opt
     2305        declaration_specifier_nobody identifier_parameter_declarator default_initializer_opt
    23052306                { $$ = $2->addType( $1 )->addInitializer( $3 ? new InitializerNode( $3 ) : nullptr ); }
    2306         | declaration_specifier_nobody type_parameter_redeclarator default_initialize_opt
     2307        | declaration_specifier_nobody type_parameter_redeclarator default_initializer_opt
    23072308                { $$ = $2->addType( $1 )->addInitializer( $3 ? new InitializerNode( $3 ) : nullptr ); }
    23082309        ;
    23092310
    23102311abstract_parameter_declaration:
    2311         declaration_specifier_nobody default_initialize_opt
     2312        declaration_specifier_nobody default_initializer_opt
    23122313                { $$ = $1->addInitializer( $2 ? new InitializerNode( $2 ) : nullptr ); }
    2313         | declaration_specifier_nobody abstract_parameter_declarator default_initialize_opt
     2314        | declaration_specifier_nobody abstract_parameter_declarator default_initializer_opt
    23142315                { $$ = $2->addType( $1 )->addInitializer( $3 ? new InitializerNode( $3 ) : nullptr ); }
    23152316        ;
     
    24412442        type_class identifier_or_type_name
    24422443                { typedefTable.addToScope( *$2, TYPEDEFname, "9" ); }
    2443         type_initializer_opt assertion_list_opt
     2444          type_initializer_opt assertion_list_opt
    24442445                { $$ = DeclarationNode::newTypeParam( $1, $2 )->addTypeInitializer( $4 )->addAssertions( $5 ); }
    2445         | type_specifier identifier_parameter_declarator
     2446        | identifier_or_type_name new_type_class
     2447                { typedefTable.addToScope( *$1, TYPEDEFname, "9" ); }
     2448          type_initializer_opt assertion_list_opt
     2449                { $$ = DeclarationNode::newTypeParam( $2, $1 )->addTypeInitializer( $4 )->addAssertions( $5 ); }
     2450        | '[' identifier_or_type_name ']'
     2451                {
     2452                        typedefTable.addToScope( *$2, TYPEDEFname, "9" );
     2453                        $$ = DeclarationNode::newTypeParam( TypeDecl::ALtype, $2 );
     2454                }
     2455        // | type_specifier identifier_parameter_declarator
    24462456        | assertion_list
    24472457                { $$ = DeclarationNode::newTypeParam( TypeDecl::Dtype, new string( DeclarationNode::anonymous.newName() ) )->addAssertions( $1 ); }
     2458        ;
     2459
     2460new_type_class:                                                                                 // CFA
     2461        // empty
     2462                { $$ = TypeDecl::Otype; }
     2463        | '&'
     2464                { $$ = TypeDecl::Dtype; }
     2465        | '*'
     2466                { $$ = TypeDecl::DStype; }                                              // dtype + sized
     2467        | ELLIPSIS
     2468                { $$ = TypeDecl::Ttype; }
    24482469        ;
    24492470
     
    34763497        ;
    34773498
    3478 default_initialize_opt:
     3499default_initializer_opt:
    34793500        // empty
    34803501                { $$ = nullptr; }
  • src/ResolvExpr/ResolveAssertions.cc

    r4468a70 r342af53  
    397397
    398398        /// Limit to depth of recursion of assertion satisfaction
    399         static const int recursionLimit = 4;
     399        static const int recursionLimit = 7;
    400400        /// Maximum number of simultaneously-deferred assertions to attempt concurrent satisfaction of
    401401        static const int deferLimit = 10;
  • src/ResolvExpr/SatisfyAssertions.cpp

    r4468a70 r342af53  
    5757                ast::UniqueId resnSlot;          ///< Slot for any recursive assertion IDs
    5858
    59                 AssnCandidate( 
    60                         const ast::SymbolTable::IdData c, const ast::Type * at, ast::TypeEnvironment && e, 
     59                AssnCandidate(
     60                        const ast::SymbolTable::IdData c, const ast::Type * at, ast::TypeEnvironment && e,
    6161                        ast::AssertionSet && h, ast::AssertionSet && n, ast::OpenVarSet && o, ast::UniqueId rs )
    62                 : cdata( c ), adjType( at ), env( std::move( e ) ), have( std::move( h ) ), 
     62                : cdata( c ), adjType( at ), env( std::move( e ) ), have( std::move( h ) ),
    6363                  need( std::move( n ) ), open( std::move( o ) ), resnSlot( rs ) {}
    6464        };
     
    7373                const AssnCandidate & match;
    7474        };
    75        
    76         /// Wrapper for the deferred items from a single assertion satisfaction. 
     75
     76        /// Wrapper for the deferred items from a single assertion satisfaction.
    7777        /// Acts like an indexed list of DeferRef
    7878        struct DeferItem {
     
    8181                AssnCandidateList matches;
    8282
    83                 DeferItem( 
     83                DeferItem(
    8484                        const ast::VariableExpr * d, const ast::AssertionSetValue & i, AssnCandidateList && ms )
    8585                : expr( d ), info( i ), matches( std::move( ms ) ) {}
     
    117117                /// Initial satisfaction state for a candidate
    118118                SatState( CandidateRef & c, const ast::SymbolTable & syms )
    119                 : cand( c ), need(), newNeed(), deferred(), inferred(), costs{ Cost::zero }, 
     119                : cand( c ), need(), newNeed(), deferred(), inferred(), costs{ Cost::zero },
    120120                  symtab( syms ) { need.swap( c->need ); }
    121                
     121
    122122                /// Update satisfaction state for next step after previous state
    123123                SatState( SatState && o, IterateFlag )
    124                 : cand( std::move( o.cand ) ), need( o.newNeed.begin(), o.newNeed.end() ), newNeed(), 
    125                   deferred(), inferred( std::move( o.inferred ) ), costs( std::move( o.costs ) ), 
     124                : cand( std::move( o.cand ) ), need( o.newNeed.begin(), o.newNeed.end() ), newNeed(),
     125                  deferred(), inferred( std::move( o.inferred ) ), costs( std::move( o.costs ) ),
    126126                  symtab( o.symtab ) { costs.emplace_back( Cost::zero ); }
    127                
     127
    128128                /// Field-wise next step constructor
    129129                SatState(
    130                         CandidateRef && c, ast::AssertionSet && nn, InferCache && i, CostVec && cs, 
     130                        CandidateRef && c, ast::AssertionSet && nn, InferCache && i, CostVec && cs,
    131131                        ast::SymbolTable && syms )
    132                 : cand( std::move( c ) ), need( nn.begin(), nn.end() ), newNeed(), deferred(), 
     132                : cand( std::move( c ) ), need( nn.begin(), nn.end() ), newNeed(), deferred(),
    133133                  inferred( std::move( i ) ), costs( std::move( cs ) ), symtab( std::move( syms ) )
    134134                  { costs.emplace_back( Cost::zero ); }
     
    143143
    144144        /// Binds a single assertion, updating satisfaction state
    145         void bindAssertion( 
    146                 const ast::VariableExpr * expr, const ast::AssertionSetValue & info, CandidateRef & cand, 
     145        void bindAssertion(
     146                const ast::VariableExpr * expr, const ast::AssertionSetValue & info, CandidateRef & cand,
    147147                AssnCandidate & match, InferCache & inferred
    148148        ) {
    149149                const ast::DeclWithType * candidate = match.cdata.id;
    150                 assertf( candidate->uniqueId, 
     150                assertf( candidate->uniqueId,
    151151                        "Assertion candidate does not have a unique ID: %s", toString( candidate ).c_str() );
    152                
     152
    153153                ast::Expr * varExpr = match.cdata.combine( cand->expr->location, cand->cvtCost );
    154154                varExpr->result = match.adjType;
     
    201201                        ast::OpenVarSet newOpen{ sat.cand->open };
    202202                        ast::ptr< ast::Type > toType = assn.first->result;
    203                         ast::ptr< ast::Type > adjType = 
     203                        ast::ptr< ast::Type > adjType =
    204204                                renameTyVars( adjustExprType( candidate->get_type(), newEnv, sat.symtab ), GEN_USAGE, false );
    205205
     
    213213                                }
    214214
    215                                 matches.emplace_back( 
     215                                matches.emplace_back(
    216216                                        cdata, adjType, std::move( newEnv ), std::move( have ), std::move( newNeed ),
    217217                                        std::move( newOpen ), crntResnSlot );
     
    287287        };
    288288
    289         /// Replace ResnSlots with InferParams and add alternative to output list, if it meets pruning 
     289        /// Replace ResnSlots with InferParams and add alternative to output list, if it meets pruning
    290290        /// threshold.
    291         void finalizeAssertions( 
    292                 CandidateRef & cand, InferCache & inferred, PruneMap & thresholds, CostVec && costs, 
    293                 CandidateList & out 
     291        void finalizeAssertions(
     292                CandidateRef & cand, InferCache & inferred, PruneMap & thresholds, CostVec && costs,
     293                CandidateList & out
    294294        ) {
    295295                // prune if cheaper alternative for same key has already been generated
     
    308308        }
    309309
    310         /// Combo iterator that combines candidates into an output list, merging their environments. 
    311         /// Rejects an appended candidate if environments cannot be merged. See `Common/FilterCombos.h` 
     310        /// Combo iterator that combines candidates into an output list, merging their environments.
     311        /// Rejects an appended candidate if environments cannot be merged. See `Common/FilterCombos.h`
    312312        /// for description of "combo iterator".
    313313        class CandidateEnvMerger {
     
    390390} // anonymous namespace
    391391
    392 void satisfyAssertions( 
    393         CandidateRef & cand, const ast::SymbolTable & symtab, CandidateList & out, 
     392void satisfyAssertions(
     393        CandidateRef & cand, const ast::SymbolTable & symtab, CandidateList & out,
    394394        std::vector<std::string> & errors
    395395) {
     
    418418
    419419                        // should a limit be imposed? worst case here is O(n^2) but very unlikely to happen.
    420                         for (unsigned resetCount = 0; ; ++resetCount) { 
     420                        for (unsigned resetCount = 0; ; ++resetCount) {
    421421                                ast::AssertionList next;
    422422                                resetTyVarRenaming();
     
    449449                                // either add successful match or push back next state
    450450                                if ( sat.newNeed.empty() ) {
    451                                         finalizeAssertions( 
     451                                        finalizeAssertions(
    452452                                                sat.cand, sat.inferred, thresholds, std::move( sat.costs ), out );
    453453                                } else {
     
    471471                                std::vector< CandidateEnvMerger::OutType > compatible = filterCombos(
    472472                                        sat.deferred, CandidateEnvMerger{ sat.cand->env, sat.cand->open, sat.symtab } );
    473                                
     473
    474474                                // fail early if no mutually-compatible assertion satisfaction
    475475                                if ( compatible.empty() ) {
     
    494494                                        // set up next satisfaction state
    495495                                        CandidateRef nextCand = std::make_shared<Candidate>(
    496                                                 sat.cand->expr, std::move( compat.env ), std::move( compat.open ), 
     496                                                sat.cand->expr, std::move( compat.env ), std::move( compat.open ),
    497497                                                ast::AssertionSet{} /* need moved into satisfaction state */,
    498498                                                sat.cand->cost, sat.cand->cvtCost );
     
    500500                                        ast::AssertionSet nextNewNeed{ sat.newNeed };
    501501                                        InferCache nextInferred{ sat.inferred };
    502                                        
     502
    503503                                        CostVec nextCosts{ sat.costs };
    504504                                        nextCosts.back() += compat.cost;
    505                                                                
     505
    506506                                        ast::SymbolTable nextSymtab{ sat.symtab };
    507507
     
    517517                                        // either add successful match or push back next state
    518518                                        if ( nextNewNeed.empty() ) {
    519                                                 finalizeAssertions( 
     519                                                finalizeAssertions(
    520520                                                        nextCand, nextInferred, thresholds, std::move( nextCosts ), out );
    521521                                        } else {
    522                                                 nextSats.emplace_back( 
    523                                                         std::move( nextCand ), std::move( nextNewNeed ), 
    524                                                         std::move( nextInferred ), std::move( nextCosts ), 
     522                                                nextSats.emplace_back(
     523                                                        std::move( nextCand ), std::move( nextNewNeed ),
     524                                                        std::move( nextInferred ), std::move( nextCosts ),
    525525                                                        std::move( nextSymtab ) );
    526526                                        }
     
    534534                nextSats.clear();
    535535        }
    536        
     536
    537537        // exceeded recursion limit if reaches here
    538538        if ( out.empty() ) {
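
    The hunks above touch only whitespace, but the surrounding comments describe how deferred assertion matches are resolved: filterCombos feeds the deferred matches through a "combo iterator" (CandidateEnvMerger), which rejects a combination as soon as the candidates' type environments fail to merge. A minimal sketch of that idea, with hypothetical Env/Match types standing in for the real environment and deferred-match lists; this is not the actual Common/FilterCombos.h API:

        #include <map>
        #include <optional>
        #include <string>
        #include <vector>

        // Hypothetical stand-ins for the resolver's environment and match types.
        struct Env { std::map<std::string, std::string> bind; };   // type var -> type
        struct Match { Env env; };

        // Merge two environments; fail if they bind the same variable differently
        // (assumed semantics of the real environment merge).
        std::optional<Env> merge( const Env & a, const Env & b ) {
                Env r = a;
                for ( const auto & [var, ty] : b.bind ) {
                        auto it = r.bind.find( var );
                        if ( it != r.bind.end() && it->second != ty ) return std::nullopt;
                        r.bind[var] = ty;
                }
                return r;
        }

        // Pick one match per assertion, pruning any partial combination whose
        // environments cannot be merged -- the essence of the combo iterator.
        void combos( const std::vector<std::vector<Match>> & groups, size_t i,
                        const Env & acc, std::vector<Env> & out ) {
                if ( i == groups.size() ) { out.push_back( acc ); return; }
                for ( const Match & m : groups[i] ) {
                        if ( auto next = merge( acc, m.env ) ) combos( groups, i + 1, *next, out );
                }
        }

    Rejecting at append time, rather than filtering complete tuples afterwards, cuts conflicting branches early; presumably that is why the pass fails early when compatible is empty.
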
  • src/SymTab/Demangle.cc

    r4468a70 r342af53  
    1010// Created On       : Thu Jul 19 12:52:41 2018
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Tue Feb 11 15:09:18 2020
    13 // Update Count     : 10
     12// Last Modified On : Mon Jan 11 21:28:27 2021
     13// Update Count     : 11
    1414//
    1515
     
    367367                                // type variable types
    368368                                for (size_t k = 0; k < TypeDecl::NUMBER_OF_KINDS; ++k) {
    369                                         static const std::string typeVariableNames[] = { "DT", "OT", "FT", "TT", };
     369                                        static const std::string typeVariableNames[] = { "DT", "DST", "OT", "FT", "TT", "ALT", };
    370370                                        static_assert(
    371371                                                sizeof(typeVariableNames)/sizeof(typeVariableNames[0]) == TypeDecl::NUMBER_OF_KINDS,
  • src/SymTab/Mangler.cc

    r4468a70 r342af53  
    1010// Created On       : Sun May 17 21:40:29 2015
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Wed Nov 18 12:01:38 2020
    13 // Update Count     : 64
     12// Last Modified On : Mon Jan 11 21:56:06 2021
     13// Update Count     : 74
    1414//
    1515#include "Mangler.h"
     
    313313                                // and the case has not yet come up in practice. Alternatively, if not then this code can be removed
    314314                                // aside from the assert false.
    315                                 assertf(false, "Mangler_old should not visit typedecl: %s", toCString(decl));
     315                                assertf( false, "Mangler_old should not visit typedecl: %s", toCString(decl));
    316316                                assertf( decl->kind < TypeDecl::NUMBER_OF_KINDS, "Unhandled type variable kind: %d", decl->kind );
    317317                                mangleName += Encoding::typeVariables[ decl->kind ] + std::to_string( decl->name.length() ) + decl->name;
     
    343343                                                        break;
    344344                                                  default:
    345                                                         assert( false );
     345                                                        assertf( false, "unimplemented kind for type variable %s", SymTab::Mangler::Encoding::typeVariables[i->kind].c_str() );
    346346                                                } // switch
    347347                                                varNums[ i->name ] = std::make_pair( nextVarNum, (int)i->kind );
     
    673673                                        for ( auto & decl : ptype->forall ) {
    674674                                                switch ( decl->kind ) {
    675                                                 case ast::TypeDecl::Kind::Dtype:
     675                                                  case ast::TypeDecl::Kind::Dtype:
    676676                                                        dcount++;
    677677                                                        break;
    678                                                 case ast::TypeDecl::Kind::Ftype:
     678                                                  case ast::TypeDecl::Kind::Ftype:
    679679                                                        fcount++;
    680680                                                        break;
    681                                                 case ast::TypeDecl::Kind::Ttype:
     681                                                  case ast::TypeDecl::Kind::Ttype:
    682682                                                        vcount++;
    683683                                                        break;
    684                                                 default:
    685                                                         assert( false );
     684                                                  default:
     685                                                        assertf( false, "unimplemented kind for type variable %s", SymTab::Mangler::Encoding::typeVariables[decl->kind].c_str() );
    686686                                                } // switch
    687687                                                varNums[ decl->name ] = std::make_pair( nextVarNum, (int)decl->kind );
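
    Both mangler implementations encode a polymorphic type variable as a kind-specific prefix, then the decimal length of the variable's name, then the name itself (see the mangleName += Encoding::typeVariables[...] line above). Under that reading, and using the prefix table added in ManglerCommon.cc below, a small illustration; mangleTypeVar is a name invented here:

        #include <string>

        // Prefix table as added in ManglerCommon.cc below: dtype, sized dtype,
        // otype, ftype, ttype, array-length type.
        static const std::string typeVariables[] = { "BD", "BDS", "BO", "BF", "BT", "BAL" };

        // Mangle one type variable: prefix + name length + name, per the hunk above.
        std::string mangleTypeVar( int kind, const std::string & name ) {
                return typeVariables[kind] + std::to_string( name.length() ) + name;
        }

    So a dtype variable T would mangle to BD1T, and a sized dtype Elem to BDS4Elem; the length digits let a demangler recover the name boundary.
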
  • src/SymTab/ManglerCommon.cc

    r4468a70 r342af53  
    1010// Created On       : Sun May 17 21:44:03 2015
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Fri Dec 13 14:54:38 2019
    13 // Update Count     : 28
     12// Last Modified On : Mon Jan 11 21:23:10 2021
     13// Update Count     : 29
    1414//
    1515
     
    104104                        const std::string typeVariables[] = {
    105105                                "BD", // dtype
     106                                "BDS", // dtype + sized
    106107                                "BO", // otype
    107108                                "BF", // ftype
    108109                                "BT", // ttype
     110                                "BAL", // array length type
    109111                        };
    110112                        static_assert(
    111                                 sizeof(typeVariables)/sizeof(typeVariables[0]) == TypeDecl::NUMBER_OF_KINDS,
     113                                sizeof(typeVariables) / sizeof(typeVariables[0]) == TypeDecl::NUMBER_OF_KINDS,
    112114                                "Each type variable kind should have a corresponding mangler prefix"
    113115                        );
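
    This table, the demangler names in Demangle.cc above, and the typeString tables in TypeDecl.cc below all index by TypeDecl::Kind, so each of them grows in this changeset, and each carries a static_assert against NUMBER_OF_KINDS. A minimal reduction of the pattern, with standalone names:

        // Hypothetical reduction of the enum/table sync pattern used here.
        enum Kind { Dtype, DStype, Otype, Ftype, Ttype, ALtype, NUMBER_OF_KINDS };

        static const char * prefixes[] = { "BD", "BDS", "BO", "BF", "BT", "BAL" };

        // Adding a Kind without extending the table now breaks the build here,
        // instead of indexing past the end of the array at run time.
        static_assert( sizeof(prefixes) / sizeof(prefixes[0]) == NUMBER_OF_KINDS,
                "Each type variable kind should have a corresponding mangler prefix" );
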
  • src/SynTree/Declaration.h

    r4468a70 r342af53  
    1010// Created On       : Mon May 18 07:44:20 2015
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Fri Dec 13 23:11:22 2019
    13 // Update Count     : 157
     12// Last Modified On : Mon Jan 11 20:48:39 2021
     13// Update Count     : 158
    1414//
    1515
     
    201201        typedef NamedTypeDecl Parent;
    202202  public:
    203         enum Kind { Dtype, Otype, Ftype, Ttype, NUMBER_OF_KINDS };
     203        enum Kind { Dtype, DStype, Otype, Ftype, Ttype, ALtype, NUMBER_OF_KINDS };
    204204
    205205        Kind kind;
  • src/SynTree/TypeDecl.cc

    r4468a70 r342af53  
    1010// Created On       : Mon May 18 07:44:20 2015
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Thu Oct  8 18:18:55 2020
    13 // Update Count     : 22
     12// Last Modified On : Tue Jan 12 16:07:33 2021
     13// Update Count     : 26
    1414//
    1515
     
    3333
    3434const char * TypeDecl::typeString() const {
    35         static const char * kindNames[] = { "sized data type", "sized object type", "sized function type", "sized tuple type" };
    36         static_assert( sizeof(kindNames)/sizeof(kindNames[0]) == TypeDecl::NUMBER_OF_KINDS, "typeString: kindNames is out of sync." );
     35        static const char * kindNames[] = { "sized data type", "sized data type", "sized object type", "sized function type", "sized tuple type", "sized array length type" };
     36        static_assert( sizeof(kindNames) / sizeof(kindNames[0]) == TypeDecl::NUMBER_OF_KINDS, "typeString: kindNames is out of sync." );
    3737        assertf( kind < TypeDecl::NUMBER_OF_KINDS, "TypeDecl kind is out of bounds." );
    3838        return isComplete() ? kindNames[ kind ] : &kindNames[ kind ][ sizeof("sized") ]; // sizeof includes '\0'
     
    4040
    4141const char * TypeDecl::genTypeString() const {
    42         static const char * kindNames[] = { "dtype", "otype", "ftype", "ttype" };
    43         static_assert( sizeof(kindNames)/sizeof(kindNames[0]) == TypeDecl::NUMBER_OF_KINDS, "genTypeString: kindNames is out of sync." );
     42        static const char * kindNames[] = { "T &", "T *", "T", "(*)", "T ...", "[T]" };
     43        static_assert( sizeof(kindNames) / sizeof(kindNames[0]) == TypeDecl::NUMBER_OF_KINDS, "genTypeString: kindNames is out of sync." );
    4444        assertf( kind < TypeDecl::NUMBER_OF_KINDS, "TypeDecl kind is out of bounds." );
    4545        return kindNames[ kind ];
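
    typeString() stores every kind name with a "sized " prefix and, for incomplete types, strips it by indexing into the string literal: as the source comment notes, sizeof("sized") counts the terminating '\0', so the resulting offset of 6 also skips the following space. A standalone demonstration of the trick:

        #include <cstdio>

        int main() {
                static const char * kindNames[] = { "sized data type", "sized object type" };
                // sizeof("sized") == 6: five letters plus '\0', so indexing by it
                // also skips the space and lands on the unsized name.
                printf( "%s\n", kindNames[0] );                    // sized data type
                printf( "%s\n", &kindNames[0][sizeof("sized")] );  // data type
        }
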
  • tests/Makefile.am

    r4468a70 r342af53  
    103103
    104104mostlyclean-local :
     105        find ${builddir} -not -path './__pycache__/*' -path '*.o' -delete
     106        find ${builddir} -not -path './__pycache__/*' -path '*/.err/*.log' -delete
     107        find ${builddir} -not -path './__pycache__/*' -path '*/.out/*.log' -delete
    105108        rm -f ${EXTRA_PROGRAMS}
    106109        rm -rf __pycache__
    107         find ${builddir} -path '*.o' -delete
    108         find ${builddir} -path '*/.err/*.log' -delete
    109         find ${builddir} -path '*/.out/*.log' -delete
    110110
    111111distclean-local :
  • tests/concurrent/futures/.expect/basic.txt

    r4468a70 r342af53  
     1start
    12done
  • tests/concurrent/futures/basic.cfa

    r4468a70 r342af53  
    11#include <thread.hfa>
     2
    23enum {NFUTURES = 10};
    34
     
    8384
    8485int main() {
     86        printf( "start\n" );                            // non-empty .expect file
    8587        processor procs[2];
    8688        {