Changes in / [342af53:4468a70]


Ignore:
Files:
2 added
39 deleted
45 edited

Legend:

Unmodified
Added
Removed
  • Jenkins/FullBuild

    r342af53 r4468a70  
    108108                        string(name: 'GitRef', value: commitId),        \
    109109                        string(name: 'Build' , value: buildNum) \
    110                 ],                                                              \
    111                 propagate: false
     110                ]
    112111
    113112        echo(result.result)
  • benchmark/creation/node_cor.js

    r342af53 r4468a70  
    66function * coroutine() { yield }
    77
    8 for ( var i = 0; i < times; i += 1 ) { // warm JIT
     8for ( var i = 0; i < times; i += 1 ) { // warm jit
    99        cor = coroutine()
    1010}
  • benchmark/ctxswitch/node_cor.js

    r342af53 r4468a70  
    1111cor = coroutine()
    1212
    13 for ( var i = 0; i < times; i += 1 ) { // warm JIT
     13for ( var i = 0; i < times; i += 1 ) { // warm git
    1414        cor.next();
    1515}
  • benchmark/io/http/Makefile.am

    r342af53 r4468a70  
    2929EXTRA_PROGRAMS = httpforall .dummy_hack
    3030
    31 CLEANFILES = httpforall
    32 
    3331nodist_httpforall_SOURCES = \
    3432        filecache.cfa \
  • benchmark/io/http/main.cfa

    r342af53 r4468a70  
    4646}
    4747
    48 extern void init_protocol(void);
    49 extern void deinit_protocol(void);
    50 
    5148//=============================================================================================
    5249// Main
     
    6461        //===================
    6562        // Open Socket
    66         printf("%ld : Listening on port %d\n", getpid(), options.socket.port);
     63        printf("Listening on port %d\n", options.socket.port);
    6764        int server_fd = socket(AF_INET, SOCK_STREAM, 0);
    6865        if(server_fd < 0) {
     
    8279                ret = bind( server_fd, (struct sockaddr *)&address, sizeof(address) );
    8380                if(ret < 0) {
    84                         if(errno == EADDRINUSE) {
     81                        if(errno == 98) {
    8582                                if(waited == 0) {
    8683                                        printf("Waiting for port\n");
     
    112109                options.clopts.instance = &cl;
    113110
    114 
    115111                int pipe_cnt = options.clopts.nworkers * 2;
    116112                int pipe_off;
     
    128124                {
    129125                        ServerProc procs[options.clopts.nprocs];
    130 
    131                         init_protocol();
    132126                        {
    133127                                Worker workers[options.clopts.nworkers];
     
    157151                                        printf("Shutting Down\n");
    158152                                }
    159 
    160                                 for(i; options.clopts.nworkers) {
    161                                         printf("Cancelling %p\n", (void*)workers[i].cancel.target);
    162                                         workers[i].done = true;
    163                                         cancel(workers[i].cancel);
    164                                 }
    165 
    166                                 printf("Shutting down socket\n");
    167                                 int ret = shutdown( server_fd, SHUT_RD );
    168                                 if( ret < 0 ) { abort( "shutdown error: (%d) %s\n", (int)errno, strerror(errno) ); }
    169 
    170                                 //===================
    171                                 // Close Socket
    172                                 printf("Closing Socket\n");
    173                                 ret = close( server_fd );
    174                                 if(ret < 0) {
    175                                         abort( "close socket error: (%d) %s\n", (int)errno, strerror(errno) );
    176                                 }
    177153                        }
    178154                        printf("Workers Closed\n");
    179 
    180                         deinit_protocol();
    181155                }
    182156
     
    188162                }
    189163                free(fds);
     164        }
    190165
     166        //===================
     167        // Close Socket
     168        printf("Closing Socket\n");
     169        ret = close( server_fd );
     170        if(ret < 0) {
     171                abort( "close socket error: (%d) %s\n", (int)errno, strerror(errno) );
    191172        }
    192173
  • benchmark/io/http/options.cfa

    r342af53 r4468a70  
    1212#include <parseargs.hfa>
    1313
    14 #include <string.h>
    15 
    1614Options options @= {
    17         false, // log
    18 
    1915        { // file_cache
    2016                0,     // open_flags;
     
    5248                {'p', "port",           "Port the server will listen on", options.socket.port},
    5349                {'c', "cpus",           "Number of processors to use", options.clopts.nprocs},
    54                 {'L', "log",            "Enable logs", options.log, parse_settrue},
    5550                {'t', "threads",        "Number of worker threads to use", options.clopts.nworkers},
    5651                {'b', "accept-backlog", "Maximum number of pending accepts", options.socket.backlog},
  • benchmark/io/http/options.hfa

    r342af53 r4468a70  
    88
    99struct Options {
    10         bool log;
    11 
    1210        struct {
    1311                int open_flags;
  • benchmark/io/http/protocol.cfa

    r342af53 r4468a70  
    1818#include "options.hfa"
    1919
    20 const char * volatile date = 0p;
    21 
    2220const char * http_msgs[] = {
    23         "HTTP/1.1 200 OK\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: %zu \n\n",
    24         "HTTP/1.1 400 Bad Request\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: 0 \n\n",
    25         "HTTP/1.1 404 Not Found\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: 0 \n\n",
    26         "HTTP/1.1 413 Payload Too Large\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: 0 \n\n",
    27         "HTTP/1.1 414 URI Too Long\nServer: HttoForall\nDate: %s \nContent-Type: text/plain\nContent-Length: 0 \n\n",
     21        "HTTP/1.1 200 OK\nContent-Type: text/plain\nContent-Length: %zu\n\n",
     22        "HTTP/1.1 400 Bad Request\nContent-Type: text/plain\nContent-Length: 0\n\n",
     23        "HTTP/1.1 404 Not Found\nContent-Type: text/plain\nContent-Length: 0\n\n",
     24        "HTTP/1.1 413 Payload Too Large\nContent-Type: text/plain\nContent-Length: 0\n\n",
     25        "HTTP/1.1 414 URI Too Long\nContent-Type: text/plain\nContent-Length: 0\n\n",
    2826};
    2927
     
    4745        while(len > 0) {
    4846                // Call write
    49                 int ret = cfa_write(fd, it, len, 0, -1`s, 0p, 0p);
    50                 // int ret = write(fd, it, len);
     47                int ret = write(fd, it, len);
    5148                if( ret < 0 ) { if( errno != EAGAIN && errno != EWOULDBLOCK) abort( "'answer error' error: (%d) %s\n", (int)errno, strerror(errno) ); }
    5249
     
    6663int answer_header( int fd, size_t size ) {
    6764        const char * fmt = http_msgs[OK200];
    68         int len = 200;
     65        int len = 100;
    6966        char buffer[len];
    70         len = snprintf(buffer, len, fmt, date, size);
     67        len = snprintf(buffer, len, fmt, size);
    7168        return answer( fd, buffer, len );
    7269}
    7370
    74 int answer_plain( int fd, char buffer[], size_t size ) {
    75         int ret = answer_header(fd, size);
    76         if( ret < 0 ) return ret;
    77         return answer(fd, buffer, size);
    78 }
    79 
    80 int answer_empty( int fd ) {
    81         return answer_header(fd, 0);
    82 }
    83 
    84 
    85 [HttpCode code, bool closed, * const char file, size_t len] http_read(int fd, []char buffer, size_t len, io_cancellation * cancel) {
     71[HttpCode code, bool closed, * const char file, size_t len] http_read(int fd, []char buffer, size_t len) {
    8672        char * it = buffer;
    8773        size_t count = len - 1;
     
    8975        READ:
    9076        for() {
    91                 int ret = cfa_read(fd, (void*)it, count, 0, -1`s, cancel, 0p);
    92                 // int ret = read(fd, (void*)it, count);
     77                int ret = cfa_read(fd, (void*)it, count, 0, -1`s, 0p, 0p);
    9378                if(ret == 0 ) return [OK200, true, 0, 0];
    9479                if(ret < 0 ) {
    9580                        if( errno == EAGAIN || errno == EWOULDBLOCK) continue READ;
    96                         // if( errno == EINVAL ) return [E400, true, 0, 0];
    9781                        abort( "read error: (%d) %s\n", (int)errno, strerror(errno) );
    9882                }
     
    10892        }
    10993
    110         if( options.log ) printf("%.*s\n", rlen, buffer);
     94        printf("%.*s\n", rlen, buffer);
    11195
    11296        it = buffer;
     
    120104
    121105void sendfile( int pipe[2], int fd, int ans_fd, size_t count ) {
    122         unsigned sflags = SPLICE_F_MOVE; // | SPLICE_F_MORE;
    123106        off_t offset = 0;
    124107        ssize_t ret;
    125108        SPLICE1: while(count > 0) {
    126                 ret = cfa_splice(ans_fd, &offset, pipe[1], 0p, count, sflags, 0, -1`s, 0p, 0p);
    127                 // ret = splice(ans_fd, &offset, pipe[1], 0p, count, sflags);
     109                ret = cfa_splice(ans_fd, &offset, pipe[1], 0p, count, SPLICE_F_MOVE | SPLICE_F_MORE, 0, -1`s, 0p, 0p);
    128110                if( ret < 0 ) {
    129111                        if( errno != EAGAIN && errno != EWOULDBLOCK) continue SPLICE1;
     
    135117                size_t in_pipe = ret;
    136118                SPLICE2: while(in_pipe > 0) {
    137                         ret = cfa_splice(pipe[0], 0p, fd, 0p, in_pipe, sflags, 0, -1`s, 0p, 0p);
    138                         // ret = splice(pipe[0], 0p, fd, 0p, in_pipe, sflags);
     119                        ret = cfa_splice(pipe[0], 0p, fd, 0p, in_pipe, SPLICE_F_MOVE | SPLICE_F_MORE, 0, -1`s, 0p, 0p);
    139120                        if( ret < 0 ) {
    140121                                if( errno != EAGAIN && errno != EWOULDBLOCK) continue SPLICE2;
     
    146127        }
    147128}
    148 
    149 //=============================================================================================
    150 
    151 #include <clock.hfa>
    152 #include <time.hfa>
    153 #include <thread.hfa>
    154 
    155 struct date_buffer {
    156         char buff[100];
    157 };
    158 
    159 thread DateFormater {
    160         int idx;
    161         date_buffer buffers[2];
    162 };
    163 
    164 void ?{}( DateFormater & this ) {
    165         ((thread&)this){ "Server Date Thread", *options.clopts.instance };
    166         this.idx = 0;
    167         memset( this.buffers[0].buff, 0, sizeof(this.buffers[0]) );
    168         memset( this.buffers[1].buff, 0, sizeof(this.buffers[1]) );
    169 }
    170 
    171 void main(DateFormater & this) {
    172         LOOP: for() {
    173                 waitfor( ^?{} : this) {
    174                         break LOOP;
    175                 }
    176                 or else {}
    177 
    178                 Time now = getTimeNsec();
    179 
    180                 strftime( this.buffers[this.idx].buff, 100, "%a, %d %b %Y %H:%M:%S %Z", now );
    181 
    182                 char * next = this.buffers[this.idx].buff;
    183                 __atomic_exchange_n((char * volatile *)&date, next, __ATOMIC_SEQ_CST);
    184                 this.idx = (this.idx + 1) % 2;
    185 
    186                 sleep(1`s);
    187         }
    188 }
    189 
    190 //=============================================================================================
    191 DateFormater * the_date_formatter;
    192 
    193 void init_protocol(void) {
    194         the_date_formatter = alloc();
    195         (*the_date_formatter){};
    196 }
    197 
    198 void deinit_protocol(void) {
    199         ^(*the_date_formatter){};
    200         free( the_date_formatter );
    201 }
  • benchmark/io/http/protocol.hfa

    r342af53 r4468a70  
    11#pragma once
    2 
    3 struct io_cancellation;
    42
    53enum HttpCode {
     
    1614int answer_error( int fd, HttpCode code );
    1715int answer_header( int fd, size_t size );
    18 int answer_plain( int fd, char buffer [], size_t size );
    19 int answer_empty( int fd );
    2016
    21 [HttpCode code, bool closed, * const char file, size_t len] http_read(int fd, []char buffer, size_t len, io_cancellation *);
     17[HttpCode code, bool closed, * const char file, size_t len] http_read(int fd, []char buffer, size_t len);
    2218
    2319void sendfile( int pipe[2], int fd, int ans_fd, size_t count );
  • benchmark/io/http/worker.cfa

    r342af53 r4468a70  
    1919        this.pipe[0] = -1;
    2020        this.pipe[1] = -1;
    21         this.done = false;
    22 }
    23 
    24 extern "C" {
    25 extern int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags);
    2621}
    2722
     
    3328        CONNECTION:
    3429        for() {
    35                 if( options.log ) printf("=== Accepting connection ===\n");
    36                 int fd = cfa_accept4( this.[sockfd, addr, addrlen, flags], 0, -1`s, &this.cancel, 0p );
    37                 // int fd = accept4( this.[sockfd, addr, addrlen, flags] );
     30                int fd = cfa_accept4( this.[sockfd, addr, addrlen, flags], 0, -1`s, 0p, 0p );
    3831                if(fd < 0) {
    3932                        if( errno == ECONNABORTED ) break;
    40                         if( errno == EINVAL && this.done ) break;
    4133                        abort( "accept error: (%d) %s\n", (int)errno, strerror(errno) );
    4234                }
    4335
    44                 if( options.log ) printf("=== New connection %d, waiting for requests ===\n", fd);
     36                printf("New connection %d, waiting for requests\n", fd);
    4537                REQUEST:
    4638                for() {
     
    5345                        size_t len = options.socket.buflen;
    5446                        char buffer[len];
    55                         if( options.log ) printf("=== Reading request ===\n");
    56                         [code, closed, file, name_size] = http_read(fd, buffer, len, &this.cancel);
     47                        printf("Reading request\n");
     48                        [code, closed, file, name_size] = http_read(fd, buffer, len);
    5749
    5850                        // if we are done, break out of the loop
    5951                        if( closed ) {
    60                                 if( options.log ) printf("=== Connection closed ===\n");
    61                                 close(fd);
     52                                printf("Connection closed\n");
    6253                                continue CONNECTION;
    6354                        }
     
    6556                        // If this wasn't a request retrun 400
    6657                        if( code != OK200 ) {
    67                                 printf("=== Invalid Request : %d ===\n", code_val(code));
     58                                printf("Invalid Request : %d\n", code_val(code));
    6859                                answer_error(fd, code);
    6960                                continue REQUEST;
    7061                        }
    7162
    72                         if(0 == strncmp(file, "plaintext", min(name_size, sizeof("plaintext") ))) {
    73                                 if( options.log ) printf("=== Request for /plaintext ===\n");
    74 
    75                                 char text[] = "Hello, World!\n";
    76 
    77                                 // Send the header
    78                                 answer_plain(fd, text, sizeof(text));
    79 
    80                                 if( options.log ) printf("=== Answer sent ===\n");
    81                                 continue REQUEST;
    82                         }
    83 
    84                         if(0 == strncmp(file, "ping", min(name_size, sizeof("ping") ))) {
    85                                 if( options.log ) printf("=== Request for /ping ===\n");
    86 
    87                                 // Send the header
    88                                 answer_empty(fd);
    89 
    90                                 if( options.log ) printf("=== Answer sent ===\n");
    91                                 continue REQUEST;
    92                         }
    93 
    94                         if( options.log ) printf("=== Request for file %.*s ===\n", (int)name_size, file);
     63                        printf("Request for file %.*s\n", (int)name_size, file);
    9564
    9665                        // Get the fd from the file cache
     
    10170                        // If we can't find the file, return 404
    10271                        if( ans_fd < 0 ) {
    103                                 printf("=== File Not Found ===\n");
     72                                printf("File Not Found\n");
    10473                                answer_error(fd, E404);
    10574                                continue REQUEST;
     
    11281                        sendfile( this.pipe, fd, ans_fd, count);
    11382
    114                         if( options.log ) printf("=== Answer sent ===\n");
     83                        printf("File sent\n");
    11584                }
    11685        }
  • benchmark/io/http/worker.hfa

    r342af53 r4468a70  
    1717        socklen_t * addrlen;
    1818        int flags;
    19         io_cancellation cancel;
    20         volatile bool done;
    2119};
    2220void ?{}( Worker & this);
  • benchmark/io/readv.cfa

    r342af53 r4468a70  
    9696
    9797        char **left;
    98         parse_args( opt, opt_cnt, "[OPTIONS]...\ncforall readv benchmark", left );
     98        parse_args( opt, opt_cnt, "[OPTIONS]...\ncforall yield benchmark", left );
    9999
    100100        if(kpollcp || odirect) {
    101101                if( (buflen % 512) != 0 ) {
    102102                        fprintf(stderr, "Buffer length must be a multiple of 512 when using O_DIRECT, was %lu\n\n", buflen);
    103                         print_args_usage(opt, opt_cnt, "[OPTIONS]...\ncforall readv benchmark", true);
     103                        print_args_usage(opt, opt_cnt, "[OPTIONS]...\ncforall yield benchmark", true);
    104104                }
    105105        }
  • doc/bibliography/pl.bib

    r342af53 r4468a70  
    990990}
    991991
    992 @techreport{cfa-cc,
    993     keywords    = {Cforall, cfa-cc, transpiler},
    994     contributer = {pabuhr@plg},
    995     title       = {{\textsf{cfa-cc}} Developer's Reference Manual},
    996     author      = {Fangren Yu},
    997     institution = {School of Computer Science},
    998     address     = {University of Waterloo, Waterloo, Ontario, Canada},
    999     month       = aug,
    1000     year        = {2020},
    1001     note        = {\href{https://cforall.uwaterloo.ca/doc/Fangren_Yu_Report_S20.pdf}{https://\-cforall.uwaterloo.ca/\-doc/\-Fangren\_Yu\_Report\_S20.pdf}},
    1002 }
    1003 
    1004992@article{Moss18,
    1005993    keywords    = {type systems, polymorphism, tuples, Cforall},
     
    10521040    keywords    = {type system, generic type, resolution algorithm, type environment, Cforall},
    10531041    author      = {Aaron Moss},
    1054     title       = {\textsf{C}\,$\mathbf{\forall}$ Type System Implementation},
     1042    title       = {\textsf{C}$\mathbf{\forall}$ Type System Implementation},
    10551043    school      = {School of Computer Science, University of Waterloo},
    10561044    year        = 2019,
     
    11731161    keywords    = {ctrie, concurrent map},
    11741162    contributer = {a3moss@uwaterloo.ca},
    1175     title       = {Cache-aware lock-free concurrent hash tries},
    1176     author      = {Prokopec, Aleksandar and Bagwell, Phil and Odersky, Martin},
    1177     institution = {EPFL},
    1178     year        = {2011}
     1163    title       ={Cache-aware lock-free concurrent hash tries},
     1164    author      ={Prokopec, Aleksandar and Bagwell, Phil and Odersky, Martin},
     1165    institution ={EPFL},
     1166    year        ={2011}
    11791167}
    11801168
     
    39403928    school      = {School of Computer Science, University of Waterloo},
    39413929    year        = 2003,
    3942     optaddress  = {Waterloo, Ontario, Canada, N2L 3G1},
     3930    address     = {Waterloo, Ontario, Canada, N2L 3G1},
    39433931    note        = {\href{http://plg.uwaterloo.ca/theses/BilsonThesis.pdf}{http://\-plg.uwaterloo.ca/\-theses/\-BilsonThesis.pdf}},
    39443932}
     
    52225210}
    52235211
    5224 @manual{gcc-nested-func,
    5225     keywords    = {gcc nested functions},
    5226     contributer = {pabuhr@plg},
    5227     key         = {gcc nested functions},
    5228     title       = {Nested Functions},
    5229     organization= {{gcc} 9.3 Manual},
    5230     year        = 2019,
    5231     note        = {\href{https://gcc.gnu.org/onlinedocs/gcc-9.3.0/gcc/Nested-Functions.html}{https://\-gcc.gnu.org/\-onlinedocs/\-gcc-9.3.0/\-gcc/\-Nested-Functions.html}},
    5232 }
    5233 
    52345212@article{Haddon77,
    52355213    keywords    = {monitors, nested monitor calls},
     
    74357413}
    74367414
    7437 @manual{POSIX17,
    7438     keywords    = {POSIX, Standard},
    7439     contributer = {pabuhr@plg},
    7440     key         = {POSIX},
    7441     title       = {1003.1 Standard for Information Technology -- Portable Operating System Interface (POSIX), Base Specifications, Issue 7},
    7442     organization= {IEEE and The Open Group},
    7443     year        = 2017,
    7444     note        = {\href{https://pubs.opengroup.org/onlinepubs/9699919799}{https://\-pubs.opengroup.org/\-onlinepubs/\-9699919799}},
    7445 }
    7446 
    74477415@inproceedings{ML:NJ,
    74487416    keywords    = {continuations, ML},
  • doc/theses/andrew_beach_MMath/existing.tex

    r342af53 r4468a70  
    8383the the call site.
    8484
    85 As an example, even if no function named \codeCFA{do_once} is not defined
    86 near the definition of \codeCFA{do_twice} the following code will work.
     85As an example, even if no function named \codeCFA{do\_once} is not defined
     86near the definition of \codeCFA{do\_twice} the following code will work.
    8787\begin{lstlisting}
    8888int quadruple(int x) {
     
    9595\end{lstlisting}
    9696This is not the recommended way to implement a quadruple function but it
    97 does work. The complier will deduce that \codeCFA{do_twice}'s T is an
     97does work. The complier will deduce that \codeCFA{do\_twice}'s T is an
    9898integer from the argument. It will then look for a definition matching the
    99 assertion which is the \codeCFA{do_once} defined within the function. That
    100 function will be passed in as a function pointer to \codeCFA{do_twice} and
     99assertion which is the \codeCFA{do\_once} defined within the function. That
     100function will be passed in as a function pointer to \codeCFA{do\_twice} and
    101101called within it.
    102102
     
    156156In \CFA coroutines are created using the \codeCFA{coroutine} keyword which
    157157works just like \codeCFA{struct} except that the created structure will be
    158 modified by the compiler to satify the \codeCFA{is_coroutine} trait.
     158modified by the compiler to satify the \codeCFA{is\_coroutine} trait.
    159159
    160160These structures act as the interface between callers and the coroutine,
  • doc/theses/fangren_yu_COOP_S20/Report.tex

    r342af53 r4468a70  
    5656
    5757\title{\Huge
    58 \lstinline|cfa-cc| Developer's Reference
     58cfa-cc Developer's Reference
    5959}% title
    6060
     
    728728The \CFA compiler sets a limit on assertion depth and reports an error if assertion resolution does not terminate within the limit (as for \lstinline[language=C++]@templates@ in \CC).
    729729
    730 \addcontentsline{toc}{section}{\refname}
    731730\bibliographystyle{plain}
    732731\bibliography{pl}
  • doc/theses/thierry_delisle_PhD/thesis/Makefile

    r342af53 r4468a70  
    88BibTeX = BIBINPUTS=${TeXLIB} && export BIBINPUTS && bibtex
    99
    10 MAKEFLAGS = --no-print-directory --silent
     10MAKEFLAGS = --no-print-directory --silent #
    1111VPATH = ${Build} ${Figures}
    1212
     
    6666
    6767build/%.dvi : %.tex Makefile | ${Build}
    68         # Conditionally create an empty *.ind (index) file for inclusion until makeindex is run.
    69         if [ ! -r ${basename $@}.ind ] ; then touch ${basename $@}.ind ; fi
    7068        # Must have *.aux file containing citations for bibtex
    7169        if [ ! -r ${basename $@}.aux ] ; then ${LaTeX} $< ; fi
     
    7674        # Make index from *.aux entries and input index at end of document
    7775        -makeglossaries -q -s ${basename $@}.ist ${basename $@}
    78         # Make index from *.aux entries and input index at end of document
    79         -makeindex ${basename $@}.idx
    8076        # Run again to finish citations
    8177        ${LaTeX} $<
  • doc/theses/thierry_delisle_PhD/thesis/text/core.tex

    r342af53 r4468a70  
    11\chapter{Scheduling Core}\label{core}
    22
    3 Before discussing scheduling in general, where it is important to address systems that are changing states, this document discusses scheduling in a somewhat ideal scenario, where the system has reached a steady state. For this purpose, a steady state is loosely defined as a state where there are always \glspl{thrd} ready to run and the system has the resources necessary to accomplish the work, \eg, enough workers. In short, the system is neither overloaded nor underloaded.
     3Before discussing scheduling in general, where it is important to address systems that are changing states, this document discusses scheduling in a somewhat ideal scenerio, where the system has reached a steady state. For this purpose, a steady state is loosely defined as a state where there are always \glspl{thrd} ready to run and the system has the ressources necessary to accomplish the work, \eg, enough workers. In short, the system is neither overloaded nor underloaded.
    44
    5 I believe it is important to discuss the steady state first because it is the easiest case to handle and, relatedly, the case in which the best performance is to be expected. As such, when the system is either overloaded or underloaded, a common approach is to try to adapt the system to this new load and return to the steady state, \eg, by adding or removing workers. Therefore, flaws in scheduling the steady state can to be pervasive in all states.
     5I believe it is important to discuss the steady state first because it is the easiest case to handle and, relatedly, the case in which the best performance is to be expected. As such, when the system is either overloaded or underloaded, a common approach is to try to adapt the system to the new load and return to the steady state, \eg, adding or removing workers. Flaws in the scheduling when the system is in the steady state can therefore to be pervasive in all states.
    66
    77\section{Design Goals}
    8 As with most of the design decisions behind \CFA, an important goal is to match the expectation of the programmer according to their execution mental-model. To match expectations, the design must offer the programmer sufficient guarantees so that, as long as they respect the execution mental-model, the system also respects this model.
     8As with most of the design decisions behind \CFA, an important goal is to match the expectation of the programmer, according to their probable mental model. To match these expectations, the design must offer the programmers sufficient guarantees so that, as long as they respect the mental model, the system will also respect this model.
    99
    10 For threading, a simple and common execution mental-model is the ``Ideal multi-tasking CPU'' :
     10For threading, a simple and common mental model is the ``Ideal multi-tasking CPU'' :
    1111
    1212\begin{displayquote}[Linux CFS\cit{https://www.kernel.org/doc/Documentation/scheduler/sched-design-CFS.txt}]
    1313        {[The]} ``Ideal multi-tasking CPU'' is a (non-existent  :-)) CPU that has 100\% physical power and which can run each task at precise equal speed, in parallel, each at [an equal fraction of the] speed.  For example: if there are 2 tasks running, then it runs each at 50\% physical power --- i.e., actually in parallel.
    14         \label{q:LinuxCFS}
    1514\end{displayquote}
    1615
    1716Applied to threads, this model states that every ready \gls{thrd} immediately runs in parallel with all other ready \glspl{thrd}. While a strict implementation of this model is not feasible, programmers still have expectations about scheduling that come from this model.
    1817
    19 In general, the expectation at the center of this model is that ready \glspl{thrd} do not interfere with each other but simply share the hardware. This assumption makes it easier to reason about threading because ready \glspl{thrd} can be thought of in isolation and the effect of the scheduler can be virtually ignored. This expectation of \gls{thrd} independence means the scheduler is expected to offer two guarantees:
     18In general, the expectation at the center of this model is that ready \glspl{thrd} do not interfere with eachother but simply share the hardware. This makes it easier to reason about threading because ready \glspl{thrd} can be taken in isolation and the effect of the scheduler can be virtually ignored. This expectation of \gls{thrd} independence means the scheduler is expected to offer two guarantees:
    2019\begin{enumerate}
    21         \item A fairness guarantee: a \gls{thrd} that is ready to run is not prevented by another thread.
    22         \item A performance guarantee: a \gls{thrd} that wants to start or stop running is not prevented by other threads wanting to do the same.
     20        \item A fairness guarantee: a \gls{thrd} that is ready to run will not be prevented to do so by another thread.
     21        \item A performance guarantee: a \gls{thrd} that wants to start or stop running will not be slowed down by other threads wanting to do the same.
    2322\end{enumerate}
    2423
    25 It is important to note that these guarantees are expected only up to a point. \Glspl{thrd} that are ready to run should not be prevented to do so, but they still share the limited hardware resources. Therefore, the guarantee is considered respected if a \gls{thrd} gets access to a \emph{fair share} of the hardware resources, even if that share is very small.
     24It is important to note that these guarantees are expected only up to a point. \Glspl{thrd} that are ready to run should not be prevented to do so, but they still need to share a limited amount of hardware. Therefore, the guarantee is considered respected if a \gls{thrd} gets access to a \emph{fair share} of the hardware, even if that share is very small.
    2625
    27 Similarly the performance guarantee, the lack of interference among threads, is only relevant up to a point. Ideally, the cost of running and blocking should be constant regardless of contention, but the guarantee is considered satisfied if the cost is not \emph{too high} with or without contention. How much is an acceptable cost is obviously highly variable. For this document, the performance experimentation attempts to show the cost of scheduling is at worst equivalent to existing algorithms used in popular languages. This demonstration can be made by comparing applications built in \CFA to applications built with other languages or other models. Recall programmer expectation is that the impact of the scheduler can be ignored. Therefore, if the cost of scheduling is equivalent to or lower than other popular languages, I consider the guarantee achieved.
     26Similarly the performance guarantee, the lack of interferance between threads, is only relevant up to a point. Ideally the cost of running and blocking would be constant regardless of contention, but the guarantee is considered satisfied if the cost is not \emph{too high} with or without contention. How much is an acceptable cost is obviously highly variable. For this document the performance experimentation will attempt to show that the cost of scheduling is at worst equivalent to existing algorithms used in popular languages. This demonstration can be made by comparing application built in \CFA to applications built with other languages or other models. Recall from a few paragraphs ago that the expectation of programmers is that the impact of the scheduler can be ignored. Therefore, if the cost of scheduling is equivalent or lower to other popular languages, I will consider the guarantee achieved.
    2827
    2928More precisely the scheduler should be:
    3029\begin{itemize}
    3130        \item As fast as other schedulers that are less fair.
    32         \item Faster than other schedulers that have equal or better fairness.
     31        \item Faster than other scheduler that have equal or better fairness.
    3332\end{itemize}
    3433
    3534\subsection{Fairness vs Scheduler Locality}
    36 An important performance factor in modern architectures is cache locality. Waiting for data at lower levels or not present in the cache can have a major impact on performance. Having multiple \glspl{hthrd} writing to the same cache lines also leads to cache lines that must be waited on. It is therefore preferable to divide data among each \gls{hthrd}\footnote{This partitioning can be an explicit division up front or using data structures where different \glspl{hthrd} are naturally routed to different cache lines.}.
     35An important performance factor in modern architectures is cache locality. Waiting for data not present in the cache can have a major impact on performance, and having multiple \glspl{hthrd} writing to the same cache lines can lead to cache lines that need to be waited on again. It is therefore preferable to divide the data among each \gls{hthrd}\footnote{This can be an explicit division up front or using data structures where different \glspl{hthrd} are naturally routed to different cache lines.}.
    3736
    38 For a scheduler, having good locality\footnote{This section discusses \emph{internal locality}, \ie, the locality of the data used by the scheduler versus \emph{external locality}, \ie, how the data used by the application is affected by scheduling. External locality is a much more complicated subject and is discussed in part~\ref{Evaluation} on evaluation.}, \ie, having the data local to each \gls{hthrd}, generally conflicts with fairness. Indeed, good locality often requires avoiding the movement of cache lines, while fairness requires dynamically moving a \gls{thrd}, and as consequence cache lines, to a \gls{hthrd} that is currently available.
     37For a scheduler, having good locality\footnote{This section discusses \emph{internal} locality, \ie, the locality of the data used by the scheduler. \emph{External locality}, \ie, how the data used by the application is affected by scheduling, is a much more complicated subject and will be discussed in the chapters on evaluation.}, \ie, having the data be local to each \gls{hthrd}, generally conflicts with fairness. Indeed, good locality often requires avoiding the movement of cache lines, while fairness requires dynamically moving \gls{thrd}, and as a consequence cache lines, to \glspl{hthrd} that are currently more appropriate.
    3938
    40 However, I claim that in practice it is possible to strike a balance between fairness and performance because these goals do not necessarily overlap temporally, where Figure~\ref{fig:fair} shows a visual representation of this behaviour. As mentioned, some unfairness is acceptable; therefore it is desirable to have an algorithm that prioritizes cache locality as long as thread delay does not exceed the execution mental-model.
     39However, I claim that in practice it is possible to strike a balance between fairness and performance because the need for these do not necessarily overlap temporaly. Figure~\ref{fig:fair} shows an visual representation of this behaviour. As mentionned, a little bit of unfairness can be acceptable, therefore it can be desirable to have an algorithm that prioritizes cache locality as long as no threads is left behind for too long.
    4140
    4241\begin{figure}
    43         \centering
    44         \input{fairness.pstex_t}
    45         \vspace*{-10pt}
    46         \caption[Fairness vs Locality graph]{Rule of thumb Fairness vs Locality graph \smallskip\newline The importance of Fairness and Locality while a ready \gls{thrd} awaits running is shown as the time the ready \gls{thrd} waits increases, Ready Time, the chances that its data is still in cache, Locality, decreases. At the same time, the need for fairness increases since other \glspl{thrd} may have the chance to run many times, breaking the fairness model. Since the actual values and curves of this graph can be highly variable, the graph is an idealized representation of the two opposing goals.}
     42        \begin{center}
     43                \input{fairness.pstex_t}
     44        \end{center}
     45        \caption{Fairness vs Locality}
    4746        \label{fig:fair}
     47        Rule of thumb graph: Importance of Fairness and Locality while a ready \gls{thrd} waits run.
     48        As the time a ready \gls{thrd} waits increases, ``Ready Time'', the chances that its data is still in cache decreases. At the same time, the need for fairness increases since other \glspl{thrd} may have the chance to run many times, breaking the fairness model mentionned above. Since the actual values and curves of this graph can be highly variable, the graph is left intentionally fuzzy and innacurate.
    4849\end{figure}
    4950
    5051\section{Design}
    51 In general, a na\"{i}ve \glsxtrshort{fifo} ready-queue does not scale with increased parallelism from \glspl{hthrd}, resulting in decreased performance. The problem is adding/removing \glspl{thrd} is a single point of contention. As shown in the evaluation sections, most production schedulers do scale when adding \glspl{hthrd}. The common solution to the single point of contention is to shard the ready-queue so each \gls{hthrd} can access the ready-queue without contention, increasing performance though lack of contention.
     52A naive strictly \glsxtrshort{fifo} ready-queue does not offer sufficient performance. As shown in the evaluation sections, most production schedulers scale when adding multiple \glspl{hthrd} and that is not possible with a single point of contention. Therefore it is vital to shard the ready-queue so that multiple \glspl{hthrd} can access the ready-queue without performance degradation.
    5253
    5354\subsection{Sharding} \label{sec:sharding}
    54 An interesting approach to sharding a queue is presented in \cit{Trevors paper}. This algorithm presents a queue with a relaxed \glsxtrshort{fifo} guarantee using an array of strictly \glsxtrshort{fifo} sublists as shown in Figure~\ref{fig:base}. Each \emph{cell} of the array has a timestamp for the last operation and a pointer to a linked-list with a lock and each node in the list is marked with a timestamp indicating when it is added to the list. A push operation is done by picking a random cell, acquiring the list lock, and pushing to the list. If the cell is locked, the operation is simply retried on another random cell until a lock is acquired. A pop operation is done in a similar fashion except two random cells are picked. If both cells are unlocked with non-empty lists, the operation pops the node with the oldest cell timestamp. If one of the cells is unlocked and non-empty, the operation pops from that cell. If both cells are either locked or empty, the operation picks two new random cells and tries again.
     55An interesting approach to sharding a queue is presented in \cit{Trevors paper}. This algorithm represents a queue with relaxed \glsxtrshort{fifo} guarantee using an array of strictly \glsxtrshort{fifo} sublists as shown in Figure~\ref{fig:base}. Each cell of the array contains a linked-list with a lock and each node in these list is marked with a timestamp indicating when they were added to the list. Push operations are done by picking a random cell and attempting to push to its list. If the cell is already locked, the operation is simply retried on a new cell until a lock is acquired. Pop operations are done in a similar fashion except two random cells are picked. If both cells are not already locked and both cells contain non-empty lists, the operation pops the node with the oldest timestamp. If only one of the cell is unlocked and non-empty, the operation pops from that cell. If both cells are either locked or empty, the operation picks two new cells and tries again.
    5556
    5657\begin{figure}
    57         \centering
    58         \input{base.pstex_t}
    59         \caption[Relaxed FIFO list]{Relaxed FIFO list \smallskip\newline List at the base of the scheduler: an array of strictly FIFO lists. The timestamp is in all nodes and cell arrays.}
     58        \begin{center}
     59                \input{base.pstex_t}
     60        \end{center}
     61        \caption{Relaxed FIFO list}
    6062        \label{fig:base}
     63        List at the base of the scheduler: an array of strictly FIFO lists.
     64        The timestamp is in all nodes and cell arrays.
    6165\end{figure}
    6266
    6367\subsection{Finding threads}
    64 Once threads have been distributed onto multiple queues, identifying empty queues becomes a problem. Indeed, if the number of \glspl{thrd} does not far exceed the number of queues, it is probable that several of the cell queues are empty. Figure~\ref{fig:empty} shows an example with 2 \glspl{thrd} running on 8 queues, where the chances of getting an empty queue is 75\% per pick, meaning two random picks yield a \gls{thrd} only half the time. This scenario leads to performance problems since picks that do not yield a \gls{thrd} are not useful and do not necessarily help make more informed guesses.
     68Once threads have been distributed onto multiple queues, indentifying which queues are empty and which are not can become a problem. Indeed, if the number of \glspl{thrd} does not far exceed the number of queues, it is probable that several of these queues are empty. Figure~\ref{fig:empty} shows an example with 2 \glspl{thrd} running on 8 queues, where the chances of getting an empty queue is 75\% per pick, meaning two random picks yield a \gls{thrd} only half the time.
     69
    6570
    6671\begin{figure}
    67         \centering
    68         \input{empty.pstex_t}
    69         \caption[``More empty'' Relaxed FIFO list]{``More empty'' Relaxed FIFO list \smallskip\newline Emptier state of the queue: the array contains many empty cells, that is strictly FIFO lists containing no elements.}
     72        \begin{center}
     73                \input{empty.pstex_t}
     74        \end{center}
     75        \caption{``More empty'' Relaxed FIFO list}
    7076        \label{fig:empty}
     77        Emptier state of the queue: the array contains many empty cells, that is strictly FIFO lists containing no elements.
    7178\end{figure}
    7279
    73 There are several solutions to this problem, but they ultimately all have to encode if a cell has an empty list. My results show the density and locality of this encoding is generally the dominating factor in these scheme. Classic solutions to this problem use one of three techniques to encode the information:
     80This can lead to performance problems since picks that do not yield a \gls{thrd} are not useful and do not necessarily help make more informed guesses.
    7481
    75 \paragraph{Dense Information} Figure~\ref{fig:emptybit} shows a dense bitmask to identify the cell queues currently in use. This approach means processors can often find \glspl{thrd} in constant time, regardless of how many underlying queues are empty. Furthermore, modern x86 CPUs have extended bit manipulation instructions (BMI2) that allow searching the bitmask with very little overhead compared to the randomized selection approach for a filled ready queue, offering good performance even in cases with many empty inner queues. However, this technique has its limits: with a single word\footnote{Word refers here to however many bits can be written atomically.} bitmask, the total amount of ready-queue sharding is limited to the number of bits in the word. With a multi-word bitmask, this maximum limit can be increased arbitrarily, but the look-up time increases. Finally, a dense bitmap, either single or multi-word, causes additional contention problems that reduces performance because of cache misses after updates. This central update bottleneck also means the information in the bitmask is more often stale before a processor can use it to find an item, \ie mask read says there are available \glspl{thrd} but none on queue when the subsequent atomic check is done.
     82Solutions to this problem can take many forms, but they ultimately all have to encode where the threads are in some form. My results show that the density and locality of this encoding is generally the dominating factor in these scheme. Classic solutions to this problem use one of three techniques to encode the information:
    7683
    7784\begin{figure}
    78         \centering
     85        \begin{center}
     86                {\resizebox{0.73\textwidth}{!}{\input{emptybit.pstex_t}}}
     87        \end{center}
    7988        \vspace*{-5pt}
    80         {\resizebox{0.75\textwidth}{!}{\input{emptybit.pstex_t}}}
     89        \caption{Underloaded queue with added bitmask to indicate which array cells have items.}
     90        \label{fig:emptybit}
     91        \begin{center}
     92                {\resizebox{0.73\textwidth}{!}{\input{emptytree.pstex_t}}}
     93        \end{center}
    8194        \vspace*{-5pt}
    82         \caption[Underloaded queue with bitmask]{Underloaded queue with bitmask indicating array cells with items.}
    83         \label{fig:emptybit}
    84 
    85         \vspace*{10pt}
    86         {\resizebox{0.75\textwidth}{!}{\input{emptytree.pstex_t}}}
     95        \caption{Underloaded queue with added binary search tree indicate which array cells have items.}
     96        \label{fig:emptytree}
     97        \begin{center}
     98                {\resizebox{0.9\textwidth}{!}{\input{emptytls.pstex_t}}}
     99        \end{center}
    87100        \vspace*{-5pt}
    88         \caption[Underloaded queue with binary search-tree]{Underloaded queue with binary search-tree indicating array cells with items.}
    89         \label{fig:emptytree}
    90 
    91         \vspace*{10pt}
    92         {\resizebox{0.95\textwidth}{!}{\input{emptytls.pstex_t}}}
    93         \vspace*{-5pt}
    94         \caption[Underloaded queue with per processor bitmask]{Underloaded queue with per processor bitmask indicating array cells with items.}
     101        \caption{Underloaded queue with added per processor bitmask to indicate which array cells have items.}
    95102        \label{fig:emptytls}
    96103\end{figure}
    97104
    98 \paragraph{Sparse Information} Figure~\ref{fig:emptytree} shows an approach using a hierarchical tree data-structure to reduce contention and has been shown to work in similar cases~\cite{ellen2007snzi}. However, this approach may lead to poorer performance due to the inherent pointer chasing cost while still allowing significant contention on the nodes of the tree if the tree is shallow.
     105\paragraph{Dense Information} Figure~\ref{fig:emptybit} shows a dense bitmask to identify which inner queues are currently in use. This approach means processors can often find \glspl{thrd} in constant time, regardless of how many underlying queues are empty. Furthermore, modern x86 CPUs have extended bit manipulation instructions (BMI2) that allow using the bitmask with very little overhead compared to the randomized selection approach for a filled ready queue, offering good performance even in cases with many empty inner queues. However, this technique has its limits: with a single word\footnote{Word refers here to however many bits can be written atomically.} bitmask, the total number of underlying queues in the ready queue is limited to the number of bits in the word. With a multi-word bitmask, this maximum limit can be increased arbitrarily, but the look-up will nolonger be constant time. Finally, a dense bitmap, either single or multi-word, causes additional contention problems which reduces performance because of cache misses after updates. This central update bottleneck also means the information in the bitmask is more often stale before a processor can use it to find an item, \ie mask read says there are available \glspl{thrd} but none on queue.
    99106
    100 \paragraph{Local Information} Figure~\ref{fig:emptytls} shows an approach using dense information, similar to the bitmap, but each \gls{hthrd} keeps its own independent copy. While this approach can offer good scalability \emph{and} low latency, the liveliness and discovery of the information can become a problem. This case is made worst in systems with few processors where even blind random picks can find \glspl{thrd} in a few tries.
     107\paragraph{Sparse Information} Figure~\ref{fig:emptytree} shows an approach using a hierarchical tree data-structure to reduce contention and has been shown to work in similar cases~\cite{ellen2007snzi}. However, this approach may lead to poorer performance due to the inherent pointer chasing cost while still allowing more contention on the nodes of the tree if the tree is not deep enough.
    101108
    102 I built a prototype of these approaches and none of these techniques offer satisfying performance when few threads are present. All of these approach hit the same 2 problems. First, randomly picking sub-queues is very fast but means any improvement to the hit rate can easily be countered by a slow-down in look-up speed when there are empty lists. Second, the array is already as sharded to avoid contention bottlenecks, so any denser data structure tends to become a bottleneck. In all cases, these factors meant the best cases scenario, \ie many threads, would get worst throughput, and the worst-case scenario, few threads, would get a better hit rate, but an equivalent poor throughput. As a result I tried an entirely different approach.
     109\paragraph{Local Information} Figure~\ref{fig:emptytls} shows an approach using dense information, similar to the bitmap, but have each thread keep its own independent copy of it. While this approach can offer good scalability \emph{and} low latency, the liveliness and discovery of the information can become a problem. This case is made worst in systems with few processors where even blind random picks can find \glspl{thrd} in few tries.
     110
     111I built a prototype of these approach and none of these techniques offer satisfying performance when few threads are present. All of these approach hit the same 2 problems. First, blindly picking two sub-queues is very fast which means that any improvement to the hit rate can easily be countered by a slow-down in look-up speed. Second, the array is already as sharded as is needed to avoid contention bottlenecks, so any denser data structure will tend to become a bottleneck. In all cases, these factors meant that the best cases scenerio, many threads, would get worst throughput and the worst case scenario, few threads, would get a better hit rate, but an equivalent throughput. As a result I tried an entirely different approach.
    103112
    104113\subsection{Dynamic Entropy}\cit{https://xkcd.com/2318/}
    105 In the worst-case scenario there are only few \glspl{thrd} ready to run, or more precisely given $P$ \glspl{proc}\footnote{For simplicity, this assumes there is a one-to-one match between \glspl{proc} and \glspl{hthrd}.}, $T$ \glspl{thrd} and $\epsilon$ a very small number, than the worst case scenario can be represented by $\epsilon \ll P$, than $T = P + \epsilon$. It is important to note in this case that fairness is effectively irrelevant. Indeed, this case is close to \emph{actually matching} the model of the ``Ideal multi-tasking CPU'' on page \pageref{q:LinuxCFS}. In this context, it is possible to use a purely internal-locality based approach and still meet the fairness requirements. This approach simply has each \gls{proc} running a single \gls{thrd} repeatedly. Or from the shared ready-queue viewpoint, each \gls{proc} pushes to a given sub-queue and then popes from the \emph{same} subqueue. In cases where $T \gg P$, the scheduler should also achieves similar performance without affecting the fairness guarantees.
     114In the worst case scenario there are few \glspl{thrd} ready to run, or more accuratly given $P$ \glspl{proc}, $T$ \glspl{thrd} and $\epsilon$, as usual, a very small number, in this case $\epsilon \ll P$, we have $T = P + \epsilon$. An important thing to note is that in this case, fairness is effectively irrelevant. Indeed, this case is close to \emph{actually matching} the model of the ``Ideal multi-tasking CPU'' presented in this chapter\footnote{For simplicity, this assumes there is a one-to-one match between \glspl{proc} and \glspl{hthrd}.}. Therefore, in this context it is possible to use a purely internal locality based approach and still meet the fairness requirements. This approach would simply have each \gls{proc} running a single \gls{thrd} repeatedly. Or from the shared ready-queue viewpoint, each \gls{proc} would push to a given sub-queue and then pop from the \emph{same} subqueue. Ideally, the scheduler would achieve this without affecting the fairness guarantees in cases where $T \gg P$.
    106115
    107 To handle this case, I use a pseudo random-number generator, \glsxtrshort{prng} in a novel way. When the scheduler uses a \glsxtrshort{prng} instance per \gls{proc} exclusively, the random-number seed effectively starts an encoding that produces a list of all accessed subqueues, from latest to oldest. The novel approach is to be able to ``replay'' the \glsxtrshort{prng} backwards and there exist \glsxtrshort{prng}s that are fast, compact \emph{and} can be run forward and backwards. Linear congruential generators~\cite{wiki:lcg} are an example of \glsxtrshort{prng}s that match these requirements.
     116To achieve this, I use a communication channel I have not mentionned yet and which I believe I use in a novel way : the \glsxtrshort{prng}. If the scheduler has a \glsxtrshort{prng} instance per \gls{proc} exclusively used for scheduling, its seed effectively encodes a list of all the accessed subqueues, from the latest to the oldest. The only requirement to achieve this is to be able to ``replay'' the \glsxtrshort{prng} backwards. As it turns out, this is an entirely reasonnable requirement and there already exist \glsxtrshort{prng}s that are fast, compact \emph{and} can be run forward and backwards. Linear congruential generators\cite{wiki:lcg} are an example of \glsxtrshort{prng}s that match these requirements.
    108117
    109 The algorithm works as follows:
     118The algorithm works as follows :
    110119\begin{itemize}
    111120        \item Each \gls{proc} has two \glsxtrshort{prng} instances, $F$ and $B$.
    112         \item Push and Pop operations occur as discussed in Section~\ref{sec:sharding} with the following exceptions:
     121        \item Push and Pop operations happen as mentionned in Section~\ref{sec:sharding} with the following exceptions:
    113122        \begin{itemize}
    114123                \item Push operations use $F$ going forward on each try and on success $F$ is copied into $B$.
     
    117126\end{itemize}
    118127
    119 The main benefit of this technique is that it basically respects the desired properties of Figure~\ref{fig:fair}. When looking for work, a \gls{proc} first looks at the last cell they pushed to, if any, and then move backwards through its accessed cells. As the \gls{proc} continues looking for work, $F$ moves backwards and $B$ stays in place. As a result, the relation between the two becomes weaker, which means that the probablisitic fairness of the algorithm reverts to normal. Chapter~\ref{proofs} discusses more formally the fairness guarantees of this algorithm.
     128The main benefit of this technique is that it basically repects the desired properties of Figure~\ref{fig:fair}. When looking for work, \glspl{proc} will look first at the last cells they pushed to, if any, and then move backwards through the cells. As the \glspl{proc} continue looking for work, $F$ moves back and $B$ stays in place. As a result the relation between the two becomes weaker, which means that the probablisitic fairness of the algorithm reverts to normal. Chapter~\ref{proofs} discusses more formally the fairness guarantees of this algorithm.
    120129
    121130\section{Details}
  • doc/theses/thierry_delisle_PhD/thesis/text/io.tex

    r342af53 r4468a70  
    55Since \glsxtrshort{io} operations are generally handled by the
    66
    7 \subsection{\lstinline|epoll|, \lstinline|poll| and \lstinline|select|}
     7\subsection{\texttt{epoll}, \texttt{poll} and \texttt{select}}
    88
    99\subsection{Linux's AIO}
     
    3333
    3434\subsection{\texttt{io\_uring}}
    35 A very recent addition to Linux, @io_uring@\cit{io\_uring} is a framework that aims to solve many of the problems listed with the above mentioned solutions.
     35A very recent addition to Linux, \texttt{io\_uring}\cit{io\_uring} is a framework that aims to solve many of the problems listed with the above mentioned solutions.
    3636
    3737\subsection{Extra Kernel Threads}\label{io:morethreads}
  • doc/theses/thierry_delisle_PhD/thesis/text/runtime.tex

    r342af53 r4468a70  
    11\chapter{\CFA Runtime}
    2 This chapter presents an overview of the capabilities of the \CFA runtime prior to this thesis work.
     2This chapter offers an overview of the capabilities of the \CFA runtime prior to this work.
    33
    4 \Celeven introduced threading features, such the @_Thread_local@ storage class, and libraries @stdatomic.h@ and @threads.h@. Interestingly, almost a decade after the \Celeven standard, the most recent versions of gcc, clang, and msvc do not support the \Celeven include @threads.h@, indicating no interest in the C11 concurrency approach (possibly because of the recent effort to add concurrency to \CC). While the \Celeven standard does not state a threading model, the historical association with pthreads suggests implementations would adopt kernel-level threading (1:1)~\cite{ThreadModel}, as for \CC. This model uses \glspl{kthrd} to achieve parallelism and concurrency. In this model, every thread of computation maps to an object in the kernel. The kernel then has the responsibility of managing these threads, \eg creating, scheduling, blocking. This also entails that the kernel has a perfect view of every thread executing in the system\footnote{This is not completely true due to primitives like \lstinline|futex|es, which have a significant portion of their logic in user space.}.
     4Threading in \CFA offers is based on \Gls{uthrding}, where \glspl{thrd} are the representation of a unit of work. As such, \CFA programmers should expect these units to be fairly inexpensive, that is: programmers should be able to create a large number of \glspl{thrd} and switch between \glspl{thrd} liberally without many concerns for performance.
    55
    66\section{M:N Threading}\label{prev:model}
    77
    8 Threading in \CFA is based on \Gls{uthrding}, where \glspl{thrd} are the representation of a unit of work. As such, \CFA programmers should expect these units to be fairly inexpensive, \ie programmers should be able to create a large number of \glspl{thrd} and switch among \glspl{thrd} liberally without many concerns for performance.
     8C traditionnally uses a 1:1 threading model. This model uses \glspl{kthrd} to achive parallelism and concurrency. In this model, every thread of computation maps to an object in the kernel. The kernel then has the responsibility of managing these threads, \eg creating, scheduling, blocking. This also entails that the kernel has a perfect view of every thread executing in the system\footnote{This is not completly true due to primitives like \texttt{futex}es, which have a significant portion of their logic in user space.}.
    99
    10 The \CFA M:N threading models is implemented using many user-level threads mapped onto fewer \glspl{kthrd}. The user-level threads have the same semantic meaning as a \glspl{kthrd} in the 1:1 model: they represent an independent thread of execution with its own stack. The difference is that user-level threads do not have a corresponding object in the kernel, they are handled by the runtime in user space and scheduled onto \glspl{kthrd}, referred to as \glspl{proc} in this document. \Glspl{proc} run a \gls{thrd} until it context switches out, it then chooses a different \gls{thrd} to run.
     10By contrast \CFA uses an M:N threading models, where concurrency is achieved using many user-level threads mapped onto fewer \glspl{kthrd}. The user-level threads have the same semantic meaning as a \glspl{kthrd} in the 1:1 model, they represent an independant thread of execution with it's on stack. The difference is that user-level threads do not have a corresponding object in the kernel, they are handled by the runtime in user space and scheduled onto \glspl{kthrd}, referred to as \glspl{proc} in this document. \Glspl{proc} run a \gls{thrd} until it context switches out, it then choses a different \gls{thrd} to run.
    1111
    1212\section{Clusters}
    13 \CFA allows the option to group user-level threading, in the form of clusters. Both \glspl{thrd} and \glspl{proc} belong to a specific cluster. \Glspl{thrd} are only be scheduled onto \glspl{proc} in the same cluster and scheduling is done independently of other clusters. Figure~\ref{fig:system} shows an overview of the \CFA runtime, which allows programmers to tightly control parallelism. It also opens the door to handling effects like NUMA, by pining clusters to a specific NUMA node\footnote{This is not currently implemented in \CFA, but the only hurdle left is creating a generic interface for cpu masks.}.
    14 
    1513\begin{figure}
    1614        \begin{center}
    1715                \input{system.pstex_t}
    1816        \end{center}
    19         \caption[Overview of the \CFA runtime]{Overview of the \CFA runtime \newline \Glspl{thrd} are scheduled inside a particular cluster, where it only runs on the \glspl{proc} which belong to the cluster. The discrete-event manager, which handles preemption and timeout, is a \gls{kthrd} which lives outside any cluster and does not run \glspl{thrd}.}
     17        \caption{Overview of the \CFA runtime}
    2018        \label{fig:system}
     19        \Glspl{thrd} are scheduled inside a particular cluster, where it only runs on the \glspl{proc} which belong to the cluster. The discrete-event manager, which handles preemption and timeout, is a \gls{kthrd} which lives outside any cluster and does not run \glspl{thrd}.
    2120\end{figure}
     21\CFA allows the option to group user-level threading, in the form of clusters. Both \glspl{thrd} and \glspl{proc} belong to a specific cluster. \Glspl{thrd} will only be scheduled onto \glspl{proc} in the same cluster and scheduling is done independantly of other clusters. Figure~\ref{fig:system} shows an overview if this system. This allows programmers to control more tightly parallelism. It also opens the door to handling effects like NUMA, by pining clusters to specific NUMA node\footnote{This is not currently implemented in \CFA, but the only hurdle left is creating a generic interface for cpu masks.}.
    2222
    2323\section{Scheduling}
    24 The \CFA runtime previously used a \glsxtrshort{fifo} ready-queue with a single lock. This setup offers perfect fairness in terms of opportunities to run. However, it offers poor scalability, since the performance of the ready queue can never be improved by adding more \glspl{hthrd} and contention can cause significant performance degradation.
     24The \CFA runtime was previously using a strictly \glsxtrshort{fifo} ready queue with a single lock. This setup offers perfect fairness in terms of opportunities to run/ However, it offers poor scalability, since the performance of the ready queue can never be improved by adding more \glspl{hthrd}, but the contention can cause significant performance degradation.
    2525
    2626\section{\glsxtrshort{io}}\label{prev:io}
    27 Prior to this work, the \CFA runtime did not add any particular support for \glsxtrshort{io} operations. %\CFA being built on C, this means that,
    28 While all I/O operations available in C are available in \CFA, \glsxtrshort{io} operations are designed for the POSIX threading model~\cite{pthreads}. Using these 1:1 threading operations in an M:N threading model means I/O operations block \glspl{proc} instead of \glspl{thrd}. While this can work in certain cases, it limits the number of concurrent operations to the number of \glspl{proc} rather than \glspl{thrd}. It also means deadlock can occur because all \glspl{proc} are blocked even if at least one \gls{thrd} is ready to run. A simple example of this type of deadlock would be as follows:
    29 \begin{quote}
    30 Given a simple network program with 2 \glspl{thrd} and a single \gls{proc}, one \gls{thrd} sends network requests to a server and the other \gls{thrd} waits for a response from the server. If the second \gls{thrd} races ahead, it may wait for responses to requests that have not been sent yet. In theory, this should not be a problem, even if the second \gls{thrd} waits, because the first \gls{thrd} is still ready to run and should be able to get CPU time to send the request. With M:N threading, while the first \gls{thrd} is ready, the lone \gls{proc} \emph{cannot} run the first \gls{thrd} if it is blocked in the \glsxtrshort{io} operation of the second \gls{thrd}. If this happen, the system is in a synchronization deadlock\footnote{In this example, the deadlocked could be resolved if the server sends unprompted messages to the client. However, this solution is not general and may not be appropriate even in this simple case.}.
    31 \end{quote}
    32 Therefore, one of the objective of this work is to introduce \emph{User-Level \glsxtrshort{io}}, like \glslink{uthrding}{User-Level \emph{Threading}} blocks \glspl{thrd} rather than \glspl{proc} when doing \glsxtrshort{io} operations, which entails multiplexing the \glsxtrshort{io} operations of many \glspl{thrd} onto fewer \glspl{proc}. This multiplexing requires that a single \gls{proc} be able to execute multiple I/O operations in parallel. This requirement cannot be done with operations that block \glspl{proc}, \ie \glspl{kthrd}, since the first operation would prevent starting new operations for its blocking duration. Executing I/O operations in parallel requires \emph{asynchronous} \glsxtrshort{io}, sometimes referred to as \emph{non-blocking}, since the \gls{kthrd} does not block.
     27Prior to this work, the \CFA runtime did not add any particular support for \glsxtrshort{io} operations. \CFA being built on C, this means that, while all the operations available in C are available in \CFA, \glsxtrshort{io} operations are designed for the POSIX threading model\cit{pthreads}. Using these operations in a M:N threading model, when they are built for 1:1 threading, means that operations block \glspl{proc} instead of \glspl{thrd}. While this can work in certain cases, it limits the number of concurrent operations to the number of \glspl{proc} rather than \glspl{thrd}. This also means that deadlocks can occur because all \glspl{proc} are blocked even if at least one \gls{thrd} is ready to run. A simple example of this type of deadlock would be as follows:
    3328
    34 \section{Interoperating with C}
    35 While \glsxtrshort{io} operations are the classical example of operations that block \glspl{kthrd}, the non-blocking challenge extends to all blocking system-calls. The POSIX standard states~\cite[\S~2.9.1]{POSIX17}:
    36 \begin{quote}
    37 All functions defined by this volume of POSIX.1-2017 shall be thread-safe, except that the following functions1 need not be thread-safe. ... (list of 70+ potentially excluded functions)
    38 \end{quote}
    39 Only UNIX @man@ pages identify whether or not a library function is thread safe, and hence, may block on a pthread lock or system call; hence interoperability with UNIX library functions is a challenge for an M:N threading model.
     29Given a simple network program with 2 \glspl{thrd} and a single \gls{proc}, one \gls{thrd} sends network requests to a server and the other \gls{thrd} waits for response from the server. If the second \gls{thrd} races ahead, it may wait for responses to requests that have not been sent yet. In theory, this should not be a problem, even if the second \gls{thrd} waits, the first \gls{thrd} is still ready to run and should just be able to get CPU time and send the request. In practice with M:N threading, while the first \gls{thrd} is ready, the lone \gls{proc} in this example will \emph{not} try to run the first \gls{thrd} if it is blocked in the \glsxtrshort{io} operation of the second \gls{thrd}. If this happen, the system is effectively deadlocked\footnote{In this example, the deadlocked could be resolved if the server sends unprompted messages to the client. However, this solution is not general and may not be appropriate even in this simple case.}.
    4030
    41 Languages like Go and Java, which have strict interoperability with C\cit{JNI, GoLang with C}, can control operations in C by ``sandboxing'' them, \eg a blocking function may be delegated to a \gls{kthrd}. Sandboxing may help towards guaranteeing that the kind of deadlock mentioned above does not occur.
     31One of the objective of this work, is to introduce \emph{User-Level \glsxtrshort{io}} which, as a parallel to \glslink{uthrding}{User-Level \emph{Threading}}, blocks \glspl{thrd} rather than \glspl{proc} when doing \glsxtrshort{io} operations. This entails multiplexing the \glsxtrshort{io} operations of many \glspl{thrd} onto fewer \glspl{proc}. This multiplexing requires that a single \gls{proc} be able to execute multiple operations in parallel. This cannot be done with operations that block \glspl{proc}, \ie \glspl{kthrd}, since the first operation would prevent starting new operations for its duration. Executing operations in parallel requires \emph{asynchronous} \glsxtrshort{io}, sometimes referred to as \emph{non-blocking}, since the \gls{kthrd} is not blocked.
    4232
    43 As mentioned in Section~\ref{intro}, \CFA is binary compatible with C and, as such, must support all C library functions. Furthermore, interoperability can happen at the function-call level, inline code, or C and \CFA translation units linked together. This fine-grained interoperability between C and \CFA has two consequences:
     33\section{Interoperating with \texttt{C}}
     34While \glsxtrshort{io} operations are the classical example of operations that block \glspl{kthrd}, the challenges mentioned in the previous section do not require \glsxtrshort{io} to be involved. These challenges are a product of blocking system calls rather than \glsxtrshort{io}. C offers no tools to identify whether or not a librairy function will lead to a blocking system call. This fact means interoperatability with C becomes a challenge in a M:N threading model.
     35
     36Languages like Go and Java, which have strict interoperatability with C\cit{JNI, GoLang with C}, can control operations in C by ``sandboxing'' them. They can, for example, delegate C operations to \glspl{kthrd} that are not \glspl{proc}. Sandboxing may help towards guaranteeing that the deadlocks mentioned in the previous section do not occur.
     37
     38As mentioned in Section~\cit{\CFA intro}, \CFA is binary compatible with C and, as such, trivially supports calls to and from C librairies. Furthermore, interoperatability can happen within a single library, through inline code or simply C and \CFA translation units archived together. The fine-grained interoperatability between C and \CFA has two consequences:
    4439\begin{enumerate}
    45         \item Precisely identifying blocking C calls is difficult.
    46         \item Introducing new code can have a significant impact on general performance.
     40        \item Precisely identifying C calls that could block is difficult.
     41        \item Introducing code where interoperatability occurs could have a significant impact on general performance.
    4742\end{enumerate}
    48 Because of these consequences, this work does not attempt to ``sandbox'' calls to C. Therefore, it is possible for an unidentified library calls to block a \gls{kthrd} leading to deadlocks in \CFA's M:N threading model, which would not occur in a traditional 1:1 threading model. Currently, all M:N thread systems interacting with UNIX without sandboxing suffer from this problem but manage to work very well in the majority of applications. Therefore, a complete solution to this problem is outside the scope of this thesis.
     43
     44Because of these consequences, this work does not attempt to ``sandbox'' calls to C. It is possible that conflicting calls to C could lead to deadlocks on \CFA's M:N threading model where they would not in the traditionnal 1:1 threading model. However, I judge that solving this problem in general, in a way that is composable and flexible, is too complex in itself and would add too much work to this thesis. Therefore it is outside the scope of this thesis.
  • doc/theses/thierry_delisle_PhD/thesis/thesis.tex

    r342af53 r4468a70  
    120120% although it's supposed to be in both the TeX Live and MikTeX distributions. There are also documentation and
    121121% installation instructions there.
    122 \renewcommand*{\glstextformat}[1]{\textsf{#1}}
    123122
    124123\usepackage{csquotes}
     
    126125
    127126% Setting up the page margins...
    128 \setlength{\textheight}{9in}\setlength{\topmargin}{-0.45in}\setlength{\headsep}{0.25in}
    129127% uWaterloo thesis requirements specify a minimum of 1 inch (72pt) margin at the
    130128% top, bottom, and outside page edges and a 1.125 in. (81pt) gutter
     
    193191% cfa macros used in the document
    194192\input{common}
    195 \CFAStyle                                               % CFA code-style for all languages
    196 \lstset{basicstyle=\linespread{0.9}\tt}
    197193
    198194% glossary of terms to use
    199195\input{glossary}
    200 \makeindex
    201196
    202197%======================================================================
     
    235230\input{text/io.tex}
    236231\part{Evaluation}
    237 \label{Evaluation}
    238232\chapter{Theoretical and Existance Proofs}
    239233\chapter{Micro-Benchmarks}
     
    262256\addcontentsline{toc}{chapter}{\textbf{References}}
    263257
    264 \bibliography{local,pl}
     258\bibliography{local}
    265259% Tip 5: You can create multiple .bib files to organize your references.
    266260% Just list them all in the \bibliogaphy command, separated by commas (no spaces).
     
    307301\printglossary
    308302\cleardoublepage
    309 
    310 % Index
    311 % -----------------------------
    312 %\input{thesis.ind}                             % index
    313 
    314303\phantomsection
    315304
  • example/io/simple/server_epoll.c

    r342af53 r4468a70  
    8888      }
    8989
    90       ev.events = EPOLLOUT | EPOLLIN | EPOLLONESHOT;
     90      ev.events = EPOLLIN | EPOLLONESHOT;
    9191      ev.data.u64 = (uint64_t)&ring;
    9292      if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ring.ring_fd, &ev) == -1) {
     
    9999
    100100        while(1) {
    101             BLOCK:;
     101            BLOCK:
    102102            int nfds = epoll_wait(epollfd, events, MAX_EVENTS, -1);
    103103            if (nfds == -1) {
  • libcfa/src/Makefile.am

    r342af53 r4468a70  
    5858        concurrency/iofwd.hfa \
    5959        containers/list.hfa \
    60         containers/stackLockFree.hfa \
    61         vec/vec.hfa \
    62         vec/vec2.hfa \
    63         vec/vec3.hfa \
    64         vec/vec4.hfa
     60        containers/stackLockFree.hfa
    6561
    6662inst_headers_src = \
     
    9894        concurrency/clib/cfathread.h \
    9995        concurrency/invoke.h \
    100         concurrency/future.hfa \
    10196        concurrency/kernel/fwd.hfa
    10297
  • libcfa/src/bits/locks.hfa

    r342af53 r4468a70  
    283283                void ^?{}(future_t &) {}
    284284
    285                 void reset(future_t & this) {
    286                         // needs to be in 0p or 1p
    287                         __atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);
    288                 }
    289 
    290285                // check if the future is available
    291286                bool available( future_t & this ) {
     
    345340
    346341                // Mark the future as abandoned, meaning it will be deleted by the server
    347                 bool abandon( future_t & this ) {
    348                         /* paranoid */ verify( this.ptr != 3p );
    349 
    350                         // Mark the future as abandonned
     342                void abandon( future_t & this ) {
    351343                        struct oneshot * got = __atomic_exchange_n( &this.ptr, 3p, __ATOMIC_SEQ_CST);
    352 
    353                         // If the future isn't already fulfilled, let the server delete it
    354                         if( got == 0p ) return false;
    355344
    356345                        // got == 2p: the future is ready but the context hasn't fully been consumed
     
    358347                        if( got == 2p ) {
    359348                                while( this.ptr != 1p ) Pause();
    360                                 got = 1p;
    361                         }
    362 
    363                         // The future is completed delete it now
    364                         /* paranoid */ verify( this.ptr != 1p );
    365                         free( &this );
    366                         return true;
     349                        }
     350                        return;
    367351                }
    368352
  • libcfa/src/concurrency/io.cfa

    r342af53 r4468a70  
    3131
    3232        extern "C" {
     33                #include <sys/epoll.h>
    3334                #include <sys/syscall.h>
    3435
     
    4041        #include "kernel/fwd.hfa"
    4142        #include "io/types.hfa"
    42 
    43         static const char * opcodes[] = {
    44                 "OP_NOP",
    45                 "OP_READV",
    46                 "OP_WRITEV",
    47                 "OP_FSYNC",
    48                 "OP_READ_FIXED",
    49                 "OP_WRITE_FIXED",
    50                 "OP_POLL_ADD",
    51                 "OP_POLL_REMOVE",
    52                 "OP_SYNC_FILE_RANGE",
    53                 "OP_SENDMSG",
    54                 "OP_RECVMSG",
    55                 "OP_TIMEOUT",
    56                 "OP_TIMEOUT_REMOVE",
    57                 "OP_ACCEPT",
    58                 "OP_ASYNC_CANCEL",
    59                 "OP_LINK_TIMEOUT",
    60                 "OP_CONNECT",
    61                 "OP_FALLOCATE",
    62                 "OP_OPENAT",
    63                 "OP_CLOSE",
    64                 "OP_FILES_UPDATE",
    65                 "OP_STATX",
    66                 "OP_READ",
    67                 "OP_WRITE",
    68                 "OP_FADVISE",
    69                 "OP_MADVISE",
    70                 "OP_SEND",
    71                 "OP_RECV",
    72                 "OP_OPENAT2",
    73                 "OP_EPOLL_CTL",
    74                 "OP_SPLICE",
    75                 "OP_PROVIDE_BUFFERS",
    76                 "OP_REMOVE_BUFFERS",
    77                 "OP_TEE",
    78                 "INVALID_OP"
    79         };
    8043
    8144        // returns true of acquired as leader or second leader
     
    171134                int ret = 0;
    172135                if( need_sys_to_submit || need_sys_to_complete ) {
    173                         __cfadbg_print_safe(io_core, "Kernel I/O : IO_URING enter %d %u %u\n", ring.fd, to_submit, flags);
    174136                        ret = syscall( __NR_io_uring_enter, ring.fd, to_submit, 0, flags, (sigset_t *)0p, _NSIG / 8);
    175137                        if( ret < 0 ) {
     
    195157        static unsigned __collect_submitions( struct __io_data & ring );
    196158        static __u32 __release_consumed_submission( struct __io_data & ring );
    197         static inline void __clean( volatile struct io_uring_sqe * sqe );
     159
     160        static inline void process(struct io_uring_cqe & cqe ) {
     161                struct io_future_t * future = (struct io_future_t *)(uintptr_t)cqe.user_data;
     162                __cfadbg_print_safe( io, "Kernel I/O : Syscall completed : cqe %p, result %d for %p\n", &cqe, cqe.res, future );
     163
     164                fulfil( *future, cqe.res );
     165        }
    198166
    199167        // Process a single completion message from the io_uring
    200168        // This is NOT thread-safe
    201         static inline void process( volatile struct io_uring_cqe & cqe ) {
    202                 struct io_future_t * future = (struct io_future_t *)(uintptr_t)cqe.user_data;
    203                 __cfadbg_print_safe( io, "Kernel I/O : Syscall completed : cqe %p, result %d for %p\n", &cqe, cqe.res, future );
    204 
    205                 fulfil( *future, cqe.res );
    206         }
    207 
    208169        static [int, bool] __drain_io( & struct __io_data ring ) {
    209170                /* paranoid */ verify( ! __preemption_enabled() );
     
    231192                }
    232193
    233                 __atomic_thread_fence( __ATOMIC_SEQ_CST );
    234 
    235194                // Release the consumed SQEs
    236195                __release_consumed_submission( ring );
     
    250209                for(i; count) {
    251210                        unsigned idx = (head + i) & mask;
    252                         volatile struct io_uring_cqe & cqe = ring.completion_q.cqes[idx];
     211                        struct io_uring_cqe & cqe = ring.completion_q.cqes[idx];
    253212
    254213                        /* paranoid */ verify(&cqe);
     
    259218                // Mark to the kernel that the cqe has been seen
    260219                // Ensure that the kernel only sees the new value of the head index after the CQEs have been read.
    261                 __atomic_fetch_add( ring.completion_q.head, count, __ATOMIC_SEQ_CST );
     220                __atomic_thread_fence( __ATOMIC_SEQ_CST );
     221                __atomic_fetch_add( ring.completion_q.head, count, __ATOMIC_RELAXED );
    262222
    263223                return [count, count > 0 || to_submit > 0];
     
    265225
    266226        void main( $io_ctx_thread & this ) {
    267                 __ioctx_register( this );
    268 
    269                 __cfadbg_print_safe(io_core, "Kernel I/O : IO poller %d (%p) ready\n", this.ring->fd, &this);
    270 
    271                 const int reset_cnt = 5;
    272                 int reset = reset_cnt;
     227                epoll_event ev;
     228                __ioctx_register( this, ev );
     229
     230                __cfadbg_print_safe(io_core, "Kernel I/O : IO poller %p for ring %p ready\n", &this, &this.ring);
     231
     232                int reset = 0;
    273233                // Then loop until we need to start
    274                 LOOP:
    275234                while(!__atomic_load_n(&this.done, __ATOMIC_SEQ_CST)) {
    276235                        // Drain the io
     
    280239                                [count, again] = __drain_io( *this.ring );
    281240
    282                                 if(!again) reset--;
     241                                if(!again) reset++;
    283242
    284243                                // Update statistics
     
    290249
    291250                        // If we got something, just yield and check again
    292                         if(reset > 1) {
     251                        if(reset < 5) {
    293252                                yield();
    294                                 continue LOOP;
    295                         }
    296 
    297                         // We alread failed to find completed entries a few time.
    298                         if(reset == 1) {
    299                                 // Rearm the context so it can block
    300                                 // but don't block right away
    301                                 // we need to retry one last time in case
    302                                 // something completed *just now*
    303                                 __ioctx_prepare_block( this );
    304                                 continue LOOP;
    305                         }
    306 
     253                        }
     254                        // We didn't get anything baton pass to the slow poller
     255                        else {
    307256                                __STATS__( false,
    308257                                        io.complete_q.blocks += 1;
    309258                                )
    310                                 __cfadbg_print_safe(io_core, "Kernel I/O : Parking io poller %d (%p)\n", this.ring->fd, &this);
     259                                __cfadbg_print_safe(io_core, "Kernel I/O : Parking io poller %p\n", &this.self);
     260                                reset = 0;
    311261
    312262                                // block this thread
     263                                __ioctx_prepare_block( this, ev );
    313264                                wait( this.sem );
    314 
    315                         // restore counter
    316                         reset = reset_cnt;
    317                 }
    318 
    319                 __cfadbg_print_safe(io_core, "Kernel I/O : Fast poller %d (%p) stopping\n", this.ring->fd, &this);
     265                        }
     266                }
     267
     268                __cfadbg_print_safe(io_core, "Kernel I/O : Fast poller for ring %p stopping\n", &this.ring);
    320269        }
    321270
     
    340289//
    341290
    342         // Allocate an submit queue entry.
    343         // The kernel cannot see these entries until they are submitted, but other threads must be
    344         // able to see which entries can be used and which are already un used by an other thread
    345         // for convenience, return both the index and the pointer to the sqe
    346         // sqe == &sqes[idx]
    347         [* volatile struct io_uring_sqe, __u32] __submit_alloc( struct __io_data & ring, __u64 data ) {
     291        [* struct io_uring_sqe, __u32] __submit_alloc( struct __io_data & ring, __u64 data ) {
    348292                /* paranoid */ verify( data != 0 );
    349293
     
    360304                        // Look through the list starting at some offset
    361305                        for(i; cnt) {
    362                                 __u64 expected = 3;
    363                                 __u32 idx = (i + off) & mask; // Get an index from a random
    364                                 volatile struct io_uring_sqe * sqe = &ring.submit_q.sqes[idx];
     306                                __u64 expected = 0;
     307                                __u32 idx = (i + off) & mask;
     308                                struct io_uring_sqe * sqe = &ring.submit_q.sqes[idx];
    365309                                volatile __u64 * udata = &sqe->user_data;
    366310
    367                                 // Allocate the entry by CASing the user_data field from 0 to the future address
    368311                                if( *udata == expected &&
    369312                                        __atomic_compare_exchange_n( udata, &expected, data, true, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED ) )
     
    376319                                        )
    377320
    378                                         // debug log
    379                                         __cfadbg_print_safe( io, "Kernel I/O : allocated [%p, %u] for %p (%p)\n", sqe, idx, active_thread(), (void*)data );
    380321
    381322                                        // Success return the data
     
    384325                                verify(expected != data);
    385326
    386                                 // This one was used
    387327                                len ++;
    388328                        }
    389329
    390330                        block++;
    391 
    392                         abort( "Kernel I/O : all submit queue entries used, yielding\n" );
    393 
    394331                        yield();
    395332                }
     
    440377        void __submit( struct io_context * ctx, __u32 idx ) __attribute__((nonnull (1))) {
    441378                __io_data & ring = *ctx->thrd.ring;
    442 
    443                 {
    444                         __attribute__((unused)) volatile struct io_uring_sqe * sqe = &ring.submit_q.sqes[idx];
    445                         __cfadbg_print_safe( io,
    446                                 "Kernel I/O : submitting %u (%p) for %p\n"
    447                                 "    data: %p\n"
    448                                 "    opcode: %s\n"
    449                                 "    fd: %d\n"
    450                                 "    flags: %d\n"
    451                                 "    prio: %d\n"
    452                                 "    off: %p\n"
    453                                 "    addr: %p\n"
    454                                 "    len: %d\n"
    455                                 "    other flags: %d\n"
    456                                 "    splice fd: %d\n"
    457                                 "    pad[0]: %llu\n"
    458                                 "    pad[1]: %llu\n"
    459                                 "    pad[2]: %llu\n",
    460                                 idx, sqe,
    461                                 active_thread(),
    462                                 (void*)sqe->user_data,
    463                                 opcodes[sqe->opcode],
    464                                 sqe->fd,
    465                                 sqe->flags,
    466                                 sqe->ioprio,
    467                                 sqe->off,
    468                                 sqe->addr,
    469                                 sqe->len,
    470                                 sqe->accept_flags,
    471                                 sqe->splice_fd_in,
    472                                 sqe->__pad2[0],
    473                                 sqe->__pad2[1],
    474                                 sqe->__pad2[2]
    475                         );
    476                 }
    477 
    478 
    479379                // Get now the data we definetely need
    480380                volatile __u32 * const tail = ring.submit_q.tail;
     
    543443                                unlock(ring.submit_q.submit_lock);
    544444                        #endif
    545                         if( ret < 0 ) {
    546                                 return;
    547                         }
     445                        if( ret < 0 ) return;
    548446
    549447                        // Release the consumed SQEs
     
    556454                                io.submit_q.submit_avg.cnt += 1;
    557455                        )
    558 
    559                         __cfadbg_print_safe( io, "Kernel I/O : submitted %u (among %u) for %p\n", idx, ret, active_thread() );
    560                 }
    561                 else
    562                 {
     456                }
     457                else {
    563458                        // get mutual exclusion
    564459                        #if defined(LEADER_LOCK)
     
    568463                        #endif
    569464
    570                         /* paranoid */ verifyf( ring.submit_q.sqes[ idx ].user_data != 3ul64,
     465                        /* paranoid */ verifyf( ring.submit_q.sqes[ idx ].user_data != 0,
    571466                        /* paranoid */  "index %u already reclaimed\n"
    572467                        /* paranoid */  "head %u, prev %u, tail %u\n"
     
    595490                        }
    596491
    597                         /* paranoid */ verify(ret == 1);
    598 
    599492                        // update statistics
    600493                        __STATS__( false,
     
    603496                        )
    604497
    605                         {
    606                                 __attribute__((unused)) volatile __u32 * const head = ring.submit_q.head;
    607                                 __attribute__((unused)) __u32 last_idx = ring.submit_q.array[ ((*head) - 1) & mask ];
    608                                 __attribute__((unused)) volatile struct io_uring_sqe * sqe = &ring.submit_q.sqes[last_idx];
    609 
    610                                 __cfadbg_print_safe( io,
    611                                         "Kernel I/O : last submitted is %u (%p)\n"
    612                                         "    data: %p\n"
    613                                         "    opcode: %s\n"
    614                                         "    fd: %d\n"
    615                                         "    flags: %d\n"
    616                                         "    prio: %d\n"
    617                                         "    off: %p\n"
    618                                         "    addr: %p\n"
    619                                         "    len: %d\n"
    620                                         "    other flags: %d\n"
    621                                         "    splice fd: %d\n"
    622                                         "    pad[0]: %llu\n"
    623                                         "    pad[1]: %llu\n"
    624                                         "    pad[2]: %llu\n",
    625                                         last_idx, sqe,
    626                                         (void*)sqe->user_data,
    627                                         opcodes[sqe->opcode],
    628                                         sqe->fd,
    629                                         sqe->flags,
    630                                         sqe->ioprio,
    631                                         sqe->off,
    632                                         sqe->addr,
    633                                         sqe->len,
    634                                         sqe->accept_flags,
    635                                         sqe->splice_fd_in,
    636                                         sqe->__pad2[0],
    637                                         sqe->__pad2[1],
    638                                         sqe->__pad2[2]
    639                                 );
    640                         }
    641 
    642                         __atomic_thread_fence( __ATOMIC_SEQ_CST );
    643498                        // Release the consumed SQEs
    644499                        __release_consumed_submission( ring );
    645                         // ring.submit_q.sqes[idx].user_data = 3ul64;
    646500
    647501                        #if defined(LEADER_LOCK)
     
    651505                        #endif
    652506
    653                         __cfadbg_print_safe( io, "Kernel I/O : submitted %u for %p\n", idx, active_thread() );
     507                        __cfadbg_print_safe( io, "Kernel I/O : Performed io_submit for %p, returned %d\n", active_thread(), ret );
    654508                }
    655509        }
    656510
    657511        // #define PARTIAL_SUBMIT 32
    658 
    659         // go through the list of submissions in the ready array and moved them into
    660         // the ring's submit queue
    661512        static unsigned __collect_submitions( struct __io_data & ring ) {
    662513                /* paranoid */ verify( ring.submit_q.ready != 0p );
     
    699550        }
    700551
    701         // Go through the ring's submit queue and release everything that has already been consumed
    702         // by io_uring
    703552        static __u32 __release_consumed_submission( struct __io_data & ring ) {
    704553                const __u32 smask = *ring.submit_q.mask;
    705554
    706                 // We need to get the lock to copy the old head and new head
    707555                if( !try_lock(ring.submit_q.release_lock __cfaabi_dbg_ctx2) ) return 0;
    708                 __attribute__((unused))
    709                 __u32 ctail = *ring.submit_q.tail;        // get the current tail of the queue
    710                 __u32 chead = *ring.submit_q.head;              // get the current head of the queue
    711                 __u32 phead = ring.submit_q.prev_head;  // get the head the last time we were here
    712                 ring.submit_q.prev_head = chead;                // note up to were we processed
     556                __u32 chead = *ring.submit_q.head;
     557                __u32 phead = ring.submit_q.prev_head;
     558                ring.submit_q.prev_head = chead;
    713559                unlock(ring.submit_q.release_lock);
    714560
    715                 // the 3 fields are organized like this diagram
    716                 // except it's are ring
    717                 // ---+--------+--------+----
    718                 // ---+--------+--------+----
    719                 //    ^        ^        ^
    720                 // phead    chead    ctail
    721 
    722                 // make sure ctail doesn't wrap around and reach phead
    723                 /* paranoid */ verify(
    724                            (ctail >= chead && chead >= phead)
    725                         || (chead >= phead && phead >= ctail)
    726                         || (phead >= ctail && ctail >= chead)
    727                 );
    728 
    729                 // find the range we need to clear
    730561                __u32 count = chead - phead;
    731 
    732                 // We acquired an previous-head/current-head range
    733                 // go through the range and release the sqes
    734562                for( i; count ) {
    735563                        __u32 idx = ring.submit_q.array[ (phead + i) & smask ];
    736 
    737                         /* paranoid */ verify( 0 != ring.submit_q.sqes[ idx ].user_data );
    738                         __clean( &ring.submit_q.sqes[ idx ] );
     564                        ring.submit_q.sqes[ idx ].user_data = 0;
    739565                }
    740566                return count;
    741567        }
    742 
    743         void __sqe_clean( volatile struct io_uring_sqe * sqe ) {
    744                 __clean( sqe );
    745         }
    746 
    747         static inline void __clean( volatile struct io_uring_sqe * sqe ) {
    748                 // If we are in debug mode, thrash the fields to make sure we catch reclamation errors
    749                 __cfaabi_dbg_debug_do(
    750                         memset(sqe, 0xde, sizeof(*sqe));
    751                         sqe->opcode = (sizeof(opcodes) / sizeof(const char *)) - 1;
    752                 );
    753 
    754                 // Mark the entry as unused
    755                 __atomic_store_n(&sqe->user_data, 3ul64, __ATOMIC_SEQ_CST);
    756         }
    757568#endif
  • libcfa/src/concurrency/io/call.cfa.in

    r342af53 r4468a70  
    7474        ;
    7575
    76         extern [* volatile struct io_uring_sqe, __u32] __submit_alloc( struct __io_data & ring, __u64 data );
     76        extern [* struct io_uring_sqe, __u32] __submit_alloc( struct __io_data & ring, __u64 data );
    7777        extern void __submit( struct io_context * ctx, __u32 idx ) __attribute__((nonnull (1)));
    7878
     
    222222                __u32 idx;
    223223                struct io_uring_sqe * sqe;
    224                 [(volatile struct io_uring_sqe *) sqe, idx] = __submit_alloc( ring, (__u64)(uintptr_t)&future );
    225 
     224                [sqe, idx] = __submit_alloc( ring, (__u64)(uintptr_t)&future );
     225
     226                sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
    226227                sqe->opcode = IORING_OP_{op};
    227                 sqe->flags = sflags;
    228                 sqe->ioprio = 0;
    229                 sqe->fd = 0;
    230                 sqe->off = 0;
    231                 sqe->addr = 0;
    232                 sqe->len = 0;
    233                 sqe->fsync_flags = 0;
    234                 sqe->__pad2[0] = 0;
    235                 sqe->__pad2[1] = 0;
    236                 sqe->__pad2[2] = 0;{body}
    237 
    238                 asm volatile("": : :"memory");
     228                sqe->flags = sflags;{body}
    239229
    240230                verify( sqe->user_data == (__u64)(uintptr_t)&future );
     
    322312        }),
    323313        # CFA_HAVE_IORING_OP_ACCEPT
    324         Call('ACCEPT', 'int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags)', {
    325                 'fd': 'sockfd',
    326                 'addr': '(__u64)addr',
    327                 'addr2': '(__u64)addrlen',
     314        Call('ACCEPT4', 'int accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags)', {
     315                'fd': 'sockfd',
     316                'addr': 'addr',
     317                'addr2': 'addrlen',
    328318                'accept_flags': 'flags'
    329319        }),
     
    474464
    475465print("""
    476 //-----------------------------------------------------------------------------
    477 bool cancel(io_cancellation & this) {
    478         #if !defined(CFA_HAVE_LINUX_IO_URING_H) || !defined(CFA_HAVE_IORING_OP_ASYNC_CANCEL)
    479                 return false;
    480         #else
    481                 io_future_t future;
    482 
    483                 io_context * context = __get_io_context();
    484 
    485                 __u8 sflags = 0;
    486                 struct __io_data & ring = *context->thrd.ring;
    487 
    488                 __u32 idx;
    489                 volatile struct io_uring_sqe * sqe;
    490                 [sqe, idx] = __submit_alloc( ring, (__u64)(uintptr_t)&future );
    491 
    492                 sqe->__pad2[0] = sqe->__pad2[1] = sqe->__pad2[2] = 0;
    493                 sqe->opcode = IORING_OP_ASYNC_CANCEL;
    494                 sqe->flags = sflags;
    495                 sqe->addr = this.target;
    496 
    497                 verify( sqe->user_data == (__u64)(uintptr_t)&future );
    498                 __submit( context, idx );
    499 
    500                 wait(future);
    501 
    502                 if( future.result == 0 ) return true; // Entry found
    503                 if( future.result == -EALREADY) return true; // Entry found but in progress
    504                 if( future.result == -ENOENT ) return false; // Entry not found
    505                 return false;
    506         #endif
    507 }
    508 
    509466//-----------------------------------------------------------------------------
    510467// Check if a function is has asynchronous
  • libcfa/src/concurrency/io/setup.cfa

    r342af53 r4468a70  
    5252                #include <pthread.h>
    5353                #include <sys/epoll.h>
    54                 #include <sys/eventfd.h>
    5554                #include <sys/mman.h>
    5655                #include <sys/syscall.h>
     
    170169                // Main loop
    171170                while( iopoll.run ) {
    172                         __cfadbg_print_safe(io_core, "Kernel I/O - epoll : waiting on io_uring contexts\n");
    173 
    174171                        // Wait for events
    175172                        int nfds = epoll_pwait( iopoll.epollfd, events, 10, -1, &mask );
    176 
    177                         __cfadbg_print_safe(io_core, "Kernel I/O - epoll : %d io contexts events, waking up\n", nfds);
    178173
    179174                        // Check if an error occured
     
    186181                                $io_ctx_thread * io_ctx = ($io_ctx_thread *)(uintptr_t)events[i].data.u64;
    187182                                /* paranoid */ verify( io_ctx );
    188                                 __cfadbg_print_safe(io_core, "Kernel I/O - epoll : Unparking io poller %d (%p)\n", io_ctx->ring->fd, io_ctx);
     183                                __cfadbg_print_safe(io_core, "Kernel I/O : Unparking io poller %p\n", io_ctx);
    189184                                #if !defined( __CFA_NO_STATISTICS__ )
    190185                                        __cfaabi_tls.this_stats = io_ctx->self.curr_cluster->stats;
    191186                                #endif
    192 
    193                                 eventfd_t v;
    194                                 eventfd_read(io_ctx->ring->efd, &v);
    195 
    196187                                post( io_ctx->sem );
    197188                        }
     
    242233                $thread & thrd = this.thrd.self;
    243234                if( cluster_context ) {
    244                         // We are about to do weird things with the threads
    245                         // we don't need interrupts to complicate everything
    246                         disable_interrupts();
    247 
    248                         // Get cluster info
    249235                        cluster & cltr = *thrd.curr_cluster;
    250236                        /* paranoid */ verify( cltr.idles.total == 0 || &cltr == mainCluster );
     
    253239                        // We need to adjust the clean-up based on where the thread is
    254240                        if( thrd.state == Ready || thrd.preempted != __NO_PREEMPTION ) {
    255                                 // This is the tricky case
    256                                 // The thread was preempted or ready to run and now it is on the ready queue
    257                                 // but the cluster is shutting down, so there aren't any processors to run the ready queue
    258                                 // the solution is to steal the thread from the ready-queue and pretend it was blocked all along
    259241
    260242                                ready_schedule_lock();
    261                                         // The thread should on the list
     243
     244                                        // This is the tricky case
     245                                        // The thread was preempted and now it is on the ready queue
     246                                        // The thread should be the last on the list
    262247                                        /* paranoid */ verify( thrd.link.next != 0p );
    263248
    264249                                        // Remove the thread from the ready queue of this cluster
    265                                         // The thread should be the last on the list
    266250                                        __attribute__((unused)) bool removed = remove_head( &cltr, &thrd );
    267251                                        /* paranoid */ verify( removed );
     
    279263                        }
    280264                        // !!! This is not an else if !!!
    281                         // Ok, now the thread is blocked (whether we cheated to get here or not)
    282265                        if( thrd.state == Blocked ) {
     266
    283267                                // This is the "easy case"
    284268                                // The thread is parked and can easily be moved to active cluster
     
    290274                        }
    291275                        else {
     276
    292277                                // The thread is in a weird state
    293278                                // I don't know what to do here
    294279                                abort("io_context poller thread is in unexpected state, cannot clean-up correctly\n");
    295280                        }
    296 
    297                         // The weird thread kidnapping stuff is over, restore interrupts.
    298                         enable_interrupts( __cfaabi_dbg_ctx );
    299281                } else {
    300282                        post( this.thrd.sem );
     
    383365                }
    384366
    385                 // Step 3 : Initialize the data structure
    386367                // Get the pointers from the kernel to fill the structure
    387368                // submit queue
     
    398379                        const __u32 num = *sq.num;
    399380                        for( i; num ) {
    400                                 __sqe_clean( &sq.sqes[i] );
     381                                sq.sqes[i].user_data = 0ul64;
    401382                        }
    402383                }
     
    428409                cq.cqes = (struct io_uring_cqe *)(((intptr_t)cq.ring_ptr) + params.cq_off.cqes);
    429410
    430                 // Step 4 : eventfd
    431                 int efd;
    432                 for() {
    433                         efd = eventfd(0, 0);
    434                         if (efd < 0) {
    435                                 if (errno == EINTR) continue;
    436                                 abort("KERNEL ERROR: IO_URING EVENTFD - %s\n", strerror(errno));
    437                         }
    438                         break;
    439                 }
    440 
    441                 int ret;
    442                 for() {
    443                         ret = syscall( __NR_io_uring_register, fd, IORING_REGISTER_EVENTFD, &efd, 1);
    444                         if (ret < 0) {
    445                                 if (errno == EINTR) continue;
    446                                 abort("KERNEL ERROR: IO_URING EVENTFD REGISTER - %s\n", strerror(errno));
    447                         }
    448                         break;
    449                 }
    450 
    451411                // some paranoid checks
    452412                /* paranoid */ verifyf( (*cq.mask) == ((*cq.num) - 1ul32), "IO_URING Expected mask to be %u (%u entries), was %u", (*cq.num) - 1ul32, *cq.num, *cq.mask  );
     
    463423                this.ring_flags = params.flags;
    464424                this.fd         = fd;
    465                 this.efd        = efd;
    466425                this.eager_submits  = params_in.eager_submits;
    467426                this.poller_submits = params_in.poller_submits;
     
    486445                // close the file descriptor
    487446                close(this.fd);
    488                 close(this.efd);
    489447
    490448                free( this.submit_q.ready ); // Maybe null, doesn't matter
     
    494452// I/O Context Sleep
    495453//=============================================================================================
    496         #define IOEVENTS EPOLLIN | EPOLLONESHOT
    497 
    498         static inline void __ioctx_epoll_ctl($io_ctx_thread & ctx, int op, const char * error) {
    499                 struct epoll_event ev;
    500                 ev.events = IOEVENTS;
     454
     455        void __ioctx_register($io_ctx_thread & ctx, struct epoll_event & ev) {
     456                ev.events = EPOLLIN | EPOLLONESHOT;
    501457                ev.data.u64 = (__u64)&ctx;
    502                 int ret = epoll_ctl(iopoll.epollfd, op, ctx.ring->efd, &ev);
     458                int ret = epoll_ctl(iopoll.epollfd, EPOLL_CTL_ADD, ctx.ring->fd, &ev);
    503459                if (ret < 0) {
    504                         abort( "KERNEL ERROR: EPOLL %s - (%d) %s\n", error, (int)errno, strerror(errno) );
    505                 }
    506         }
    507 
    508         void __ioctx_register($io_ctx_thread & ctx) {
    509                 __ioctx_epoll_ctl(ctx, EPOLL_CTL_ADD, "ADD");
    510         }
    511 
    512         void __ioctx_prepare_block($io_ctx_thread & ctx) {
    513                 __cfadbg_print_safe(io_core, "Kernel I/O - epoll : Re-arming io poller %d (%p)\n", ctx.ring->fd, &ctx);
    514                 __ioctx_epoll_ctl(ctx, EPOLL_CTL_MOD, "REARM");
     460                        abort( "KERNEL ERROR: EPOLL ADD - (%d) %s\n", (int)errno, strerror(errno) );
     461                }
     462        }
     463
     464        void __ioctx_prepare_block($io_ctx_thread & ctx, struct epoll_event & ev) {
     465                int ret = epoll_ctl(iopoll.epollfd, EPOLL_CTL_MOD, ctx.ring->fd, &ev);
     466                if (ret < 0) {
     467                        abort( "KERNEL ERROR: EPOLL REARM - (%d) %s\n", (int)errno, strerror(errno) );
     468                }
    515469        }
    516470
  • libcfa/src/concurrency/io/types.hfa

    r342af53 r4468a70  
    6565
    6666                // A buffer of sqes (not the actual ring)
    67                 volatile struct io_uring_sqe * sqes;
     67                struct io_uring_sqe * sqes;
    6868
    6969                // The location and size of the mmaped area
     
    8585
    8686                // the kernel ring
    87                 volatile struct io_uring_cqe * cqes;
     87                struct io_uring_cqe * cqes;
    8888
    8989                // The location and size of the mmaped area
     
    9797                __u32 ring_flags;
    9898                int fd;
    99                 int efd;
    10099                bool eager_submits:1;
    101100                bool poller_submits:1;
     
    131130        #endif
    132131
     132        struct epoll_event;
    133133        struct $io_ctx_thread;
    134         void __ioctx_register($io_ctx_thread & ctx);
    135         void __ioctx_prepare_block($io_ctx_thread & ctx);
    136         void __sqe_clean( volatile struct io_uring_sqe * sqe );
     134        void __ioctx_register($io_ctx_thread & ctx, struct epoll_event & ev);
     135        void __ioctx_prepare_block($io_ctx_thread & ctx, struct epoll_event & ev);
    137136#endif
    138137
  • libcfa/src/concurrency/monitor.hfa

    r342af53 r4468a70  
    132132
    133133              void wait        ( condition & this, uintptr_t user_info = 0 );
    134 static inline bool is_empty    ( condition & this ) { return this.blocked.head == 1p; }
    135134              bool signal      ( condition & this );
    136135              bool signal_block( condition & this );
    137 static inline bool signal_all  ( condition & this ) { bool ret = false; while(!is_empty(this)) { ret = signal(this) || ret; } return ret; }
     136static inline bool is_empty    ( condition & this ) { return this.blocked.head == 1p; }
    138137         uintptr_t front       ( condition & this );
    139138
  • libcfa/src/heap.cfa

    r342af53 r4468a70  
    1010// Created On       : Tue Dec 19 21:58:35 2017
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Sun Jan 10 11:20:49 2021
    13 // Update Count     : 1031
     12// Last Modified On : Wed Dec 16 12:28:25 2020
     13// Update Count     : 1023
    1414//
    1515
     
    262262#ifdef __STATISTICS__
    263263// Heap statistics counters.
    264 static unsigned int malloc_zero_calls, malloc_calls;
     264static unsigned int malloc_calls;
    265265static unsigned long long int malloc_storage;
    266 static unsigned int aalloc_zero_calls, aalloc_calls;
     266static unsigned int aalloc_calls;
    267267static unsigned long long int aalloc_storage;
    268 static unsigned int calloc_zero_calls, calloc_calls;
     268static unsigned int calloc_calls;
    269269static unsigned long long int calloc_storage;
    270 static unsigned int memalign_zero_calls, memalign_calls;
     270static unsigned int memalign_calls;
    271271static unsigned long long int memalign_storage;
    272 static unsigned int amemalign_zero_calls, amemalign_calls;
     272static unsigned int amemalign_calls;
    273273static unsigned long long int amemalign_storage;
    274 static unsigned int cmemalign_zero_calls, cmemalign_calls;
     274static unsigned int cmemalign_calls;
    275275static unsigned long long int cmemalign_storage;
    276 static unsigned int resize_zero_calls, resize_calls;
     276static unsigned int resize_calls;
    277277static unsigned long long int resize_storage;
    278 static unsigned int realloc_zero_calls, realloc_calls;
     278static unsigned int realloc_calls;
    279279static unsigned long long int realloc_storage;
    280 static unsigned int free_zero_calls, free_calls;
     280static unsigned int free_calls;
    281281static unsigned long long int free_storage;
    282282static unsigned int mmap_calls;
     
    287287static unsigned long long int sbrk_storage;
    288288// Statistics file descriptor (changed by malloc_stats_fd).
    289 static int stats_fd = STDERR_FILENO;                                    // default stderr
     289static int stat_fd = STDERR_FILENO;                                             // default stderr
    290290
    291291// Use "write" because streams may be shutdown when calls are made.
     
    293293        char helpText[1024];
    294294        __cfaabi_bits_print_buffer( STDERR_FILENO, helpText, sizeof(helpText),
    295                                                                 "\nHeap statistics:\n"
    296                                                                 "  malloc    0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
    297                                                                 "  aalloc    0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
    298                                                                 "  calloc    0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
    299                                                                 "  memalign  0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
    300                                                                 "  amemalign 0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
    301                                                                 "  cmemalign 0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
    302                                                                 "  resize    0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
    303                                                                 "  realloc   0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
    304                                                                 "  free      0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
    305                                                                 "  mmap      calls %'u; storage %'llu bytes\n"
    306                                                                 "  munmap    calls %'u; storage %'llu bytes\n"
    307                                                                 "  sbrk      calls %'u; storage %'llu bytes\n",
    308                                                                 malloc_zero_calls, malloc_calls, malloc_storage,
    309                                                                 aalloc_zero_calls, aalloc_calls, aalloc_storage,
    310                                                                 calloc_zero_calls, calloc_calls, calloc_storage,
    311                                                                 memalign_zero_calls, memalign_calls, memalign_storage,
    312                                                                 amemalign_zero_calls, amemalign_calls, amemalign_storage,
    313                                                                 cmemalign_zero_calls, cmemalign_calls, cmemalign_storage,
    314                                                                 resize_zero_calls, resize_calls, resize_storage,
    315                                                                 realloc_zero_calls, realloc_calls, realloc_storage,
    316                                                                 free_zero_calls, free_calls, free_storage,
    317                                                                 mmap_calls, mmap_storage,
    318                                                                 munmap_calls, munmap_storage,
    319                                                                 sbrk_calls, sbrk_storage
     295                                                                        "\nHeap statistics:\n"
     296                                                                        "  malloc: calls %u / storage %llu\n"
     297                                                                        "  aalloc: calls %u / storage %llu\n"
     298                                                                        "  calloc: calls %u / storage %llu\n"
     299                                                                        "  memalign: calls %u / storage %llu\n"
     300                                                                        "  amemalign: calls %u / storage %llu\n"
     301                                                                        "  cmemalign: calls %u / storage %llu\n"
     302                                                                        "  resize: calls %u / storage %llu\n"
     303                                                                        "  realloc: calls %u / storage %llu\n"
     304                                                                        "  free: calls %u / storage %llu\n"
     305                                                                        "  mmap: calls %u / storage %llu\n"
     306                                                                        "  munmap: calls %u / storage %llu\n"
     307                                                                        "  sbrk: calls %u / storage %llu\n",
     308                                                                        malloc_calls, malloc_storage,
     309                                                                        aalloc_calls, aalloc_storage,
     310                                                                        calloc_calls, calloc_storage,
     311                                                                        memalign_calls, memalign_storage,
     312                                                                        amemalign_calls, amemalign_storage,
     313                                                                        cmemalign_calls, cmemalign_storage,
     314                                                                        resize_calls, resize_storage,
     315                                                                        realloc_calls, realloc_storage,
     316                                                                        free_calls, free_storage,
     317                                                                        mmap_calls, mmap_storage,
     318                                                                        munmap_calls, munmap_storage,
     319                                                                        sbrk_calls, sbrk_storage
    320320                );
    321321} // printStats
     
    328328                                                "<sizes>\n"
    329329                                                "</sizes>\n"
    330                                                 "<total type=\"malloc\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
    331                                                 "<total type=\"aalloc\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
    332                                                 "<total type=\"calloc\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
    333                                                 "<total type=\"memalign\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
    334                                                 "<total type=\"amemalign\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
    335                                                 "<total type=\"cmemalign\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
    336                                                 "<total type=\"resize\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
    337                                                 "<total type=\"realloc\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
    338                                                 "<total type=\"free\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
    339                                                 "<total type=\"mmap\" count=\"%'u;\" size=\"%'llu\"/> bytes\n"
    340                                                 "<total type=\"munmap\" count=\"%'u;\" size=\"%'llu\"/> bytes\n"
    341                                                 "<total type=\"sbrk\" count=\"%'u;\" size=\"%'llu\"/> bytes\n"
     330                                                "<total type=\"malloc\" count=\"%u\" size=\"%llu\"/>\n"
     331                                                "<total type=\"aalloc\" count=\"%u\" size=\"%llu\"/>\n"
     332                                                "<total type=\"calloc\" count=\"%u\" size=\"%llu\"/>\n"
     333                                                "<total type=\"memalign\" count=\"%u\" size=\"%llu\"/>\n"
     334                                                "<total type=\"amemalign\" count=\"%u\" size=\"%llu\"/>\n"
     335                                                "<total type=\"cmemalign\" count=\"%u\" size=\"%llu\"/>\n"
     336                                                "<total type=\"resize\" count=\"%u\" size=\"%llu\"/>\n"
     337                                                "<total type=\"realloc\" count=\"%u\" size=\"%llu\"/>\n"
     338                                                "<total type=\"free\" count=\"%u\" size=\"%llu\"/>\n"
     339                                                "<total type=\"mmap\" count=\"%u\" size=\"%llu\"/>\n"
     340                                                "<total type=\"munmap\" count=\"%u\" size=\"%llu\"/>\n"
     341                                                "<total type=\"sbrk\" count=\"%u\" size=\"%llu\"/>\n"
    342342                                                "</malloc>",
    343                                                 malloc_zero_calls, malloc_calls, malloc_storage,
    344                                                 aalloc_zero_calls, aalloc_calls, aalloc_storage,
    345                                                 calloc_zero_calls, calloc_calls, calloc_storage,
    346                                                 memalign_zero_calls, memalign_calls, memalign_storage,
    347                                                 amemalign_zero_calls, amemalign_calls, amemalign_storage,
    348                                                 cmemalign_zero_calls, cmemalign_calls, cmemalign_storage,
    349                                                 resize_zero_calls, resize_calls, resize_storage,
    350                                                 realloc_zero_calls, realloc_calls, realloc_storage,
    351                                                 free_zero_calls, free_calls, free_storage,
     343                                                malloc_calls, malloc_storage,
     344                                                aalloc_calls, aalloc_storage,
     345                                                calloc_calls, calloc_storage,
     346                                                memalign_calls, memalign_storage,
     347                                                amemalign_calls, amemalign_storage,
     348                                                cmemalign_calls, cmemalign_storage,
     349                                                resize_calls, resize_storage,
     350                                                realloc_calls, realloc_storage,
     351                                                free_calls, free_storage,
    352352                                                mmap_calls, mmap_storage,
    353353                                                munmap_calls, munmap_storage,
     
    466466} // headers
    467467
    468 // #ifdef __CFA_DEBUG__
    469 // #if __SIZEOF_POINTER__ == 4
    470 // #define MASK 0xdeadbeef
    471 // #else
    472 // #define MASK 0xdeadbeefdeadbeef
    473 // #endif
    474 // #define STRIDE size_t
    475 
    476 // static void * Memset( void * addr, STRIDE size ) {           // debug only
    477 //      if ( size % sizeof(STRIDE) != 0 ) abort( "Memset() : internal error, size %zd not multiple of %zd.", size, sizeof(STRIDE) );
    478 //      if ( (STRIDE)addr % sizeof(STRIDE) != 0 ) abort( "Memset() : internal error, addr %p not multiple of %zd.", addr, sizeof(STRIDE) );
    479 
    480 //      STRIDE * end = (STRIDE *)addr + size / sizeof(STRIDE);
    481 //      for ( STRIDE * p = (STRIDE *)addr; p < end; p += 1 ) *p = MASK;
    482 //      return addr;
    483 // } // Memset
    484 // #endif // __CFA_DEBUG__
     468#ifdef __CFA_DEBUG__
     469#if __SIZEOF_POINTER__ == 4
     470#define MASK 0xdeadbeef
     471#else
     472#define MASK 0xdeadbeefdeadbeef
     473#endif
     474#define STRIDE size_t
     475
     476static void * Memset( void * addr, STRIDE size ) {              // debug only
     477        if ( size % sizeof(STRIDE) != 0 ) abort( "Memset() : internal error, size %zd not multiple of %zd.", size, sizeof(STRIDE) );
     478        if ( (STRIDE)addr % sizeof(STRIDE) != 0 ) abort( "Memset() : internal error, addr %p not multiple of %zd.", addr, sizeof(STRIDE) );
     479
     480        STRIDE * end = (STRIDE *)addr + size / sizeof(STRIDE);
     481        for ( STRIDE * p = (STRIDE *)addr; p < end; p += 1 ) *p = MASK;
     482        return addr;
     483} // Memset
     484#endif // __CFA_DEBUG__
    485485
    486486
     
    498498                        unlock( extlock );
    499499                        __cfaabi_bits_print_nolock( STDERR_FILENO, NO_MEMORY_MSG, size );
    500                         _exit( EXIT_FAILURE );                                          // give up
    501                 } // if
    502                 // Make storage executable for thunks.
     500                        _exit( EXIT_FAILURE );
     501                } // if
    503502                if ( mprotect( (char *)heapEnd + heapRemaining, increase, __map_prot ) ) {
    504503                        unlock( extlock );
     
    771770
    772771
     772static inline void * callocNoStats( size_t dim, size_t elemSize ) {
     773        size_t size = dim * elemSize;
     774  if ( unlikely( size ) == 0 ) return 0p;                               // 0 BYTE ALLOCATION RETURNS NULL POINTER
     775        char * addr = (char *)mallocNoStats( size );
     776
     777        HeapManager.Storage.Header * header;
     778        HeapManager.FreeHeader * freeElem;
     779        size_t bsize, alignment;
     780        #ifndef __CFA_DEBUG__
     781        bool mapped =
     782        #endif // __CFA_DEBUG__
     783                headers( "calloc", addr, header, freeElem, bsize, alignment );
     784        #ifndef __CFA_DEBUG__
     785
     786        // Mapped storage is zero filled, but in debug mode mapped memory is scrubbed in doMalloc, so it has to be reset to zero.
     787        if ( ! mapped )
     788        #endif // __CFA_DEBUG__
     789                // <-------0000000000000000000000000000UUUUUUUUUUUUUUUUUUUUUUUUU> bsize (bucket size) U => undefined
     790                // `-header`-addr                      `-size
     791                memset( addr, '\0', size );                                             // set to zeros
     792
     793        header->kind.real.blockSize |= 2;                                       // mark as zero filled
     794        return addr;
     795} // callocNoStats
     796
     797
    773798static inline void * memalignNoStats( size_t alignment, size_t size ) {
    774799  if ( unlikely( size ) == 0 ) return 0p;                               // 0 BYTE ALLOCATION RETURNS NULL POINTER
     
    809834
    810835
     836static inline void * cmemalignNoStats( size_t alignment, size_t dim, size_t elemSize ) {
     837        size_t size = dim * elemSize;
     838  if ( unlikely( size ) == 0 ) return 0p;                               // 0 BYTE ALLOCATION RETURNS NULL POINTER
     839        char * addr = (char *)memalignNoStats( alignment, size );
     840
     841        HeapManager.Storage.Header * header;
     842        HeapManager.FreeHeader * freeElem;
     843        size_t bsize;
     844        #ifndef __CFA_DEBUG__
     845        bool mapped =
     846        #endif // __CFA_DEBUG__
     847                headers( "cmemalign", addr, header, freeElem, bsize, alignment );
     848
     849        // Mapped storage is zero filled, but in debug mode mapped memory is scrubbed in doMalloc, so it has to be reset to zero.
     850        #ifndef __CFA_DEBUG__
     851        if ( ! mapped )
     852        #endif // __CFA_DEBUG__
     853                // <-------0000000000000000000000000000UUUUUUUUUUUUUUUUUUUUUUUUU> bsize (bucket size) U => undefined
     854                // `-header`-addr                      `-size
     855                memset( addr, '\0', size );                                             // set to zeros
     856
     857        header->kind.real.blockSize |= 2;                                       // mark as zero filled
     858        return addr;
     859} // cmemalignNoStats
     860
     861
    811862extern "C" {
    812863        // Allocates size bytes and returns a pointer to the allocated memory.  The contents are undefined. If size is 0,
     
    814865        void * malloc( size_t size ) {
    815866                #ifdef __STATISTICS__
    816                 if ( likely( size > 0 ) ) {
    817                         __atomic_add_fetch( &malloc_calls, 1, __ATOMIC_SEQ_CST );
    818                         __atomic_add_fetch( &malloc_storage, size, __ATOMIC_SEQ_CST );
    819                 } else {
    820                         __atomic_add_fetch( &malloc_zero_calls, 1, __ATOMIC_SEQ_CST );
    821                 } // if
     867                __atomic_add_fetch( &malloc_calls, 1, __ATOMIC_SEQ_CST );
     868                __atomic_add_fetch( &malloc_storage, size, __ATOMIC_SEQ_CST );
    822869                #endif // __STATISTICS__
    823870
     
    830877                size_t size = dim * elemSize;
    831878                #ifdef __STATISTICS__
    832                 if ( likely( size > 0 ) ) {
    833                         __atomic_add_fetch( &aalloc_calls, 1, __ATOMIC_SEQ_CST );
    834                         __atomic_add_fetch( &aalloc_storage, size, __ATOMIC_SEQ_CST );
    835                 } else {
    836                         __atomic_add_fetch( &aalloc_zero_calls, 1, __ATOMIC_SEQ_CST );
    837                 } // if
     879                __atomic_add_fetch( &aalloc_calls, 1, __ATOMIC_SEQ_CST );
     880                __atomic_add_fetch( &aalloc_storage, size, __ATOMIC_SEQ_CST );
    838881                #endif // __STATISTICS__
    839882
     
    844887        // Same as aalloc() with memory set to zero.
    845888        void * calloc( size_t dim, size_t elemSize ) {
    846                 size_t size = dim * elemSize;
    847           if ( unlikely( size ) == 0 ) {                        // 0 BYTE ALLOCATION RETURNS NULL POINTER
    848                         #ifdef __STATISTICS__
    849                         __atomic_add_fetch( &calloc_zero_calls, 1, __ATOMIC_SEQ_CST );
    850                         #endif // __STATISTICS__
    851                         return 0p;
    852                 } // if
    853889                #ifdef __STATISTICS__
    854890                __atomic_add_fetch( &calloc_calls, 1, __ATOMIC_SEQ_CST );
     
    856892                #endif // __STATISTICS__
    857893
    858                 char * addr = (char *)mallocNoStats( size );
    859 
    860                 HeapManager.Storage.Header * header;
    861                 HeapManager.FreeHeader * freeElem;
    862                 size_t bsize, alignment;
    863 
    864                 #ifndef __CFA_DEBUG__
    865                 bool mapped =
    866                         #endif // __CFA_DEBUG__
    867                         headers( "calloc", addr, header, freeElem, bsize, alignment );
    868 
    869                 #ifndef __CFA_DEBUG__
    870                 // Mapped storage is zero filled, but in debug mode mapped memory is scrubbed in doMalloc, so it has to be reset to zero.
    871                 if ( ! mapped )
    872                 #endif // __CFA_DEBUG__
    873                         // <-------0000000000000000000000000000UUUUUUUUUUUUUUUUUUUUUUUUU> bsize (bucket size) U => undefined
    874                         // `-header`-addr                      `-size
    875                         memset( addr, '\0', size );                                     // set to zeros
    876 
    877                 header->kind.real.blockSize |= 2;                               // mark as zero filled
    878                 return addr;
     894                return callocNoStats( dim, elemSize );
    879895        } // calloc
    880896
     
    885901        // call to malloc(), alloc(), calloc() or realloc(). If the area pointed to was moved, a free(oaddr) is done.
    886902        void * resize( void * oaddr, size_t size ) {
     903                #ifdef __STATISTICS__
     904                __atomic_add_fetch( &resize_calls, 1, __ATOMIC_SEQ_CST );
     905                #endif // __STATISTICS__
     906
    887907                // If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
    888           if ( unlikely( size == 0 ) ) {                                        // special cases
    889                         #ifdef __STATISTICS__
    890                         __atomic_add_fetch( &resize_zero_calls, 1, __ATOMIC_SEQ_CST );
    891                         #endif // __STATISTICS__
    892                         free( oaddr );
    893                         return 0p;
    894                 } // if
    895                 #ifdef __STATISTICS__
    896                 __atomic_add_fetch( &resize_calls, 1, __ATOMIC_SEQ_CST );
    897                 #endif // __STATISTICS__
    898 
     908          if ( unlikely( size == 0 ) ) { free( oaddr ); return 0p; } // special cases
    899909          if ( unlikely( oaddr == 0p ) ) {
    900910                        #ifdef __STATISTICS__
     
    908918                size_t bsize, oalign;
    909919                headers( "resize", oaddr, header, freeElem, bsize, oalign );
    910 
    911920                size_t odsize = dataStorage( bsize, oaddr, header ); // data storage available in bucket
     921
    912922                // same size, DO NOT preserve STICKY PROPERTIES.
    913923                if ( oalign == libAlign() && size <= odsize && odsize <= size * 2 ) { // allow 50% wasted storage for smaller size
     
    930940        // the old and new sizes.
    931941        void * realloc( void * oaddr, size_t size ) {
     942                #ifdef __STATISTICS__
     943                __atomic_add_fetch( &realloc_calls, 1, __ATOMIC_SEQ_CST );
     944                #endif // __STATISTICS__
     945
    932946                // If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
    933           if ( unlikely( size == 0 ) ) {                                        // special cases
    934                         #ifdef __STATISTICS__
    935                         __atomic_add_fetch( &realloc_zero_calls, 1, __ATOMIC_SEQ_CST );
    936                         #endif // __STATISTICS__
    937                         free( oaddr );
    938                         return 0p;
    939                 } // if
    940                 #ifdef __STATISTICS__
    941                 __atomic_add_fetch( &realloc_calls, 1, __ATOMIC_SEQ_CST );
    942                 #endif // __STATISTICS__
    943 
     947          if ( unlikely( size == 0 ) ) { free( oaddr ); return 0p; } // special cases
    944948          if ( unlikely( oaddr == 0p ) ) {
    945949                        #ifdef __STATISTICS__
     
    995999        void * memalign( size_t alignment, size_t size ) {
    9961000                #ifdef __STATISTICS__
    997                 if ( likely( size > 0 ) ) {
    998                         __atomic_add_fetch( &memalign_calls, 1, __ATOMIC_SEQ_CST );
    999                         __atomic_add_fetch( &memalign_storage, size, __ATOMIC_SEQ_CST );
    1000                 } else {
    1001                         __atomic_add_fetch( &memalign_zero_calls, 1, __ATOMIC_SEQ_CST );
    1002                 } // if
     1001                __atomic_add_fetch( &memalign_calls, 1, __ATOMIC_SEQ_CST );
     1002                __atomic_add_fetch( &memalign_storage, size, __ATOMIC_SEQ_CST );
    10031003                #endif // __STATISTICS__
    10041004
     
    10111011                size_t size = dim * elemSize;
    10121012                #ifdef __STATISTICS__
    1013                 if ( likely( size > 0 ) ) {
    1014                         __atomic_add_fetch( &cmemalign_calls, 1, __ATOMIC_SEQ_CST );
    1015                         __atomic_add_fetch( &cmemalign_storage, size, __ATOMIC_SEQ_CST );
    1016                 } else {
    1017                         __atomic_add_fetch( &cmemalign_zero_calls, 1, __ATOMIC_SEQ_CST );
    1018                 } // if
     1013                __atomic_add_fetch( &cmemalign_calls, 1, __ATOMIC_SEQ_CST );
     1014                __atomic_add_fetch( &cmemalign_storage, size, __ATOMIC_SEQ_CST );
    10191015                #endif // __STATISTICS__
    10201016
     
    10251021        // Same as calloc() with memory alignment.
    10261022        void * cmemalign( size_t alignment, size_t dim, size_t elemSize ) {
    1027                 size_t size = dim * elemSize;
    1028           if ( unlikely( size ) == 0 ) {                                        // 0 BYTE ALLOCATION RETURNS NULL POINTER
    1029                         #ifdef __STATISTICS__
    1030                         __atomic_add_fetch( &cmemalign_zero_calls, 1, __ATOMIC_SEQ_CST );
    1031                         #endif // __STATISTICS__
    1032                         return 0p;
    1033                 } // if
    10341023                #ifdef __STATISTICS__
    10351024                __atomic_add_fetch( &cmemalign_calls, 1, __ATOMIC_SEQ_CST );
     
    10371026                #endif // __STATISTICS__
    10381027
    1039                 char * addr = (char *)memalignNoStats( alignment, size );
    1040 
    1041                 HeapManager.Storage.Header * header;
    1042                 HeapManager.FreeHeader * freeElem;
    1043                 size_t bsize;
    1044 
    1045                 #ifndef __CFA_DEBUG__
    1046                 bool mapped =
    1047                         #endif // __CFA_DEBUG__
    1048                         headers( "cmemalign", addr, header, freeElem, bsize, alignment );
    1049 
    1050                 // Mapped storage is zero filled, but in debug mode mapped memory is scrubbed in doMalloc, so it has to be reset to zero.
    1051                 #ifndef __CFA_DEBUG__
    1052                 if ( ! mapped )
    1053                 #endif // __CFA_DEBUG__
    1054                         // <-------0000000000000000000000000000UUUUUUUUUUUUUUUUUUUUUUUUU> bsize (bucket size) U => undefined
    1055                         // `-header`-addr                      `-size
    1056                         memset( addr, '\0', size );                                     // set to zeros
    1057 
    1058                 header->kind.real.blockSize |= 2;                               // mark as zero filled
    1059                 return addr;
     1028                return cmemalignNoStats( alignment, dim, elemSize );
    10601029        } // cmemalign
    10611030
     
    10961065        // 0p, no operation is performed.
    10971066        void free( void * addr ) {
     1067                #ifdef __STATISTICS__
     1068                __atomic_add_fetch( &free_calls, 1, __ATOMIC_SEQ_CST );
     1069                #endif // __STATISTICS__
     1070
    10981071          if ( unlikely( addr == 0p ) ) {                                       // special case
    1099                         #ifdef __STATISTICS__
    1100                         __atomic_add_fetch( &free_zero_calls, 1, __ATOMIC_SEQ_CST );
    1101                         #endif // __STATISTICS__
    1102 
    11031072                        // #ifdef __CFA_DEBUG__
    11041073                        // if ( traceHeap() ) {
     
    12131182        int malloc_stats_fd( int fd __attribute__(( unused )) ) {
    12141183                #ifdef __STATISTICS__
    1215                 int temp = stats_fd;
    1216                 stats_fd = fd;
     1184                int temp = stat_fd;
     1185                stat_fd = fd;
    12171186                return temp;
    12181187                #else
     
    12451214        // The string is printed on the file stream stream.  The exported string includes information about all arenas (see
    12461215        // malloc).
    1247         int malloc_info( int options, FILE * stream __attribute__(( unused )) ) {
     1216        int malloc_info( int options, FILE * stream ) {
    12481217          if ( options != 0 ) { errno = EINVAL; return -1; }
    12491218                #ifdef __STATISTICS__
     
    12741243// Must have CFA linkage to overload with C linkage realloc.
    12751244void * resize( void * oaddr, size_t nalign, size_t size ) {
    1276         // If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
    1277   if ( unlikely( size == 0 ) ) {                                                // special cases
    1278                 #ifdef __STATISTICS__
    1279                 __atomic_add_fetch( &resize_zero_calls, 1, __ATOMIC_SEQ_CST );
    1280                 #endif // __STATISTICS__
    1281                 free( oaddr );
    1282                 return 0p;
    1283         } // if
     1245        #ifdef __STATISTICS__
     1246        __atomic_add_fetch( &resize_calls, 1, __ATOMIC_SEQ_CST );
     1247        #endif // __STATISTICS__
    12841248
    12851249        if ( unlikely( nalign < libAlign() ) ) nalign = libAlign(); // reset alignment to minimum
    12861250        #ifdef __CFA_DEBUG__
    1287         else checkAlign( nalign );                                                      // check alignment
     1251        else
     1252                checkAlign( nalign );                                                   // check alignment
    12881253        #endif // __CFA_DEBUG__
    12891254
     1255        // If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
     1256  if ( unlikely( size == 0 ) ) { free( oaddr ); return 0p; } // special cases
    12901257  if ( unlikely( oaddr == 0p ) ) {
    12911258                #ifdef __STATISTICS__
    1292                 __atomic_add_fetch( &resize_calls, 1, __ATOMIC_SEQ_CST );
    12931259                __atomic_add_fetch( &resize_storage, size, __ATOMIC_SEQ_CST );
    12941260                #endif // __STATISTICS__
     
    13361302
    13371303void * realloc( void * oaddr, size_t nalign, size_t size ) {
    1338         // If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
    1339   if ( unlikely( size == 0 ) ) {                                                // special cases
    1340                 #ifdef __STATISTICS__
    1341                 __atomic_add_fetch( &realloc_zero_calls, 1, __ATOMIC_SEQ_CST );
    1342                 #endif // __STATISTICS__
    1343                 free( oaddr );
    1344                 return 0p;
    1345         } // if
    1346 
    13471304        if ( unlikely( nalign < libAlign() ) ) nalign = libAlign(); // reset alignment to minimum
    13481305        #ifdef __CFA_DEBUG__
    1349         else checkAlign( nalign );                                                      // check alignment
     1306        else
     1307                checkAlign( nalign );                                                   // check alignment
    13501308        #endif // __CFA_DEBUG__
    13511309
     1310        // If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
     1311  if ( unlikely( size == 0 ) ) { free( oaddr ); return 0p; } // special cases
    13521312  if ( unlikely( oaddr == 0p ) ) {
    13531313                #ifdef __STATISTICS__
  • libcfa/src/startup.cfa

    r342af53 r4468a70  
    1010// Created On       : Tue Jul 24 16:21:57 2018
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Sat Jan  9 23:18:23 2021
    13 // Update Count     : 34
     12// Last Modified On : Tue Feb  4 13:03:18 2020
     13// Update Count     : 30
    1414//
    1515
    16 #include <time.h>                                                                               // tzset
    17 #include <locale.h>                                                                             // setlocale
    18 #include <stdlib.h>                                                                             // getenv
     16#include <time.h>                // tzset
     17#include <locale.h>        // setlocale
    1918#include "startup.hfa"
    2019
     
    2322    void __cfaabi_appready_startup( void ) {
    2423                tzset();                                                                                // initialize time global variables
    25                 setlocale( LC_NUMERIC, getenv("LANG") );
     24                setlocale(LC_NUMERIC, "");
    2625                #ifdef __CFA_DEBUG__
    2726                extern void heapAppStart();
  • src/AST/Decl.cpp

    r342af53 r4468a70  
    1010// Created On       : Thu May 9 10:00:00 2019
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Tue Jan 12 16:54:55 2021
    13 // Update Count     : 23
     12// Last Modified On : Fri Dec 13 16:23:15 2019
     13// Update Count     : 20
    1414//
    1515
     
    7878
    7979const char * TypeDecl::typeString() const {
    80         static const char * kindNames[] = { "sized data type", "sized data type", "sized object type", "sized function type", "sized tuple type", "sized array length type" };
    81         static_assert( sizeof(kindNames) / sizeof(kindNames[0]) == TypeDecl::NUMBER_OF_KINDS, "typeString: kindNames is out of sync." );
     80        static const char * kindNames[] = { "sized data type", "sized object type", "sized function type", "sized tuple type" };
     81        static_assert( sizeof(kindNames)/sizeof(kindNames[0]) == TypeDecl::NUMBER_OF_KINDS, "typeString: kindNames is out of sync." );
    8282        assertf( kind < TypeDecl::NUMBER_OF_KINDS, "TypeDecl kind is out of bounds." );
    8383        return sized ? kindNames[ kind ] : &kindNames[ kind ][ sizeof("sized") ]; // sizeof includes '\0'
     
    8585
    8686const char * TypeDecl::genTypeString() const {
    87         static const char * kindNames[] = { "T &", "T *", "T", "(*)", "T ...", "[T]" };
    88         static_assert( sizeof(kindNames) / sizeof(kindNames[0]) == TypeDecl::NUMBER_OF_KINDS, "genTypeString: kindNames is out of sync." );
     87        static const char * kindNames[] = { "dtype", "otype", "ftype", "ttype" };
     88        static_assert( sizeof(kindNames)/sizeof(kindNames[0]) == TypeDecl::NUMBER_OF_KINDS, "genTypeString: kindNames is out of sync." );
    8989        assertf( kind < TypeDecl::NUMBER_OF_KINDS, "TypeDecl kind is out of bounds." );
    9090        return kindNames[ kind ];
  • src/AST/Decl.hpp

    r342af53 r4468a70  
    1010// Created On       : Thu May 9 10:00:00 2019
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Mon Jan 11 20:48:38 2021
    13 // Update Count     : 30
     12// Last Modified On : Fri Dec 13 17:38:33 2019
     13// Update Count     : 29
    1414//
    1515
     
    175175class TypeDecl final : public NamedTypeDecl {
    176176  public:
    177         enum Kind { Dtype, DStype, Otype, Ftype, Ttype, ALtype, NUMBER_OF_KINDS };
     177        enum Kind { Dtype, Otype, Ftype, Ttype, NUMBER_OF_KINDS };
    178178
    179179        Kind kind;
  • src/Parser/DeclarationNode.cc

    r342af53 r4468a70  
    1010// Created On       : Sat May 16 12:34:05 2015
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Mon Jan 11 20:58:07 2021
    13 // Update Count     : 1137
     12// Last Modified On : Thu Oct  8 08:03:38 2020
     13// Update Count     : 1135
    1414//
    1515
     
    10751075        if ( variable.tyClass != TypeDecl::NUMBER_OF_KINDS ) {
    10761076                // otype is internally converted to dtype + otype parameters
    1077                 static const TypeDecl::Kind kindMap[] = { TypeDecl::Dtype, TypeDecl::DStype, TypeDecl::Dtype, TypeDecl::Ftype, TypeDecl::Ttype, TypeDecl::ALtype };
    1078                 static_assert( sizeof(kindMap) / sizeof(kindMap[0]) == TypeDecl::NUMBER_OF_KINDS, "DeclarationNode::build: kindMap is out of sync." );
     1077                static const TypeDecl::Kind kindMap[] = { TypeDecl::Dtype, TypeDecl::Dtype, TypeDecl::Ftype, TypeDecl::Ttype };
     1078                static_assert( sizeof(kindMap)/sizeof(kindMap[0]) == TypeDecl::NUMBER_OF_KINDS, "DeclarationNode::build: kindMap is out of sync." );
    10791079                assertf( variable.tyClass < sizeof(kindMap)/sizeof(kindMap[0]), "Variable's tyClass is out of bounds." );
    10801080                TypeDecl * ret = new TypeDecl( *name, Type::StorageClasses(), nullptr, kindMap[ variable.tyClass ], variable.tyClass == TypeDecl::Otype, variable.initializer ? variable.initializer->buildType() : nullptr );
  • src/Parser/ParseNode.h

    r342af53 r4468a70  
    1010// Created On       : Sat May 16 13:28:16 2015
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Sun Jan  3 18:23:01 2021
    13 // Update Count     : 896
     12// Last Modified On : Sat Oct 24 03:53:54 2020
     13// Update Count     : 895
    1414//
    1515
     
    3939struct DeclarationNode;
    4040class DeclarationWithType;
     41class ExpressionNode;
    4142class Initializer;
    42 class ExpressionNode;
    4343struct StatementNode;
    4444
  • src/Parser/parser.yy

    r342af53 r4468a70  
    1010// Created On       : Sat Sep  1 20:22:55 2001
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Mon Jan 11 21:32:10 2021
    13 // Update Count     : 4633
     12// Last Modified On : Sat Oct 24 08:21:14 2020
     13// Update Count     : 4624
    1414//
    1515
     
    329329%type<en> conditional_expression                constant_expression                     assignment_expression           assignment_expression_opt
    330330%type<en> comma_expression                              comma_expression_opt
    331 %type<en> argument_expression_list_opt  argument_expression                     default_initializer_opt
     331%type<en> argument_expression_list_opt  argument_expression                     default_initialize_opt
    332332%type<ifctl> if_control_expression
    333333%type<fctl> for_control_expression              for_control_expression_list
     
    424424%type<decl> sue_declaration_specifier sue_declaration_specifier_nobody sue_type_specifier sue_type_specifier_nobody
    425425
    426 %type<tclass> type_class new_type_class
     426%type<tclass> type_class
    427427%type<decl> type_declarator type_declarator_name type_declaring_list
    428428
     
    15451545        | cfa_function_declaration
    15461546        | type_declaring_list
    1547                 { SemanticError( yylloc, "otype declaration is currently unimplemented." ); $$ = nullptr; }
    15481547        | trait_specifier
    15491548        ;
     
    22242223        ;
    22252224
    2226 cfa_parameter_ellipsis_list_opt:                                                // CFA, abstract + real
     2225cfa_parameter_ellipsis_list_opt:                                                        // CFA, abstract + real
    22272226        // empty
    22282227                { $$ = DeclarationNode::newBasicType( DeclarationNode::Void ); }
     
    22812280cfa_parameter_declaration:                                                              // CFA, new & old style parameter declaration
    22822281        parameter_declaration
    2283         | cfa_identifier_parameter_declarator_no_tuple identifier_or_type_name default_initializer_opt
     2282        | cfa_identifier_parameter_declarator_no_tuple identifier_or_type_name default_initialize_opt
    22842283                { $$ = $1->addName( $2 ); }
    2285         | cfa_abstract_tuple identifier_or_type_name default_initializer_opt
     2284        | cfa_abstract_tuple identifier_or_type_name default_initialize_opt
    22862285                // To obtain LR(1), these rules must be duplicated here (see cfa_abstract_declarator).
    22872286                { $$ = $1->addName( $2 ); }
    2288         | type_qualifier_list cfa_abstract_tuple identifier_or_type_name default_initializer_opt
     2287        | type_qualifier_list cfa_abstract_tuple identifier_or_type_name default_initialize_opt
    22892288                { $$ = $2->addName( $3 )->addQualifiers( $1 ); }
    22902289        | cfa_function_specifier
     
    23032302parameter_declaration:
    23042303                // No SUE declaration in parameter list.
    2305         declaration_specifier_nobody identifier_parameter_declarator default_initializer_opt
     2304        declaration_specifier_nobody identifier_parameter_declarator default_initialize_opt
    23062305                { $$ = $2->addType( $1 )->addInitializer( $3 ? new InitializerNode( $3 ) : nullptr ); }
    2307         | declaration_specifier_nobody type_parameter_redeclarator default_initializer_opt
     2306        | declaration_specifier_nobody type_parameter_redeclarator default_initialize_opt
    23082307                { $$ = $2->addType( $1 )->addInitializer( $3 ? new InitializerNode( $3 ) : nullptr ); }
    23092308        ;
    23102309
    23112310abstract_parameter_declaration:
    2312         declaration_specifier_nobody default_initializer_opt
     2311        declaration_specifier_nobody default_initialize_opt
    23132312                { $$ = $1->addInitializer( $2 ? new InitializerNode( $2 ) : nullptr ); }
    2314         | declaration_specifier_nobody abstract_parameter_declarator default_initializer_opt
     2313        | declaration_specifier_nobody abstract_parameter_declarator default_initialize_opt
    23152314                { $$ = $2->addType( $1 )->addInitializer( $3 ? new InitializerNode( $3 ) : nullptr ); }
    23162315        ;
     
    24422441        type_class identifier_or_type_name
    24432442                { typedefTable.addToScope( *$2, TYPEDEFname, "9" ); }
    2444           type_initializer_opt assertion_list_opt
     2443        type_initializer_opt assertion_list_opt
    24452444                { $$ = DeclarationNode::newTypeParam( $1, $2 )->addTypeInitializer( $4 )->addAssertions( $5 ); }
    2446         | identifier_or_type_name new_type_class
    2447                 { typedefTable.addToScope( *$1, TYPEDEFname, "9" ); }
    2448           type_initializer_opt assertion_list_opt
    2449                 { $$ = DeclarationNode::newTypeParam( $2, $1 )->addTypeInitializer( $4 )->addAssertions( $5 ); }
    2450         | '[' identifier_or_type_name ']'
    2451                 {
    2452                         typedefTable.addToScope( *$2, TYPEDEFname, "9" );
    2453                         $$ = DeclarationNode::newTypeParam( TypeDecl::ALtype, $2 );
    2454                 }
    2455         // | type_specifier identifier_parameter_declarator
     2445        | type_specifier identifier_parameter_declarator
    24562446        | assertion_list
    24572447                { $$ = DeclarationNode::newTypeParam( TypeDecl::Dtype, new string( DeclarationNode::anonymous.newName() ) )->addAssertions( $1 ); }
    2458         ;
    2459 
    2460 new_type_class:                                                                                 // CFA
    2461         // empty
    2462                 { $$ = TypeDecl::Otype; }
    2463         | '&'
    2464                 { $$ = TypeDecl::Dtype; }
    2465         | '*'
    2466                 { $$ = TypeDecl::DStype; }                                              // dtype + sized
    2467         | ELLIPSIS
    2468                 { $$ = TypeDecl::Ttype; }
    24692448        ;
    24702449
     
    34973476        ;
    34983477
    3499 default_initializer_opt:
     3478default_initialize_opt:
    35003479        // empty
    35013480                { $$ = nullptr; }
  • src/ResolvExpr/ResolveAssertions.cc

    r342af53 r4468a70  
    397397
    398398        /// Limit to depth of recursion of assertion satisfaction
    399         static const int recursionLimit = 7;
     399        static const int recursionLimit = 4;
    400400        /// Maximum number of simultaneously-deferred assertions to attempt concurrent satisfaction of
    401401        static const int deferLimit = 10;
  • src/ResolvExpr/SatisfyAssertions.cpp

    r342af53 r4468a70  
    5757                ast::UniqueId resnSlot;          ///< Slot for any recursive assertion IDs
    5858
    59                 AssnCandidate(
    60                         const ast::SymbolTable::IdData c, const ast::Type * at, ast::TypeEnvironment && e,
     59                AssnCandidate( 
     60                        const ast::SymbolTable::IdData c, const ast::Type * at, ast::TypeEnvironment && e, 
    6161                        ast::AssertionSet && h, ast::AssertionSet && n, ast::OpenVarSet && o, ast::UniqueId rs )
    62                 : cdata( c ), adjType( at ), env( std::move( e ) ), have( std::move( h ) ),
     62                : cdata( c ), adjType( at ), env( std::move( e ) ), have( std::move( h ) ), 
    6363                  need( std::move( n ) ), open( std::move( o ) ), resnSlot( rs ) {}
    6464        };
     
    7373                const AssnCandidate & match;
    7474        };
    75 
    76         /// Wrapper for the deferred items from a single assertion satisfaction.
     75       
     76        /// Wrapper for the deferred items from a single assertion satisfaction. 
    7777        /// Acts like an indexed list of DeferRef
    7878        struct DeferItem {
     
    8181                AssnCandidateList matches;
    8282
    83                 DeferItem(
     83                DeferItem( 
    8484                        const ast::VariableExpr * d, const ast::AssertionSetValue & i, AssnCandidateList && ms )
    8585                : expr( d ), info( i ), matches( std::move( ms ) ) {}
     
    117117                /// Initial satisfaction state for a candidate
    118118                SatState( CandidateRef & c, const ast::SymbolTable & syms )
    119                 : cand( c ), need(), newNeed(), deferred(), inferred(), costs{ Cost::zero },
     119                : cand( c ), need(), newNeed(), deferred(), inferred(), costs{ Cost::zero }, 
    120120                  symtab( syms ) { need.swap( c->need ); }
    121 
     121               
    122122                /// Update satisfaction state for next step after previous state
    123123                SatState( SatState && o, IterateFlag )
    124                 : cand( std::move( o.cand ) ), need( o.newNeed.begin(), o.newNeed.end() ), newNeed(),
    125                   deferred(), inferred( std::move( o.inferred ) ), costs( std::move( o.costs ) ),
     124                : cand( std::move( o.cand ) ), need( o.newNeed.begin(), o.newNeed.end() ), newNeed(), 
     125                  deferred(), inferred( std::move( o.inferred ) ), costs( std::move( o.costs ) ), 
    126126                  symtab( o.symtab ) { costs.emplace_back( Cost::zero ); }
    127 
     127               
    128128                /// Field-wise next step constructor
    129129                SatState(
    130                         CandidateRef && c, ast::AssertionSet && nn, InferCache && i, CostVec && cs,
     130                        CandidateRef && c, ast::AssertionSet && nn, InferCache && i, CostVec && cs, 
    131131                        ast::SymbolTable && syms )
    132                 : cand( std::move( c ) ), need( nn.begin(), nn.end() ), newNeed(), deferred(),
     132                : cand( std::move( c ) ), need( nn.begin(), nn.end() ), newNeed(), deferred(), 
    133133                  inferred( std::move( i ) ), costs( std::move( cs ) ), symtab( std::move( syms ) )
    134134                  { costs.emplace_back( Cost::zero ); }
     
    143143
    144144        /// Binds a single assertion, updating satisfaction state
    145         void bindAssertion(
    146                 const ast::VariableExpr * expr, const ast::AssertionSetValue & info, CandidateRef & cand,
     145        void bindAssertion( 
     146                const ast::VariableExpr * expr, const ast::AssertionSetValue & info, CandidateRef & cand, 
    147147                AssnCandidate & match, InferCache & inferred
    148148        ) {
    149149                const ast::DeclWithType * candidate = match.cdata.id;
    150                 assertf( candidate->uniqueId,
     150                assertf( candidate->uniqueId, 
    151151                        "Assertion candidate does not have a unique ID: %s", toString( candidate ).c_str() );
    152 
     152               
    153153                ast::Expr * varExpr = match.cdata.combine( cand->expr->location, cand->cvtCost );
    154154                varExpr->result = match.adjType;
     
    201201                        ast::OpenVarSet newOpen{ sat.cand->open };
    202202                        ast::ptr< ast::Type > toType = assn.first->result;
    203                         ast::ptr< ast::Type > adjType =
     203                        ast::ptr< ast::Type > adjType = 
    204204                                renameTyVars( adjustExprType( candidate->get_type(), newEnv, sat.symtab ), GEN_USAGE, false );
    205205
     
    213213                                }
    214214
    215                                 matches.emplace_back(
     215                                matches.emplace_back( 
    216216                                        cdata, adjType, std::move( newEnv ), std::move( have ), std::move( newNeed ),
    217217                                        std::move( newOpen ), crntResnSlot );
     
    287287        };
    288288
    289         /// Replace ResnSlots with InferParams and add alternative to output list, if it meets pruning
     289        /// Replace ResnSlots with InferParams and add alternative to output list, if it meets pruning 
    290290        /// threshold.
    291         void finalizeAssertions(
    292                 CandidateRef & cand, InferCache & inferred, PruneMap & thresholds, CostVec && costs,
    293                 CandidateList & out
     291        void finalizeAssertions( 
     292                CandidateRef & cand, InferCache & inferred, PruneMap & thresholds, CostVec && costs, 
     293                CandidateList & out 
    294294        ) {
    295295                // prune if cheaper alternative for same key has already been generated
     
    308308        }
    309309
    310         /// Combo iterator that combines candidates into an output list, merging their environments.
    311         /// Rejects an appended candidate if environments cannot be merged. See `Common/FilterCombos.h`
     310        /// Combo iterator that combines candidates into an output list, merging their environments. 
     311        /// Rejects an appended candidate if environments cannot be merged. See `Common/FilterCombos.h` 
    312312        /// for description of "combo iterator".
    313313        class CandidateEnvMerger {
     
    390390} // anonymous namespace
    391391
    392 void satisfyAssertions(
    393         CandidateRef & cand, const ast::SymbolTable & symtab, CandidateList & out,
     392void satisfyAssertions( 
     393        CandidateRef & cand, const ast::SymbolTable & symtab, CandidateList & out, 
    394394        std::vector<std::string> & errors
    395395) {
     
    418418
    419419                        // should a limit be imposed? worst case here is O(n^2) but very unlikely to happen.
    420                         for (unsigned resetCount = 0; ; ++resetCount) {
     420                        for (unsigned resetCount = 0; ; ++resetCount) { 
    421421                                ast::AssertionList next;
    422422                                resetTyVarRenaming();
     
    449449                                // either add successful match or push back next state
    450450                                if ( sat.newNeed.empty() ) {
    451                                         finalizeAssertions(
     451                                        finalizeAssertions( 
    452452                                                sat.cand, sat.inferred, thresholds, std::move( sat.costs ), out );
    453453                                } else {
     
    471471                                std::vector< CandidateEnvMerger::OutType > compatible = filterCombos(
    472472                                        sat.deferred, CandidateEnvMerger{ sat.cand->env, sat.cand->open, sat.symtab } );
    473 
     473                               
    474474                                // fail early if no mutually-compatible assertion satisfaction
    475475                                if ( compatible.empty() ) {
     
    494494                                        // set up next satisfaction state
    495495                                        CandidateRef nextCand = std::make_shared<Candidate>(
    496                                                 sat.cand->expr, std::move( compat.env ), std::move( compat.open ),
     496                                                sat.cand->expr, std::move( compat.env ), std::move( compat.open ), 
    497497                                                ast::AssertionSet{} /* need moved into satisfaction state */,
    498498                                                sat.cand->cost, sat.cand->cvtCost );
     
    500500                                        ast::AssertionSet nextNewNeed{ sat.newNeed };
    501501                                        InferCache nextInferred{ sat.inferred };
    502 
     502                                       
    503503                                        CostVec nextCosts{ sat.costs };
    504504                                        nextCosts.back() += compat.cost;
    505 
     505                                                               
    506506                                        ast::SymbolTable nextSymtab{ sat.symtab };
    507507
     
    517517                                        // either add successful match or push back next state
    518518                                        if ( nextNewNeed.empty() ) {
    519                                                 finalizeAssertions(
     519                                                finalizeAssertions( 
    520520                                                        nextCand, nextInferred, thresholds, std::move( nextCosts ), out );
    521521                                        } else {
    522                                                 nextSats.emplace_back(
    523                                                         std::move( nextCand ), std::move( nextNewNeed ),
    524                                                         std::move( nextInferred ), std::move( nextCosts ),
     522                                                nextSats.emplace_back( 
     523                                                        std::move( nextCand ), std::move( nextNewNeed ), 
     524                                                        std::move( nextInferred ), std::move( nextCosts ), 
    525525                                                        std::move( nextSymtab ) );
    526526                                        }
     
    534534                nextSats.clear();
    535535        }
    536 
     536       
    537537        // exceeded recursion limit if reaches here
    538538        if ( out.empty() ) {
  • src/SymTab/Demangle.cc

    r342af53 r4468a70  
    1010// Created On       : Thu Jul 19 12:52:41 2018
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Mon Jan 11 21:28:27 2021
    13 // Update Count     : 11
     12// Last Modified On : Tue Feb 11 15:09:18 2020
     13// Update Count     : 10
    1414//
    1515
     
    367367                                // type variable types
    368368                                for (size_t k = 0; k < TypeDecl::NUMBER_OF_KINDS; ++k) {
    369                                         static const std::string typeVariableNames[] = { "DT", "DST", "OT", "FT", "TT", "ALT", };
     369                                        static const std::string typeVariableNames[] = { "DT", "OT", "FT", "TT", };
    370370                                        static_assert(
    371371                                                sizeof(typeVariableNames)/sizeof(typeVariableNames[0]) == TypeDecl::NUMBER_OF_KINDS,
  • src/SymTab/Mangler.cc

    r342af53 r4468a70  
    1010// Created On       : Sun May 17 21:40:29 2015
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Mon Jan 11 21:56:06 2021
    13 // Update Count     : 74
     12// Last Modified On : Wed Nov 18 12:01:38 2020
     13// Update Count     : 64
    1414//
    1515#include "Mangler.h"
     
    313313                                // and the case has not yet come up in practice. Alternatively, if not then this code can be removed
    314314                                // aside from the assert false.
    315                                 assertf( false, "Mangler_old should not visit typedecl: %s", toCString(decl));
     315                                assertf(false, "Mangler_old should not visit typedecl: %s", toCString(decl));
    316316                                assertf( decl->kind < TypeDecl::NUMBER_OF_KINDS, "Unhandled type variable kind: %d", decl->kind );
    317317                                mangleName += Encoding::typeVariables[ decl->kind ] + std::to_string( decl->name.length() ) + decl->name;
     
    343343                                                        break;
    344344                                                  default:
    345                                                         assertf( false, "unimplemented kind for type variable %s", SymTab::Mangler::Encoding::typeVariables[i->kind].c_str() );
     345                                                        assert( false );
    346346                                                } // switch
    347347                                                varNums[ i->name ] = std::make_pair( nextVarNum, (int)i->kind );
     
    673673                                        for ( auto & decl : ptype->forall ) {
    674674                                                switch ( decl->kind ) {
    675                                                   case ast::TypeDecl::Kind::Dtype:
     675                                                case ast::TypeDecl::Kind::Dtype:
    676676                                                        dcount++;
    677677                                                        break;
    678                                                   case ast::TypeDecl::Kind::Ftype:
     678                                                case ast::TypeDecl::Kind::Ftype:
    679679                                                        fcount++;
    680680                                                        break;
    681                                                   case ast::TypeDecl::Kind::Ttype:
     681                                                case ast::TypeDecl::Kind::Ttype:
    682682                                                        vcount++;
    683683                                                        break;
    684                                                   default:
    685                                                         assertf( false, "unimplemented kind for type variable %s", SymTab::Mangler::Encoding::typeVariables[decl->kind].c_str() );
     684                                                default:
     685                                                        assert( false );
    686686                                                } // switch
    687687                                                varNums[ decl->name ] = std::make_pair( nextVarNum, (int)decl->kind );
  • src/SymTab/ManglerCommon.cc

    r342af53 r4468a70  
    1010// Created On       : Sun May 17 21:44:03 2015
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Mon Jan 11 21:23:10 2021
    13 // Update Count     : 29
     12// Last Modified On : Fri Dec 13 14:54:38 2019
     13// Update Count     : 28
    1414//
    1515
     
    104104                        const std::string typeVariables[] = {
    105105                                "BD", // dtype
    106                                 "BDS", // dtype + sized
    107106                                "BO", // otype
    108107                                "BF", // ftype
    109108                                "BT", // ttype
    110                                 "BAL", // array length type
    111109                        };
    112110                        static_assert(
    113                                 sizeof(typeVariables) / sizeof(typeVariables[0]) == TypeDecl::NUMBER_OF_KINDS,
     111                                sizeof(typeVariables)/sizeof(typeVariables[0]) == TypeDecl::NUMBER_OF_KINDS,
    114112                                "Each type variable kind should have a corresponding mangler prefix"
    115113                        );
  • src/SynTree/Declaration.h

    r342af53 r4468a70  
    1010// Created On       : Mon May 18 07:44:20 2015
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Mon Jan 11 20:48:39 2021
    13 // Update Count     : 158
     12// Last Modified On : Fri Dec 13 23:11:22 2019
     13// Update Count     : 157
    1414//
    1515
     
    201201        typedef NamedTypeDecl Parent;
    202202  public:
    203         enum Kind { Dtype, DStype, Otype, Ftype, Ttype, ALtype, NUMBER_OF_KINDS };
     203        enum Kind { Dtype, Otype, Ftype, Ttype, NUMBER_OF_KINDS };
    204204
    205205        Kind kind;
  • src/SynTree/TypeDecl.cc

    r342af53 r4468a70  
    1010// Created On       : Mon May 18 07:44:20 2015
    1111// Last Modified By : Peter A. Buhr
    12 // Last Modified On : Tue Jan 12 16:07:33 2021
    13 // Update Count     : 26
     12// Last Modified On : Thu Oct  8 18:18:55 2020
     13// Update Count     : 22
    1414//
    1515
     
    3333
    3434const char * TypeDecl::typeString() const {
    35         static const char * kindNames[] = { "sized data type", "sized data type", "sized object type", "sized function type", "sized tuple type", "sized array length type" };
    36         static_assert( sizeof(kindNames) / sizeof(kindNames[0]) == TypeDecl::NUMBER_OF_KINDS, "typeString: kindNames is out of sync." );
     35        static const char * kindNames[] = { "sized data type", "sized object type", "sized function type", "sized tuple type" };
     36        static_assert( sizeof(kindNames)/sizeof(kindNames[0]) == TypeDecl::NUMBER_OF_KINDS, "typeString: kindNames is out of sync." );
    3737        assertf( kind < TypeDecl::NUMBER_OF_KINDS, "TypeDecl kind is out of bounds." );
    3838        return isComplete() ? kindNames[ kind ] : &kindNames[ kind ][ sizeof("sized") ]; // sizeof includes '\0'
     
    4040
    4141const char * TypeDecl::genTypeString() const {
    42         static const char * kindNames[] = { "T &", "T *", "T", "(*)", "T ...", "[T]" };
    43         static_assert( sizeof(kindNames) / sizeof(kindNames[0]) == TypeDecl::NUMBER_OF_KINDS, "genTypeString: kindNames is out of sync." );
     42        static const char * kindNames[] = { "dtype", "otype", "ftype", "ttype" };
     43        static_assert( sizeof(kindNames)/sizeof(kindNames[0]) == TypeDecl::NUMBER_OF_KINDS, "genTypeString: kindNames is out of sync." );
    4444        assertf( kind < TypeDecl::NUMBER_OF_KINDS, "TypeDecl kind is out of bounds." );
    4545        return kindNames[ kind ];
  • tests/Makefile.am

    r342af53 r4468a70  
    103103
    104104mostlyclean-local :
    105         find ${builddir} -not -path './__pycache__/*' -path '*.o' -delete
    106         find ${builddir} -not -path './__pycache__/*' -path '*/.err/*.log' -delete
    107         find ${builddir} -not -path './__pycache__/*' -path '*/.out/*.log' -delete
    108105        rm -f ${EXTRA_PROGRAMS}
    109106        rm -rf __pycache__
     107        find ${builddir} -path '*.o' -delete
     108        find ${builddir} -path '*/.err/*.log' -delete
     109        find ${builddir} -path '*/.out/*.log' -delete
    110110
    111111distclean-local :
  • tests/concurrent/futures/.expect/basic.txt

    r342af53 r4468a70  
    1 start
    21done
  • tests/concurrent/futures/basic.cfa

    r342af53 r4468a70  
    11#include <thread.hfa>
    2 
    32enum {NFUTURES = 10};
    43
     
    8483
    8584int main() {
    86         printf( "start\n" );                            // non-empty .expect file
    8785        processor procs[2];
    8886        {
Note: See TracChangeset for help on using the changeset viewer.