Changeset 23a08aa0


Timestamp:
Sep 19, 2022, 8:11:02 PM (3 years ago)
Author:
Peter A. Buhr <pabuhr@…>
Branches:
ADT, ast-experimental, master, pthread-emulation
Children:
aa9f215
Parents:
ebf8ca5 (diff), ae1d151 (diff)
Note: this is a merge changeset, the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.
Message:

fix merge conflict

Files:
3 added
47 deleted
95 edited
36 moved

  • Jenkins/FullBuild

    rebf8ca5 r23a08aa0  
    1818
    1919                                parallel (
    20                                         // gcc_08_x86_new: { trigger_build( 'gcc-8',   'x86' ) },
    21                                         // gcc_07_x86_new: { trigger_build( 'gcc-7',   'x86' ) },
    22                                         // gcc_06_x86_new: { trigger_build( 'gcc-6',   'x86' ) },
     20                                        gcc_08_x86_new: { trigger_build( 'gcc-10',  'x86' ) },
     21                                        gcc_07_x86_new: { trigger_build( 'gcc-9',   'x86' ) },
    2322                                        gcc_10_x64_new: { trigger_build( 'gcc-10',  'x64' ) },
    2423                                        gcc_09_x64_new: { trigger_build( 'gcc-9',   'x64' ) },
  • Jenkinsfile

    rebf8ca5 r23a08aa0  
    150150                sh 'ulimit -a'
    151151
    152                 Tools.BuildStage('Test: short', !Settings.RunAllTests) {
     152                jopt = '-j $(nproc)'
     153
     154                Tools.BuildStage('Test: Debug', true) {
    153155                        dir (BuildDir) {
    154156                                //Run the tests from the tests directory
    155                                 sh "make --no-print-directory -C tests archiveerrors=${BuildDir}/tests/crashes/short"
     157                                sh """make ${jopt} --no-print-directory -C tests timeouts="--timeout=600 --global-timeout=14400" all-tests debug=yes archiveerrors=${BuildDir}/tests/crashes/full-debug"""
    156158                        }
    157159                }
    158160
    159                 Tools.BuildStage('Test: full', Settings.RunAllTests) {
     161                Tools.BuildStage('Test: Release', Settings.RunAllTests) {
    160162                        dir (BuildDir) {
    161                                         jopt = '-j $(nproc)'
    162                                         if( Settings.Architecture.node == 'x86' ) {
    163                                                 jopt = '-j2'
    164                                         }
    165                                         //Run the tests from the tests directory
    166                                         sh """make ${jopt} --no-print-directory -C tests timeouts="--timeout=600 --global-timeout=14400" all-tests debug=yes archiveerrors=${BuildDir}/tests/crashes/full-debug"""
    167                                         sh """make ${jopt} --no-print-directory -C tests timeouts="--timeout=600 --global-timeout=14400" all-tests debug=no  archiveerrors=${BuildDir}/tests/crashes/full-nodebug"""
     163                                //Run the tests from the tests directory
     164                                sh """make ${jopt} --no-print-directory -C tests timeouts="--timeout=600 --global-timeout=14400" all-tests debug=no  archiveerrors=${BuildDir}/tests/crashes/full-nodebug"""
    168165                        }
    169166                }
     
    384381                                ],                                                                                              \
    385382                                [$class: 'BooleanParameterDefinition',                                                  \
    386                                         description: 'If false, only the quick test suite is ran',              \
     383                                        description: 'If false, the test suite is only run in debug',   \
    387384                                        name: 'RunAllTests',                                                            \
    388385                                        defaultValue: false,                                                            \
  • benchmark/basic/tls_fetch_add.c

    rebf8ca5 r23a08aa0  
    77// thread_local Boolean. This means the entire protocol is just two "mov" instructions, making it extremely cheap.
    88
    9 #define thread_local _Thread_local
    10 
    11 thread_local volatile bool value;
     9__thread volatile bool value;
    1210
    1311void __attribute__((noinline)) do_call() {
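
The change above drops the benchmark's local thread_local macro (previously defined as _Thread_local) in favour of GCC's __thread qualifier. A minimal C sketch of the resulting pattern, with a hypothetical body for do_call since the hunk ends before it (illustrative only, not the benchmark's full source):

	#include <stdbool.h>

	// Thread-local flag declared with the GCC/Clang __thread qualifier.
	// As the comment above notes, testing and resetting such a flag is
	// just a pair of "mov" instructions, making the protocol very cheap.
	__thread volatile bool value;

	void __attribute__((noinline)) do_call() {
		if (value) {            // TLS read: one mov
			value = false;      // TLS write: one mov
		}
	}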
  • benchmark/io/http/worker.cfa

    rebf8ca5 r23a08aa0  
    145145                if( options.log ) mutex(sout) sout | "=== Accepting connection ===";
    146146                int fd = cfa_accept4( this.sockfd, this.[addr, addrlen, flags], CFA_IO_LAZY );
    147                 if(fd < 0) {
     147                if(fd <= 0) {
    148148                        if( errno == ECONNABORTED ) break;
    149149                        if( this.done && (errno == EINVAL || errno == EBADF) ) break;
    150                         abort( "accept error: (%d) %s\n", (int)errno, strerror(errno) );
     150                        abort( "accept error %d: (%d) %s\n", fd, (int)errno, strerror(errno) );
    151151                }
    152152                if(this.done) break;
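
The change above treats a zero return from cfa_accept4 as an error and includes the returned value in the abort message. A plain-C sketch of the same accept-retry pattern (hypothetical names; the real code is Cforall and uses cfa_accept4 with CFA_IO_LAZY):

	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/socket.h>

	// Accept one connection; returns the new fd, or -1 when the worker
	// should stop (connection aborted, or a shutdown in progress).
	int accept_one(int sockfd, volatile int *done) {
		int fd = accept(sockfd, NULL, NULL);
		if (fd <= 0) {
			if (errno == ECONNABORTED) return -1;
			if (*done && (errno == EINVAL || errno == EBADF)) return -1;
			fprintf(stderr, "accept error %d: (%d) %s\n", fd, errno, strerror(errno));
			exit(EXIT_FAILURE);
		}
		return fd;
	}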
  • configure.ac

    rebf8ca5 r23a08aa0  
    2424#Transforming cc1 will break compilation
    2525M4CFA_PROGRAM_NAME
    26 
    27 #==============================================================================
    28 # New AST toggling support
    29 AH_TEMPLATE([CFA_USE_NEW_AST],[Sets whether or not to use the new-ast, this is adefault value and can be overrided by --old-ast and --new-ast])
    30 DEFAULT_NEW_AST="True"
    31 AC_ARG_ENABLE(new-ast,
    32         [  --enable-new-ast     whether or not to use new ast as the default AST algorithm],
    33         [case "${enableval}" in
    34                 yes) newast=true ; DEFAULT_NEW_AST="True"  ;;
    35                 no)  newast=false; DEFAULT_NEW_AST="False" ;;
    36                 *) AC_MSG_ERROR([bad value ${enableval} for --enable-new-ast]) ;;
    37         esac],[newast=true])
    38 AC_DEFINE_UNQUOTED([CFA_USE_NEW_AST], $newast)
    39 AC_SUBST(DEFAULT_NEW_AST)
    4026
    4127#==============================================================================
     
    139125                \'--enable-gprofiler=*) ;;
    140126                \'--disable-gprofiler) ;;
    141 
    142                 # skip the target hosts
    143                 \'--enable-new-ast=*) ;;
    144                 \'--disable-new-ast) ;;
    145127
    146128                # skip this, it only causes problems
  • doc/LaTeXmacros/lstlang.sty

    rebf8ca5 r23a08aa0  
    118118                inline, __inline, __inline__, __int128, int128, __label__, monitor, mutex, _Noreturn, one_t, or,
    119119                otype, restrict, __restrict, __restrict__, recover, report, __signed, __signed__, _Static_assert, suspend,
    120                 thread, _Thread_local, throw, throwResume, timeout, trait, try, ttype, typeof, __typeof, __typeof__,
     120                thread, __thread, _Thread_local, throw, throwResume, timeout, trait, try, ttype, typeof, __typeof, __typeof__,
    121121                virtual, __volatile, __volatile__, waitfor, when, with, zero_t,
    122122    },
  • doc/bibliography/pl.bib

    rebf8ca5 r23a08aa0  
    37573757    series      = {Innovative Technology},
    37583758    year        = 1991,
     3759}
     3760
     3761@mastersthesis{Zulfiqar22,
     3762    keywords    = {Cforall, memory allocation, threading},
     3763    contributer = {pabuhr@plg},
     3764    author      = {Mubeen Zulfiqar},
     3765    title       = {High-Performance Concurrent Memory Allocation},
     3766    school      = {School of Computer Science, University of Waterloo},
     3767    year        = 2022,
     3768    address     = {Waterloo, Ontario, Canada, N2L 3G1},
     3769    note        = {\href{https://uwspace.uwaterloo.ca/handle/10012/18329}{https://\-uwspace.uwaterloo.ca/\-handle/\-10012/18329}},
    37593770}
    37603771
  • doc/proposals/iterators.md

    rebf8ca5 r23a08aa0  
    5858returns a range object, which can be used as any other type.
    5959
     60It might not cover every single case with the same syntax (the `@` syntax may
     61not translate to operators very well), but should be able to maintain every
     62option with some library range.
     63
    6064Library Enhancements
    6165--------------------
     
    8286------------
    8387Python has a robust iterator tool set. It also has a `range` built-in which
    84 does many of the same things as the special for loops.
     88does many of the same things as the special for loops (the finite and
     89half-open ranges).
     90
     91In addition, it has many dedicated iterator constructors and transformers,
     92and many containers can both produce and be constructed from iterators.
    8593
    8694+   https://docs.python.org/3/reference/datamodel.html#object.__iter__
    8795+   https://docs.python.org/3/library/functions.html#func-range
    8896
    89 C++ has many iterator tools at well, except for the fact it's `iterators` are
     97C++ has many iterator tools as well, except for the fact that its "iterators" are
    9098not what are usually called iterators (as above) but rather an abstraction of
    91 pointers.
     99pointers. The notable missing feature is that a single iterator has no
     100concept of being empty or not; instead, it must be compared to the end
     101iterator.
     102
     103However, C++ ranges have an interface much more similar to iterators.
     104They do appear to be a wrapper around the "pointer" iterators.
     105
     106+   https://en.cppreference.com/w/cpp/ranges
    92107
    93108Rust also has an imperative implementation of a functional style of iterators,
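
The C++ point above can be made concrete in plain C: a C++-style iterator is only a position and cannot answer "am I done?" by itself, whereas a self-contained iterator carries its own bound. A small sketch (hypothetical types, not from the proposal):

	// C++-style "iterator": just a position; testing for emptiness
	// requires comparing against a second, separate end position.
	typedef struct { int *cur; } pos_iter;

	// Self-contained iterator: carries its bound, so emptiness is a
	// property of the iterator itself.
	typedef struct { int *cur; int *end; } range_iter;

	static inline int range_empty(range_iter it)  { return it.cur == it.end; }
	static inline int range_next (range_iter *it) { return *it->cur++; }

	// Usage: sum an array through the self-contained iterator.
	int sum(int *data, int n) {
		range_iter it = { data, data + n };
		int total = 0;
		while (!range_empty(it)) total += range_next(&it);
		return total;
	}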
  • doc/theses/thierry_delisle_PhD/.gitignore

    rebf8ca5 r23a08aa0  
    2020thesis/fig/*.fig.bak
    2121thesis/thesis.pdf
     22thesis/thesis.tty
    2223thesis/thesis.ps
    2324
  • doc/theses/thierry_delisle_PhD/thesis/Makefile

    rebf8ca5 r23a08aa0  
    1111LaTeX  = TEXINPUTS=${TeXLIB} && export TEXINPUTS && latex -halt-on-error -output-directory=${Build}
    1212BibTeX = BIBINPUTS=${TeXLIB} && export BIBINPUTS && bibtex
     13DeTeX = TEXINPUTS=${TeXLIB} && export TEXINPUTS && detex -r
    1314
    1415MAKEFLAGS = --no-print-directory # --silent
     
    144145        ${LaTeX} $<
    145146
     147%.tty: build/%.dvi
     148        dvi2tty -w132 $< > $@
     149
    146150## Define the default recipes.
    147151
     
    190194churn_jax_ops_FLAGS = --MaxY=50000000
    191195churn_low_jax_ops_FLAGS = --MaxY=50000000
    192 churn_jax_ns_FLAGS = --MaxY=20000
    193 churn_low_jax_ns_FLAGS = --MaxY=20000
     196churn_jax_ns_FLAGS = --MaxY=10000
     197churn_low_jax_ns_FLAGS = --MaxY=10000
    194198
    195199churn_nasus_ops_FLAGS = --MaxY=75000000
    196200churn_low_nasus_ops_FLAGS = --MaxY=75000000
    197 churn_nasus_ns_FLAGS = --MaxY=20000
    198 churn_low_nasus_ns_FLAGS = --MaxY=20000
     201churn_nasus_ns_FLAGS = --MaxY=5000
     202churn_low_nasus_ns_FLAGS = --MaxY=5000
     203
     204locality_share_jax_ops_FLAGS = --MaxY=40000000
     205locality_noshare_jax_ops_FLAGS = --MaxY=40000000
     206locality_share_jax_ns_FLAGS = --MaxY=10000
     207locality_noshare_jax_ns_FLAGS = --MaxY=10000
     208
     209locality_share_nasus_ops_FLAGS = --MaxY=60000000
     210locality_noshare_nasus_ops_FLAGS = --MaxY=60000000
     211locality_share_nasus_ns_FLAGS = --MaxY=10000
     212locality_noshare_nasus_ns_FLAGS = --MaxY=10000
    199213
    200214build/result.%.ns.svg : data/% Makefile ../../../../benchmark/plot.py | ${Build}
  • doc/theses/thierry_delisle_PhD/thesis/glossary.tex

    rebf8ca5 r23a08aa0  
    1414% Definitions
    1515
    16 \longnewglossaryentry{thrd}
    17 {name={thread}}
     16\longnewglossaryentry{at}
     17{name={Thread},text={thread}}
    1818{
    19 Threads created and managed inside user-space. Each thread has its own stack and its own thread of execution. User-level threads are invisible to the underlying operating system.
     19A thread is an independent sequential execution path through a program. Each thread is scheduled for execution separately and independently from other threads. Systems offer one or more concrete implementations of this concept, \eg \gls{kthrd}, \gls{job}, task. However, most of the concepts of scheduling are independent of the particular implementations of the thread representation. For this reason, this document uses the term \gls{at} to mean any of these representations that meet the general definition.
    2020
    21 \textit{Synonyms : User threads, Lightweight threads, Green threads, Virtual threads, Tasks.}
     21\textit{Synonyms : Tasks, Jobs, Blocks.}
    2222}
    2323
    2424\longnewglossaryentry{proc}
    25 {name={processor}}
     25{name={Processor},text={processor}}
    2626{
     27Entity that executes a \gls{at}, \ie the resource being scheduled by the scheduler. In kernel-level threading, \ats are kernel threads and \procs are the \glspl{hthrd} on which the kernel threads are scheduled. In user-level threading and thread pools, \procs are kernel threads.
    2728
     29\textit{Synonyms : Server, Worker.}
    2830}
    2931
    3032\longnewglossaryentry{rQ}
    31 {name={ready-queue}}
     33{name={Ready Queue}, text={ready-queue}}
    3234{
    33 
     35Data structure holding \ats that are ready to \glslink{atrun}{run}. Often a \glsxtrshort{fifo} queue for fairness, but it can take many different forms, \eg binary trees and priority queues are also common.
    3436}
    3537
    3638\longnewglossaryentry{uthrding}
    37 {name={user-level threading}}
     39{name={User-Level Threading},text={user-level threading}}
    3840{
    39 
     41Threading model where a scheduler runs in user space and maps threads created and managed in user space onto \glspl{kthrd}.
    4042
    4143\textit{Synonyms : User threads, Lightweight threads, Green threads, Virtual threads, Tasks.}
     
    4345
    4446\longnewglossaryentry{rmr}
    45 {name={remote memory reference}}
     47{name={Remote Memory Reference},text={remote memory reference}}
    4648{
    47 
     49A memory reference to an address not in the current \gls{hthrd}'s cache is a remote reference. Memory references that \emph{are} in the current \gls{hthrd}'s cache are \newterm{local} memory references. For example, a remote reference occurs when a cache line must be updated from a cache on another socket, or from RAM in a \glsxtrshort{numa} context.
    4850}
    4951
     
    5153
    5254\longnewglossaryentry{hthrd}
    53 {name={hardware thread}}
     55{name={Hardware Thread},text={hardware thread}}
    5456{
    55 Threads representing the underlying hardware directly, \eg the CPU core, or hyper-thread if the hardware supports multiple threads of execution per core. The number of hardware threads is considered to be always fixed to a specific number determined by the hardware.
     57Threads representing the underlying hardware, \eg a CPU core or hyper-thread, if the hardware supports multiple threads of execution per core. The number of hardware threads present is fixed on any given computer.
    5658
    57 \textit{Synonyms : }
     59\textit{Synonyms : Core, Hyper-Thread, Processing Unit, CPU.}
    5860}
    5961
    6062\longnewglossaryentry{kthrd}
    61 {name={kernel-level thread}}
     63{name={Kernel-Level Thread},text={kernel-level thread}}
    6264{
    63 Threads created and managed inside kernel-space. Each thread has its own stack and its own thread of execution. Kernel-level threads are owned, managed and scheduled by the underlying operating system.
     65Threads created and managed inside kernel space. Each kernel thread has its own stack and its own thread of execution. Kernel-level threads are owned, managed and scheduled by the underlying operating system.
    6466
    6567\textit{Synonyms : OS threads, Hardware threads, Physical threads.}
     
    6769
    6870\longnewglossaryentry{fiber}
    69 {name={fiber}}
     71{name={Fiber},text={fiber}}
    7072{
    71 Fibers are non-preemptive user-level threads. They share most of the caracteristics of user-level threads except that they cannot be preempted by another fiber.
     73Fibers are non-preemptive user-level threads. They share most of the characteristics of user-level threads except that they cannot be preempted by another fiber.
    7274
    7375\textit{Synonyms : Tasks.}
     
    7577
    7678\longnewglossaryentry{job}
    77 {name={job}}
     79{name={Job},text={job}}
    7880{
    7981Unit of work, often sent to a thread pool or worker pool to be executed. Has neither its own stack nor its own thread of execution.
     
    8385
    8486\longnewglossaryentry{pool}
    85 {name={thread-pool}}
     87{name={Thread Pool},text={thread-pool}}
    8688{
    87 Group of homogeneuous threads that loop executing units of works after another.
     89Group of homogeneous threads that loop executing units of work, often \glspl{job}.
    8890
    89 \textit{Synonyms : }
     91\textit{Synonyms : Executor.}
    9092}
    9193
    9294\longnewglossaryentry{preemption}
    93 {name={preemption}}
     95{name={Preemption},text={preemption}}
    9496{
    9597Involuntary context switch imposed on threads at a given rate.
     
    98100}
    99101
    100 
    101 
    102 \longnewglossaryentry{at}
    103 {name={task}}
    104 {
    105 Abstract object representing an unit of work. Systems will offer one or more concrete implementations of this concept (\eg \gls{kthrd}, \gls{job}), however, most of the concept of schedulings are independent of the particular implementations of the work representation. For this reason, this document use the term \Gls{at} to mean any representation and not one in particular.
    106 }
    107 
    108102\longnewglossaryentry{atsched}
    109103{name={Scheduling a \gls{at}}}
    110104{
    111 Scheduling an \gls{at} refers to the act of notifying the scheduler that a task is ready to be ran. When representing the scheduler as a queue of tasks, scheduling is the act of pushing a task onto the end of the queue. This doesn't necesserily means the task will ever be allocated CPU time (\gls{atrun}), for example, if the system terminates abruptly, scheduled \glspl{at} will probably never run.
     105Scheduling a \at refers to notifying the scheduler that a \at is ready to run. When representing the scheduler as a queue of \ats, scheduling is the act of pushing a \at onto the end of the queue. This operation does not necessarily mean the \at is guaranteed CPU time (\gls{atrun}), \eg if the program terminates abruptly, scheduled \glspl{at} never run.
    112106
    113 \textit{Synonyms : None.}
     107\textit{Synonyms : Unparking.}
    114108}
    115109
     
    117111{name={Running a \gls{at}}}
    118112{
    119 Running an \gls{at} refers to the act of allocating CPU time to a task that is ready to run. When representing the scheduler as a queue of tasks, running is the act of poping a task from the front of the queue and putting it onto a \gls{proc}. The \gls{at} can than accomplish some or all of the work it is programmed to do.
     113Running a \at refers to allocating CPU time to a \at that is ready to run. When representing the scheduler as a queue of \ats, running is the act of popping a \at from the front of the queue and putting it onto a \gls{proc}. The \gls{at} can then accomplish some or all of the work it is programmed to do.
    120114
    121115\textit{Synonyms : None.}
     
    123117
    124118\longnewglossaryentry{atmig}
    125 {name={migration of \gls{at}}}
     119{name={\Glspl{at} Migration}}
    126120{
    127 Migration refers to the idea of an \gls{at} running on a different worker/processor than the last time it was run. It is generally preferable to minimise migration as it incurs cost but any load balancing among workers requires some amount of migration.
     121Migration refers to the idea of an \gls{at} running on a different \proc than the last time it was run. It is generally preferable to minimize migration as it incurs cost, but any load balancing among \procs requires some amount of migration.
    128122
    129123\textit{Synonyms : None.}
     
    131125
    132126\longnewglossaryentry{atpass}
    133 {name={overtaking \gls{at}}}
     127{name={Overtaking \gls{at}}}
    134128{
    135129When representing the scheduler as a queue of \glspl{at}, overtaking is the act of breaking the FIFO-ness of the queue by moving a \gls{at} in front of some other \gls{at} even though it arrived later. This remains true for schedulers that do not use a FIFO queue, when the \glspl{at} are \glslink{atsched}{scheduled} in one order but \glslink{atrun}{run} in a different order. A \gls{at} is said to \emph{overtake} another if it is run \emph{before} but was \emph{scheduled} after the other \gls{at}.
     
    139133
    140134\longnewglossaryentry{atblock}
    141 {name={Blocking an \gls{at}}}
     135{name={\Gls{at} Blocking}}
    142136{
    143 Blocking an abstract task refers to the act of taking a task that us running on a CPU off the CPU. Unless no other task is ready, this action is generally immediately followed by running an other task.
     137\Gls{at} blocking means taking a running \at off a CPU. Unless no other \at is ready, this action is immediately followed by running another \at.
    144138
    145 \textit{Synonyms : None.}
     139\textit{Synonyms : Parking.}
    146140}
    147141
     
    149143{name={Running to completion}}
    150144{
    151 Running to completion refers to the entire sequence of : being scheduled, running and blocking, for a given task.
     145Running to completion refers to the entire sequence of being scheduled, running and blocking, for a given \at.
    152146
    153147See also \gls{atsched}, \gls{atrun}, \gls{atblock}
     
    157151
    158152\longnewglossaryentry{load}
    159 {name={System Load}}
     153{name={System Load},text={load}}
    160154{
    161 The load is refers to the rate at which \glspl{at} are \glslink{atsched}{scheduled} versus the rate at which they are \glslink{atrun}{run}. When \glspl{at} are being scheduled faster than they are run, the system is considered \emph{overloaded}. When \glspl{at} are being run faster than they are scheduled, the system is considered \emph{underloaded}. Conrrespondingly, if both rates are equal, the system is considered \emph{loaded}. Note that the system is considered loaded only of the rate at which \glspl{at} are scheduled/run is non-zero, otherwise the system is empty, it has no load.
     155The system load refers to the rate at which \glspl{at} are \glslink{atsched}{scheduled} versus the rate at which they are \glslink{atrun}{run}. When \glspl{at} are being scheduled faster than they are run, the system is considered \emph{overloaded}. When \glspl{at} are being run faster than they are scheduled, the system is considered \emph{underloaded}. Correspondingly, if both rates are equal, the system is considered \emph{loaded}. Note the system is considered loaded only if the rate at which \glspl{at} are scheduled/run is non-zero; otherwise the system is empty, \ie it has no load.
     156
     157\textit{Synonyms : CPU Load, System Load.}
    162158}
    163159
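
The scheduling and running entries above model the scheduler as a FIFO queue: scheduling pushes a ready thread onto the tail, and running pops one from the head onto a processor. A minimal C sketch of that model (illustrative types, not code from this changeset):

	typedef struct task task;
	struct task { task *next; };

	typedef struct { task *head, *tail; } ready_queue;

	// "Scheduling a task": push a ready task onto the tail of the queue.
	void schedule(ready_queue *q, task *t) {
		t->next = NULL;
		if (q->tail) q->tail->next = t; else q->head = t;
		q->tail = t;
	}

	// "Running a task": pop from the head; NULL means nothing is ready.
	task *next_to_run(ready_queue *q) {
		task *t = q->head;
		if (t) { q->head = t->next; if (!q->head) q->tail = NULL; }
		return t;
	}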
  • doc/theses/thierry_delisle_PhD/thesis/local.bib

    rebf8ca5 r23a08aa0  
    458458}
    459459
    460 
     460% Trevor's relaxed FIFO list
     461@inproceedings{alistarh2018relaxed,
     462  title={Relaxed schedulers can efficiently parallelize iterative algorithms},
     463  author={Alistarh, Dan and Brown, Trevor and Kopinsky, Justin and Nadiradze, Giorgi},
     464  booktitle={Proceedings of the 2018 ACM Symposium on Principles of Distributed Computing},
     465  pages={377--386},
     466  year={2018}
     467}
     468
     469@article{zhuravlev2012survey,
     470  title={Survey of energy-cognizant scheduling techniques},
     471  author={Zhuravlev, Sergey and Saez, Juan Carlos and Blagodurov, Sergey and Fedorova, Alexandra and Prieto, Manuel},
     472  journal={IEEE Transactions on Parallel and Distributed Systems},
     473  volume={24},
     474  number={7},
     475  pages={1447--1464},
     476  year={2012},
     477  publisher={IEEE}
     478}
     479
     480@article{vikranth2013topology,
     481  title={Topology aware task stealing for on-chip NUMA multi-core processors},
     482  author={Vikranth, BRWACRR and Wankar, Rajeev and Rao, C Raghavendra},
     483  journal={Procedia Computer Science},
     484  volume={18},
     485  pages={379--388},
     486  year={2013},
     487  publisher={Elsevier}
     488}
     489
     490@inproceedings{min2011hierarchical,
     491  title={Hierarchical work stealing on manycore clusters},
     492  author={Min, Seung-Jai and Iancu, Costin and Yelick, Katherine},
     493  booktitle={Fifth Conference on Partitioned Global Address Space Programming Models (PGAS11)},
     494  volume={625},
     495  year={2011},
     496  organization={Citeseer}
     497}
     498
     499@article{ribic2014energy,
     500  title={Energy-efficient work-stealing language runtimes},
     501  author={Ribic, Haris and Liu, Yu David},
     502  journal={ACM SIGARCH Computer Architecture News},
     503  volume={42},
     504  number={1},
     505  pages={513--528},
     506  year={2014},
     507  publisher={ACM New York, NY, USA}
     508}
     509
     510@inproceedings{torng2016asymmetry,
     511  title={Asymmetry-aware work-stealing runtimes},
     512  author={Torng, Christopher and Wang, Moyang and Batten, Christopher},
     513  booktitle={2016 ACM/IEEE 43rd Annual International Symposium on Computer Architecture (ISCA)},
     514  pages={40--52},
     515  year={2016},
     516  organization={IEEE}
     517}
    461518
    462519% --------------------------------------------------
     
    555612  title = {Mach Scheduling and Thread Interfaces - Kernel Programming Guide},
    556613  organization = {Apple Inc.},
    557   howPublish = {\href{https://developer.apple.com/library/archive/documentation/Darwin/Conceptual/KernelProgramming/scheduler/scheduler.html}{https://developer.apple.com/library/archive/documentation/Darwin/Conceptual/KernelProgramming/scheduler/scheduler.html}}
     614  note = {\href{https://developer.apple.com/library/archive/documentation/Darwin/Conceptual/KernelProgramming/scheduler/scheduler.html}{https://\-developer.apple.com/\-library/archive/\-documentation/\-Darwin/\-Conceptual/\-KernelProgramming/\-scheduler/\-scheduler.html}}
     615}
     616
     617@misc{MemcachedThreading,
     618  author = {Oracle},
     619  title = {MySQL 5.6 Reference Manual Including MySQL NDB Cluster 7.3-7.4 Reference Guide},
     620  howpublished = {\href{https://docs.oracle.com/cd/E17952_01/mysql-5.6-en/ha-memcached-using-threads.html}{https://docs.oracle.com/\-cd/E17952\_01/\-mysql-5.6-en/\-ha-memcached-using-threads.html}},
     621  note = "[Online; accessed 5-August-2022]"
    558622}
    559623
     
    650714}
    651715
     716@misc{GITHUB:SchedulingBenchmarks,
     717  title = {Scheduling Benchmarks},
     718  author = {Thierry Delisle},
     719  howpublished = {\href{https://github.com/cforall/SchedulingBenchmarks_PhD22}{https://\-github.com/\-cforall/\-SchedulingBenchmarks\_\-PhD22}},
     720}
     721
    652722% --------------------------------------------------
    653723% Tech documents
     
    758828}
    759829
     830@manual{MAN:eventfd,
     831  key        = "eventfd",
     832  title      = "eventfd(2) Linux User's Manual",
     833  year       = "2019",
      834  month      = "March",
     835}
     836
    760837@manual{MAN:aio,
    761838  key        = "aio",
     
    763840  year       = "2019",
    764841  month      = "March",
     842}
     843
     844@manual{MAN:bash,
     845  title   = {Bash Reference Manual},
     846  author  = {Chet Ramey and Brian Fox},
     847  year    = "2020",
     848  month   = "December",
      849  version = {5.1},
     850  howpublished = {\href{https://www.gnu.org/software/bash/manual/bash.pdf}{https://\-www.gnu.org/\-software/\-bash/\-manual/\-bash.pdf}}
    765851}
    766852
     
    774860}
    775861
     862
    776863% --------------------------------------------------
    777864% Wikipedia Entries
     
    870957  howpublished = "\href{https://en.wikipedia.org/wiki/Zipf%27s_law}{https://\-en.wikipedia.org/\-wiki/\-Zipf\%27s\-\_law}",
    871958  note = "[Online; accessed 5-August-2022]"
     959}
     960
     961@misc{wiki:htm,
     962  author = "{Wikipedia contributors}",
     963  title = "Transactional memory --- {W}ikipedia{,} The Free Encyclopedia",
     964  year = "2022",
      965  howpublished = "\href{https://en.wikipedia.org/wiki/Transactional_memory}{https://\-en.wikipedia.org/\-wiki/\-Transactional\_memory}",
     966  note = "[Online; accessed 7-September-2022]"
    872967}
    873968
     
    9911086  note = "[Online; accessed 5-August-2022]"
    9921087}
     1088
     1089@article{reese2008nginx,
     1090    title       = {NGINX: the high-performance web server and reverse proxy},
     1091    author      = {Reese, Will},
     1092    journal     = {Linux Journal},
     1093    volume      = {2008},
     1094    number      = {173},
     1095    pages       = {2},
     1096    year        = {2008},
     1097    publisher   = {Belltown Media}
     1098}
     1099
     1100@phdthesis{Harji10,
     1101    author      = {Ashif Harji},
     1102    title       = {Performance Comparison of Uniprocessor and Multiprocessor Web Server Architectures},
     1103    school      = {University of Waterloo},
     1104    year        = 2010,
     1105    month       = feb,
     1106    address     = {Waterloo, Ontario, Canada, N2L 3G1},
     1107    note        = {\textsf{http://uwspace.uwaterloo.ca/\-bitstream/\-10012/\-5040/\-1/\-Harji\_thesis.pdf}},
     1108}
  • doc/theses/thierry_delisle_PhD/thesis/text/conclusion.tex

    rebf8ca5 r23a08aa0  
    11\chapter{Conclusion}\label{conclusion}
    22
    3 \Gls{uthrding} is popular.
    4 It makes sense for \CFA to use it.
     3Building the \CFA runtime has been a challenging project.
      4The work was divided between high-level concurrency design and a user-level threading runtime (Master's thesis), and low-level support of the user-level runtime using OS kernel threading and its (multiple) I/O subsystems (Ph.D. thesis).
     5Because I am the main developer for both components of this project, there is strong continuity across the design and implementation.
     6This continuity provides a consistent approach to advanced control flow and concurrency, with easier development, management and maintenance of the runtime in the future.
    57
    6 \todo{Obivously fix the above}
      8I believed my Master's work would provide the background to make the Ph.D. work reasonably straightforward.
     9However, I discovered two significant challenges.
    710
    8 An important aspect of this approach to threading is how threads are scheduled.
    9 As \CFA aims to increase productivity and safety of C while maintaining its performance, so to should the threading runtime achieve these goals.
    10 For scheduling, productivity and safety manifest in removing pitfalls in the efficient usage of the threading runtime.
    11 This thesis contributes to this goal by presenting a low-latency scheduler that offers improved starvation prevention compared to other state-of-the-art schedulers.
    12 It presents a core algorithm (Chapter~\ref{core}) that provides increased fairness through helping (Section~\ref{heling}) as well as optimizations which virtually remove the cost of this fairness (Section~\ref{relaxedtimes}).
    13 Building upon the fundamental scheduling algorithm, an implementation of user-level \io blocking is presented (Chapter~\ref{io}) which achieves the same performance and fairness balance as the scheduler itself.
    14 From these core algorithms, and a low-latency idle-sleep mechanism is presented (Chapter~\ref{practice}) which allows the \CFA runtime to stay viable for workloads that do not consistently saturate the system.
     11First, modern symmetric multiprocessing CPUs have significant performance penalties for communication, often cache-related.
     12An SQMS scheduler (see Section~\ref{sched}), with its \proc-shared ready-queue, has perfect load-balancing but poor affinity resulting in high communication across \procs.
     13An MQMS scheduler, with its \proc-specific ready-queues, has poor load-balancing but perfect affinity often resulting in significantly reduced communication.
     14However, implementing fairness for an MQMS scheduler is difficult, since fairness requires \procs to be aware of each other's ready-queue progress, \ie communicated knowledge.
     15For balanced workloads with little or no data sharing, \ie embarrassingly parallel, an MQMS scheduler is near optimal, \eg a state-of-the-art work-stealing scheduler.
      16For these kinds of fair workloads, adding fairness must be low-cost, to hide the communication costs needed for global ready-queue progress, or performance suffers.
     17While I was aware of these realities, I underestimated how little performance margin there is for communication.
     18Several of my attempts at building a fair scheduler compared poorly to work-stealing schedulers because of the thin communication margin.
     19
     20Second, the kernel locking, threading, and I/O in the Linux operating system offer very little flexibility and are not designed to facilitate user-level threading.
     21There are multiple concurrency aspects in Linux that require carefully following a strict procedure to achieve acceptable performance.
     22To be fair, many of these concurrency aspects were designed 30-40 years ago, when there were few multiprocessor computers and concurrency knowledge was just developing.
     23Unfortunately, little has changed in the intervening years.
     24
     25Also, my decision to use @io_uring@ was both positive and negative.
     26The positive is that @io_uring@ supports the panoply of I/O mechanisms in Linux;
     27hence, the \CFA runtime uses one I/O mechanism to provide non-blocking I/O, rather than using @select@ to handle TTY I/O, @epoll@ to handle network I/O, and managing a thread pool to handle disk I/O.
     28Merging all these different \io mechanisms into a coherent scheduling implementation would require much more work than what is present in this thesis, as well as detailed knowledge of multiple I/O mechanisms.
     29The negative is that @io_uring@ is new and developing.
     30As a result, there is limited documentation, few places to find usage examples, and multiple errors that required workarounds.
     31
     32Given what I now know about @io_uring@, I would say it is insufficiently coupled with the Linux kernel to properly handle non-blocking I/O.
      33It does not seem to reach deep into the kernel's handling of \io, and as such it must contend with the same realities that users of @epoll@ face.
     34Specifically, in cases where @O_NONBLOCK@ behaves as desired, operations must still be retried.
     35Preserving the illusion of asynchronicity requires delegating these operations to kernel threads.
     36This requirement is also true of cases where @O_NONBLOCK@ does not prevent blocking.
     37Spinning up internal kernel threads to handle blocking scenarios is what developers already do outside of the kernel, and managing these threads adds a significant burden to the system.
     38Nonblocking I/O should not be handled in this way.
     39
     40\section{Goals}
     41This work focuses on efficient and fair scheduling of the multiple CPUs, which are ubiquitous on all modern computers.
     42The levels of indirection to the CPUs are:
     43\begin{itemize}
     44\item
     45The \CFA presentation of concurrency through multiple high-level language constructs.
     46\item
     47The OS presentation of concurrency through multiple kernel threads within an application.
     48\item
     49The OS and library presentation of disk and network I/O, and many secondary library routines that directly and indirectly use these mechanisms.
     50\end{itemize}
      51The key aspect of all of these mechanisms is that control flow can block, which immediately hinders any level above from making scheduling decisions.
     52Fundamentally, scheduling needs to understand all the mechanisms used by threads that affect their state changes.
     53
     54The underlying goal of this thesis is scheduling the complex hardware components that make up a computer to provide good utilization and fairness.
     55However, direct hardware scheduling is only possible in the OS.
     56Instead, this thesis is performing arms-length application scheduling of the hardware components through a set of OS interfaces that indirectly manipulate the hardware components.
     57This can quickly lead to tensions when the OS interface has different use cases in mind.
     58
     59As \CFA aims to increase productivity and safety of C, while maintaining its performance, this places a huge burden on the \CFA runtime to achieve these goals.
     60Productivity and safety manifest in removing scheduling pitfalls in the efficient usage of the threading runtime.
     61Performance manifests in making efficient use of the underlying kernel threads that provide indirect access to the CPUs.
     62
     63This thesis achieves its stated contributions by presenting:
     64\begin{enumerate}[leftmargin=*]
     65\item
     66A scalable low-latency scheduler that offers improved starvation prevention (progress guarantee) compared to other state-of-the-art schedulers, including NUMA awareness.
     67\item
     68The scheduler demonstrates a core algorithm that provides increased fairness through helping, as well as optimizations which virtually remove the cost of this fairness.
     69\item
     70An implementation of user-level \io blocking is incorporated into the scheduler, which achieves the same performance and fairness balance as the scheduler itself.
     71\item
     72These core algorithms are further extended with a low-latency idle-sleep mechanism, which allows the \CFA runtime to stay viable for workloads that do not consistently saturate the system.
     73\end{enumerate}
     74Finally, the complete scheduler is fairly simple with low-cost execution, meaning the total cost of scheduling during thread state changes is low.
    1575
    1676\section{Future Work}
    17 While the \CFA runtime achieves a better compromise in term of performance and fairness than other schedulers, I do believe that further improvements could be made to reduce even further the number of cases where performance deteriorates.
    18 Furthermore, I believe that achieve performance and starvation freedom simultaneously is generally a challenge even outside of scheduling algorithms.
     77While the \CFA runtime achieves a better compromise than other schedulers, in terms of performance and fairness, I believe further improvements can be made to reduce or eliminate the few cases where performance does deteriorate.
     78Fundamentally, achieving performance and starvation freedom will always be goals with opposing needs even outside of scheduling algorithms.
    1979
    2080\subsection{Idle Sleep}
    21 A difficult challenge that was not fully address in this thesis is idle-sleep.
    22 While a correct and somewhat low-cost idle-sleep mechanism was presented, several of the benchmarks show notable performance degradation when too few \ats are present in the system.
     81A difficult challenge, not fully addressed in this thesis, is idle sleep.
     82While a correct and somewhat low-cost idle-sleep mechanism is presented, several of the benchmarks show notable performance degradation when too few \ats are present in the system.
    2383The idle sleep mechanism could therefore benefit from a reduction of spurious cases of sleeping.
    2484Furthermore, this thesis did not present any heuristic for when \procs should be put to sleep and when \procs should be woken up.
    25 It is especially worth noting that relaxed timestamps and topology aware helping lead to notable improvements in performance.
    26 Neither of these techniques were used for the idle sleep mechanism.
      85While relaxed timestamps and topology awareness made notable performance improvements, neither of these techniques is used for the idle-sleep mechanism.
    2786
    28 There are opportunities where these techniques could be use:
    29 The mechanism uses a hand-shake between notification and sleep to ensure that no \at is missed.
    30 The correctness of that hand-shake is cirtical when the last \proc goes to sleep but could be relaxed when several \procs are awake.
    31 Furthermore, organizing the sleeping \procs as a LIDO stack makes sense to keep cold \procs as cold as possible, but it might be more appropriate to attempt to keep cold CPU sockets instead.
    32 
    33 However, using these techniques could require significant investigation.
    34 For example, keeping a CPU socket cold might be appropriate for power consumption reasons but can affect overall memory bandwith.
    35 The balance between these is not necessarily obvious.
     87Here are opportunities where these techniques could be used:
     88\begin{itemize}
     89\item
     90The mechanism uses a handshake between notification and sleep to ensure that no \at is missed.
     91\item
     92The handshake correctness is critical when the last \proc goes to sleep but could be relaxed when several \procs are awake.
     93\item
     94Furthermore, organizing the sleeping \procs as a LIFO stack makes sense to keep cold \procs as cold as possible, but it might be more appropriate to attempt to keep cold CPU sockets instead.
     95\end{itemize}
     96However, using these techniques would require significant investigation.
     97For example, keeping a CPU socket cold might be appropriate for power consumption reasons but can affect overall memory bandwidth.
     98The balance between these approaches is not obvious.
     99I am aware there is a host of low-power research that could be tapped here.
    36100
    37101\subsection{Hardware}
    38 One challenge that needed to be overcome for this thesis was that the modern x86-64 has very few tools to implement fairness.
    39 \Glspl{proc} attempting to help eachother inherently cause cache-coherence traffic.
      102One challenge that needed to be overcome for this thesis is that modern x86-64 processors have very few tools to implement fairness.
     103\Glspl{proc} attempting to help each other inherently cause cache-coherence traffic.
    40104However, as mentioned in Section~\ref{helping}, relaxed requirements mean this traffic is not necessarily productive.
    41105In cases like this one, there is an opportunity to improve performance by extending the hardware.
    42106
    43 Many different extensions would be suitable here.
    44 For example, when attempting to read remote timestamps when deciding to whether or not to help, it could be useful to allow cancelling the remote read if it will lead to significant latency.
    45 If the latency is due to a recent cache invalidation, it is unlikely that the timestamp is old and that helping will be needed.
     107Many different extensions are suitable here.
     108For example, when attempting to read remote timestamps for helping, it would be useful to allow cancelling the remote read if it leads to significant latency.
     109If the latency is due to a recent cache invalidation, it is unlikely the timestamp is old and that helping is needed.
    46110As such, simply moving on without the result is likely to be acceptable.
    47 Another option would be to attempt to read multiple memory addresses and only wait for \emph{one of} these reads to retire.
    48 This would have a similar effect, where cache-lines with more traffic would be waited on less often.
    49 In both of these examples, some care would probably be needed to make sure that the reads to an address \emph{sometimes} retire.
     111Another option is to read multiple memory addresses and only wait for \emph{one of} these reads to retire.
     112This approach has a similar effect, where cache lines with more traffic are waited on less often.
     113In both of these examples, some care is needed to ensure that reads to an address \emph{sometimes} retire.
    50114
    51 Note that this is similar to the feature \newterm{Hardware Transactional Memory}~\cite{HTM}, which allows groups of instructions to be aborted and rolled-back if they encounter memory conflicts when being retired.
     115Note that this idea is similar to \newterm{Hardware Transactional Memory}~\cite{wiki:htm}, which allows groups of instructions to be aborted and rolled back if they encounter memory conflicts when being retired.
    52116However, I believe this feature is generally aimed at large groups of instructions.
    53 A more fine-grained approach may be more amenable to carefully picking which aspects of an algorithm require exact correctness and which do not.
      117A more fine-grained approach may be more amenable to carefully picking which aspects of an algorithm require exact correctness and which do not.
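
The @O_NONBLOCK@ retry behaviour described in the conclusion above can be shown with a plain-C sketch (illustrative only): even with non-blocking I/O, an operation can report "not ready" and must be retried later, which is exactly the work a runtime must either schedule around or delegate to a worker thread.

	#include <errno.h>
	#include <unistd.h>

	// Attempt a non-blocking read. A -1 return with errno == EAGAIN means
	// "not ready yet": the caller must park the current thread and retry
	// later to preserve the illusion of asynchronicity.
	ssize_t try_read(int fd, void *buf, size_t len) {
		for (;;) {
			ssize_t r = read(fd, buf, len);
			if (r >= 0) return r;          // completed immediately
			if (errno == EINTR) continue;  // interrupted: retry now
			return -1;                     // EAGAIN or a real error (see errno)
		}
	}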
  • doc/theses/thierry_delisle_PhD/thesis/text/core.tex

    rebf8ca5 r23a08aa0  
    22
    33Before discussing scheduling in general, where it is important to address systems that are changing states, this document discusses scheduling in a somewhat ideal scenario, where the system has reached a steady state.
    4 For this purpose, a steady state is loosely defined as a state where there are always \glspl{thrd} ready to run and the system has the resources necessary to accomplish the work, \eg, enough workers.
     4For this purpose, a steady state is loosely defined as a state where there are always \ats ready to run and the system has the resources necessary to accomplish the work, \eg, enough workers.
    55In short, the system is neither overloaded nor underloaded.
    66
    77It is important to discuss the steady state first because it is the easiest case to handle and, relatedly, the case in which the best performance is to be expected.
    8 As such, when the system is either overloaded or underloaded, a common approach is to try to adapt the system to this new load and return to the steady state, \eg, by adding or removing workers.
     8As such, when the system is either overloaded or underloaded, a common approach is to try to adapt the system to this new \gls{load} and return to the steady state, \eg, by adding or removing workers.
    99Therefore, flaws in scheduling the steady state tend to be pervasive in all states.
    1010
    1111\section{Design Goals}
    12 As with most of the design decisions behind \CFA, an important goal is to match the expectation of the programmer according to their execution mental-model.
    13 To match expectations, the design must offer the programmer sufficient guarantees so that, as long as they respect the execution mental-model, the system also respects this model.
    14 
    15 For threading, a simple and common execution mental-model is the ``Ideal multi-tasking CPU'' :
     12As with most of the design decisions behind \CFA, an important goal is to match the expectation of the programmer according to their execution mental model.
     13To match expectations, the design must offer the programmer sufficient guarantees so that, as long as they respect the execution mental model, the system also respects this model.
     14
     15For threading, a simple and common execution mental model is the ``ideal multitasking CPU'':
    1616
    1717\begin{displayquote}[Linux CFS\cite{MAN:linux/cfs}]
    18         {[The]} ``Ideal multi-tasking CPU'' is a (non-existent  :-)) CPU that has 100\% physical power and which can run each task at precise equal speed, in parallel, each at [an equal fraction of the] speed.  For example: if there are 2 tasks running, then it runs each at 50\% physical power --- i.e., actually in parallel.
     18        {[The]} ``ideal multi-tasking CPU'' is a (non-existent  :-)) CPU that has 100\% physical power and which can run each task at precise equal speed, in parallel, each at [an equal fraction of the] speed.  For example: if there are 2 running tasks, then it runs each at 50\% physical power --- i.e., actually in parallel.
    1919        \label{q:LinuxCFS}
    2020\end{displayquote}
    2121
    22 Applied to threads, this model states that every ready \gls{thrd} immediately runs in parallel with all other ready \glspl{thrd}. While a strict implementation of this model is not feasible, programmers still have expectations about scheduling that come from this model.
    23 
    24 In general, the expectation at the center of this model is that ready \glspl{thrd} do not interfere with each other but simply share the hardware.
    25 This assumption makes it easier to reason about threading because ready \glspl{thrd} can be thought of in isolation and the effect of the scheduler can be virtually ignored.
    26 This expectation of \gls{thrd} independence means the scheduler is expected to offer two guarantees:
     22Applied to \ats, this model states that every ready \at immediately runs in parallel with all other ready \ats. While a strict implementation of this model is not feasible, programmers still have expectations about scheduling that come from this model.
     23
     24In general, the expectation at the centre of this model is that ready \ats do not interfere with each other but simply share the hardware.
     25This assumption makes it easier to reason about threading because ready \ats can be thought of in isolation and the effect of the scheduler can be virtually ignored.
     26This expectation of \at independence means the scheduler is expected to offer two guarantees:
    2727\begin{enumerate}
    28         \item A fairness guarantee: a \gls{thrd} that is ready to run is not prevented by another thread.
    29         \item A performance guarantee: a \gls{thrd} that wants to start or stop running is not prevented by other threads wanting to do the same.
     28        \item A fairness guarantee: a \at that is ready to run is not prevented by another thread.
     29        \item A performance guarantee: a \at that wants to start or stop running is not prevented by other threads wanting to do the same.
    3030\end{enumerate}
    3131
    3232It is important to note that these guarantees are expected only up to a point.
    33 \Glspl{thrd} that are ready to run should not be prevented to do so, but they still share the limited hardware resources.
    34 Therefore, the guarantee is considered respected if a \gls{thrd} gets access to a \emph{fair share} of the hardware resources, even if that share is very small.
     33\Glspl{at} that are ready to run should not be prevented from doing so, but they still share the limited hardware resources.
     34Therefore, the guarantee is considered respected if a \at gets access to a \emph{fair share} of the hardware resources, even if that share is very small.
    3535
    3636Similar to the performance guarantee, the lack of interference among threads is only relevant up to a point.
     
    4040This demonstration can be made by comparing applications built in \CFA to applications built with other languages or other models.
    4141Recall programmer expectation is that the impact of the scheduler can be ignored.
    42 Therefore, if the cost of scheduling is competitive to other popular languages, the guarantee is consider achieved.
     42Therefore, if the cost of scheduling is competitive with other popular languages, the guarantee is considered achieved.
    4343More precisely the scheduler should be:
    4444\begin{itemize}
     
    5353In any running system, a \proc can stop dequeuing \ats if it starts running a \at that never blocks.
    5454Without preemption, traditional work-stealing schedulers do not have starvation freedom in this case.
    55 Now this requirement begs the question, what about preemption?
    56 Generally speaking preemption happens on the timescale of several milliseconds, which brings us to the next requirement: ``fast'' load balancing.
     55Now, this requirement begs the question, what about preemption?
     56Generally speaking, preemption happens on the timescale of several milliseconds, which brings us to the next requirement: ``fast'' load balancing.
    5757
    5858\paragraph{Fast load balancing} means that load balancing should happen faster than preemption would normally allow.
    59 For interactive applications that need to run at 60, 90, 120 frames per second, \ats having to wait for several milliseconds to run are effectively starved.
     59For interactive applications that need to run at 60, 90 or 120 frames per second, \ats having to wait for several milliseconds to run are effectively starved.
    6060Therefore load-balancing should be done at a faster pace, one that can detect starvation at the microsecond scale.
    61 With that said, this is a much fuzzier requirement since it depends on the number of \procs, the number of \ats and the general load of the system.
     61With that said, this is a much fuzzier requirement since it depends on the number of \procs, the number of \ats and the general \gls{load} of the system.
    6262
    6363\subsection{Fairness vs Scheduler Locality} \label{fairnessvlocal}
     
    6868
    6969For a scheduler, having good locality, \ie, having the data local to each \gls{hthrd}, generally conflicts with fairness.
    70 Indeed, good locality often requires avoiding the movement of cache lines, while fairness requires dynamically moving a \gls{thrd}, and as consequence cache lines, to a \gls{hthrd} that is currently available.
    71 Note that this section discusses \emph{internal locality}, \ie, the locality of the data used by the scheduler versus \emph{external locality}, \ie, how the data used by the application is affected by scheduling.
     70Indeed, good locality often requires avoiding the movement of cache lines, while fairness requires dynamically moving a \at, and as consequence cache lines, to a \gls{hthrd} that is currently available.
     71Note that this section discusses \emph{internal locality}, \ie, the locality of the data used by the scheduler versus \emph{external locality}, \ie, how scheduling affects the locality of the application's data.
    7272External locality is a much more complicated subject and is discussed in the next section.
    7373
    7474However, I claim that in practice it is possible to strike a balance between fairness and performance because these goals do not necessarily overlap temporally.
    7575Figure~\ref{fig:fair} shows a visual representation of this behaviour.
    76 As mentioned, some unfairness is acceptable; therefore it is desirable to have an algorithm that prioritizes cache locality as long as thread delay does not exceed the execution mental-model.
     76As mentioned, some unfairness is acceptable; therefore it is desirable to have an algorithm that prioritizes cache locality as long as thread delay does not exceed the execution mental model.
    7777
    7878\begin{figure}
     
    8080        \input{fairness.pstex_t}
    8181        \vspace*{-10pt}
    82         \caption[Fairness vs Locality graph]{Rule of thumb Fairness vs Locality graph \smallskip\newline The importance of Fairness and Locality while a ready \gls{thrd} awaits running is shown as the time the ready \gls{thrd} waits increases, Ready Time, the chances that its data is still in cache decreases, Locality.
    83         At the same time, the need for fairness increases since other \glspl{thrd} may have the chance to run many times, breaking the fairness model.
     82        \caption[Fairness vs Locality graph]{Rule of thumb Fairness vs Locality graph \smallskip\newline The importance of Fairness and Locality while a ready \at awaits running is shown: as the time the ready \at waits increases (Ready Time), the chances that its data is still in cache decrease (Locality).
     83        At the same time, the need for fairness increases since other \ats may have the chance to run many times, breaking the fairness model.
    8484        Since the actual values and curves of this graph can be highly variable, the graph is an idealized representation of the two opposing goals.}
    8585        \label{fig:fair}
     
    9292\subsubsection{Scalability}
    9393The most basic performance challenge of a scheduler is scalability.
    94 Given a large number of \procs and an even larger number of \ats, scalability measures how fast \procs can enqueue and dequeues \ats.
    95 One could expect that doubling the number of \procs would double the rate at which \ats are dequeued, but contention on the internal data structure of the scheduler can lead to worst improvements.
    96 While the ready-queue itself can be sharded to alleviate the main source of contention, auxiliary scheduling features, \eg counting ready \ats, can also be sources of contention.
     94Given a large number of \procs and an even larger number of \ats, scalability measures how fast \procs can enqueue and dequeue \ats.
     95One could expect that doubling the number of \procs would double the rate at which \ats are dequeued, but contention on the internal data structure of the scheduler can diminish the improvements.
     96While the ready queue itself can be sharded to alleviate the main source of contention, auxiliary scheduling features, \eg counting ready \ats, can also be sources of contention.
    9797
    9898\subsubsection{Migration Cost}
    99 Another important source of scheduling latency is migration.
     99Another important source of scheduling latency is \glslink{atmig}{migration}.
    100100A \at migrates if it executes on two different \procs consecutively, which is the process discussed in \ref{fairnessvlocal}.
     101101Migrations can have many different causes, but in certain programs, limiting migrations is impossible.
     
    108108The problem is a single point of contention when adding/removing \ats.
    109109As shown in the evaluation sections, most production schedulers do scale when adding \glspl{hthrd}.
    110 The solution to this problem is to shard the ready-queue: create multiple \emph{subqueues} forming the logical ready-queue and the subqueues are accessed by multiple \glspl{hthrd} without interfering.
      110The solution to this problem is to shard the ready queue: create multiple \emph{sub-queues} that form the logical ready queue and are accessed by multiple \glspl{hthrd} without interfering.
    111111
     112112Before going into the design of \CFA's scheduler, it is relevant to discuss two sharding solutions that served as inspiration for the scheduler in this thesis.
     
    114114\subsection{Work-Stealing}
    115115
    116 As mentioned in \ref{existing:workstealing}, a popular sharding approach for the ready-queue is work-stealing.
    117 In this approach, each \gls{proc} has its own local subqueue and \glspl{proc} only access each other's subqueue if they run out of work on their local ready-queue.
     116As mentioned in \ref{existing:workstealing}, a popular sharding approach for the ready queue is work-stealing.
      117In this approach, each \gls{proc} has its own local sub-queue and \glspl{proc} only access each other's sub-queue if they run out of work on their local ready queue.
    118118The interesting aspect of work stealing happens in the steady-state scheduling case, \ie all \glspl{proc} have work and no load balancing is needed.
    119119In this case, work stealing is close to optimal scheduling: it can achieve perfect locality and have no contention.
    120120On the other hand, work-stealing schedulers only attempt to do load-balancing when a \gls{proc} runs out of work.
    121121This means that the scheduler never balances unfair loads unless they result in a \gls{proc} running out of work.
    122 Chapter~\ref{microbench} shows that pathological cases work stealing can lead to indefinite starvation.
    123 
    124 Based on these observation, the conclusion is that a \emph{perfect} scheduler should behave similar to work-stealing in the steady-state case, but load balance proactively when the need arises.
     122Chapter~\ref{microbench} shows that, in pathological cases, work stealing can lead to indefinite starvation.
     123
     124Based on these observations, the conclusion is that a \emph{perfect} scheduler should behave similarly to work-stealing in the steady-state case, but load balance proactively when the need arises.
    125125
    126126\subsection{Relaxed-FIFO}
    127 A different scheduling approach is to create a ``relaxed-FIFO'' queue, as in \todo{cite Trevor's paper}.
    128 This approach forgoes any ownership between \gls{proc} and subqueue, and simply creates a pool of ready-queues from which \glspl{proc} pick.
     127A different scheduling approach is to create a ``relaxed-FIFO'' queue, as in \cite{alistarh2018relaxed}.
     128This approach forgoes any ownership between \gls{proc} and sub-queue, and simply creates a pool of sub-queues from which \glspl{proc} pick.
     129129Scheduling is performed as follows (a code sketch follows the list):
    130130\begin{itemize}
    131131\item
    132 All subqueues are protected by TryLocks.
    133 \item
    134 Timestamps are added to each element of a subqueue.
    135 \item
    136 A \gls{proc} randomly tests ready queues until it has acquired one or two queues.
    137 \item
    138 If two queues are acquired, the older of the two \ats at the front the acquired queues is dequeued.
    139 \item
    140 Otherwise the \ats from the single queue is dequeued.
     132All sub-queues are protected by TryLocks.
     133\item
     134Timestamps are added to each element of a sub-queue.
     135\item
     136A \gls{proc} randomly tests sub-queues until it has acquired one or two queues.
     137\item
     138If two queues are acquired, the older of the two \ats is dequeued from the front of the acquired queues.
     139\item
     140Otherwise, the \at from the single queue is dequeued.
    141141\end{itemize}
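For concreteness, the dequeue path of these steps might be sketched in C as follows; @subqueue_t@, @NQUEUES@, and @pop_head@ are illustrative stand-ins, not the actual runtime declarations:
\begin{lstlisting}[language=C]
#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>

#define NQUEUES 64                       // illustrative sharding width
struct thread_desc;                      // opaque ready thread

typedef struct {
	pthread_mutex_t lock;                // only ever acquired with trylock
	struct thread_desc * head;           // FIFO of ready threads
	uint64_t head_ts;                    // timestamp of the thread at the head
} subqueue_t;

subqueue_t queues[NQUEUES];
extern struct thread_desc * pop_head( subqueue_t * );  // assumed helper

struct thread_desc * relaxed_fifo_dequeue( void ) {
	for ( ;; ) {                         // unbounded retries if picks are empty
		subqueue_t * a = &queues[ rand() % NQUEUES ];
		subqueue_t * b = &queues[ rand() % NQUEUES ];
		if ( pthread_mutex_trylock( &a->lock ) ) continue;  // contended: repick
		if ( b == a || pthread_mutex_trylock( &b->lock ) ) b = NULL;
		// dequeue from the sub-queue whose head thread is oldest
		subqueue_t * victim = a;
		if ( b && b->head && ( ! a->head || b->head_ts < a->head_ts ) ) victim = b;
		struct thread_desc * t = victim->head ? pop_head( victim ) : NULL;
		if ( b ) pthread_mutex_unlock( &b->lock );
		pthread_mutex_unlock( &a->lock );
		if ( t ) return t;
	}
}
\end{lstlisting}
Note the retry loop: when both picks land on empty sub-queues, the \gls{proc} simply tries again, which is the weakness revisited in the Relaxed-FIFO++ section below.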
    142142The result is a queue that has both good scalability and sufficient fairness.
    143143The lack of ownership ensures that as long as one \gls{proc} is still able to repeatedly dequeue elements, it is unlikely any element will delay longer than any other element.
    144 This guarantee contrasts with work-stealing, where a \gls{proc} with a long subqueue results in unfairness for its \ats in comparison to a \gls{proc} with a short subqueue.
     144This guarantee contrasts with work-stealing, where a \gls{proc} with a long sub-queue results in unfairness for its \ats in comparison to a \gls{proc} with a short sub-queue.
    145145This unfairness persists until a \gls{proc} runs out of work and steals.
    146146
    147 An important aspects of this scheme's fairness approach is that the timestamps make it possible to evaluate how long elements have been on the queue.
     147An important aspect of this scheme's fairness approach is that the timestamps make it possible to evaluate how long elements have been in the queue.
    148148However, \glspl{proc} eagerly search for these older elements instead of focusing on specific queues, which negatively affects locality.
    149149
     
    152152
    153153\section{Relaxed-FIFO++}
    154 The inherent fairness and good performance with many \ats, makes the relaxed-FIFO queue a good candidate to form the basis of a new scheduler.
     154The inherent fairness and good performance with many \ats make the relaxed-FIFO queue a good candidate to form the basis of a new scheduler.
    155155The problem case is workloads where the number of \ats is barely greater than the number of \procs.
    156 In these situations, the wide sharding of the ready queue means most of its subqueues are empty.
    157 Furthermore, the non-empty subqueues are unlikely to hold more than one item.
    158 The consequence is that a random dequeue operation is likely to pick an empty subqueue, resulting in an unbounded number of selections.
    159 This state is generally unstable: each subqueue is likely to frequently toggle between being empty and nonempty.
    160 Indeed, when the number of \ats is \emph{equal} to the number of \procs, every pop operation is expected to empty a subqueue and every push is expected to add to an empty subqueue.
    161 In the worst case, a check of the subqueues sees all are empty or full.
     156In these situations, the wide sharding of the ready queue means most of its sub-queues are empty.
     157Furthermore, the non-empty sub-queues are unlikely to hold more than one item.
     158The consequence is that a random dequeue operation is likely to pick an empty sub-queue, resulting in an unbounded number of selections.
     159This state is generally unstable: each sub-queue is likely to frequently toggle between being empty and nonempty.
     160Indeed, when the number of \ats is \emph{equal} to the number of \procs, every pop operation is expected to empty a sub-queue and every push is expected to add to an empty sub-queue.
     161In the worst case, a check of the sub-queues sees all are empty or full.
    162162
    163163As this is the most obvious challenge, it is worth addressing first.
    164 The obvious solution is to supplement each sharded subqueue with data that indicates if the queue is empty/nonempty to simplify finding nonempty queues, \ie ready \glspl{at}.
    165 This sharded data can be organized in different forms, \eg a bitmask or a binary tree that tracks the nonempty subqueues.
     164The obvious solution is to supplement each sharded sub-queue with data that indicates if the queue is empty/nonempty to simplify finding nonempty queues, \ie ready \glspl{at}.
     165This sharded data can be organized in different forms, \eg a bitmask or a binary tree that tracks the nonempty sub-queues.
     166166Specifically, many modern architectures have powerful bitmask-manipulation instructions, and searching a binary tree has good Big-O complexity.
    167 However, precisely tracking nonempty subqueues is problematic.
    168 The reason is that the subqueues are initially sharded with a width presumably chosen to avoid contention.
    169 However, tracking which ready queue is nonempty is only useful if the tracking data is dense, \ie denser than the sharded subqueues.
    170 Otherwise, it does not provide useful information because reading this new data structure risks being as costly as simply picking a subqueue at random.
    171 But if the tracking mechanism \emph{is} denser than the shared subqueues, than constant updates invariably create a new source of contention.
     167However, precisely tracking nonempty sub-queues is problematic.
     168The reason is that the sub-queues are initially sharded with a width presumably chosen to avoid contention.
     169However, tracking which ready queue is nonempty is only useful if the tracking data is dense, \ie denser than the sharded sub-queues.
     170Otherwise, it does not provide useful information because reading this new data structure risks being as costly as simply picking a sub-queue at random.
     171But if the tracking mechanism \emph{is} denser than the shared sub-queues, then constant updates invariably create a new source of contention.
    172172Early experiments with this approach showed that randomly picking, even with low success rates, is often faster than bit manipulations or tree walks.
    173173
    174174The exception to this rule is using local tracking.
    175 If each \proc locally keeps track of empty subqueues, than this can be done with a very dense data structure without introducing a new source of contention.
     175If each \proc locally keeps track of empty sub-queues, then this can be done with a very dense data structure without introducing a new source of contention.
    176176However, the consequence of local tracking is that the information is incomplete.
    177 Each \proc is only aware of the last state it saw about each subqueue so this information quickly becomes stale.
     177Each \proc is only aware of the last state it saw about each sub-queue so this information quickly becomes stale.
    178178Even on systems with low \gls{hthrd} count, \eg 4 or 8, this approach can quickly lead to the local information being no better than the random pick.
    179179This result is due in part to the cost of maintaining information and its poor quality.
    180180
    181 However, using a very low cost but inaccurate approach for local tracking can actually be beneficial.
    182 If the local tracking is no more costly than a random pick, than \emph{any} improvement to the success rate, however low it is, leads to a performance benefits.
    183 This suggests to the following approach:
     181However, using a very low-cost but inaccurate approach for local tracking can still be beneficial.
     182If the local tracking is no more costly than a random pick, then \emph{any} improvement to the success rate, however low it is, leads to a performance benefit.
     183This suggests the following approach:
    184184
    185185\subsection{Dynamic Entropy}\cite{xkcd:dynamicentropy}
    186 The Relaxed-FIFO approach can be made to handle the case of mostly empty subqueues by tweaking the \glsxtrlong{prng}.
    187 The \glsxtrshort{prng} state can be seen as containing a list of all the future subqueues that will be accessed.
    188 While this concept is not particularly useful on its own, the consequence is that if the \glsxtrshort{prng} algorithm can be run \emph{backwards}, then the state also contains a list of all the subqueues that were accessed.
     186The Relaxed-FIFO approach can be made to handle the case of mostly empty sub-queues by tweaking the \glsxtrlong{prng}.
     187The \glsxtrshort{prng} state can be seen as containing a list of all the future sub-queues that will be accessed.
     188While this concept is not particularly useful on its own, the consequence is that if the \glsxtrshort{prng} algorithm can be run \emph{backwards}, then the state also contains a list of all the sub-queues that were accessed.
    189189Luckily, bidirectional \glsxtrshort{prng} algorithms do exist, \eg some Linear Congruential Generators\cite{wiki:lcg} support running the algorithm backwards while offering good quality and performance.
     190190This particular \glsxtrshort{prng} can be used as follows (a code sketch follows the list):
    191191\begin{itemize}
    192192\item
    193 Each \proc maintains two \glsxtrshort{prng} states, refereed to as $F$ and $B$.
    194 \item
    195 When a \proc attempts to dequeue a \at, it picks a subqueue by running $B$ backwards.
    196 \item
    197 When a \proc attempts to enqueue a \at, it runs $F$ forward picking a subqueue to enqueue to.
    198 If the enqueue is successful, the state $B$ is overwritten with the content of $F$.
     193Each \proc maintains two \glsxtrshort{prng} states, referred to as $F$ and $B$.
     194\item
     195When a \proc attempts to dequeue a \at, it picks a sub-queue by running $B$ backwards.
     196\item
     197When a \proc attempts to enqueue a \at, it runs $F$ forward picking a sub-queue to enqueue to.
     198If the enqueue is successful, state $B$ is overwritten with the content of $F$.
    199199\end{itemize}
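A minimal sketch of such a bidirectional generator, assuming 64-bit state and Knuth's well-known MMIX LCG constants (the generator actually used may differ):
\begin{lstlisting}[language=C]
#include <stdint.h>

// Knuth's MMIX LCG; any odd multiplier is invertible modulo 2^64.
static const uint64_t A = 6364136223846793005ULL;
static const uint64_t C = 1442695040888963407ULL;

static uint64_t inv_mod_2_64( uint64_t a ) { // Newton's iteration, correct bits double per step
	uint64_t x = a;
	for ( int i = 0; i < 5; i += 1 ) x *= 2 - a * x;
	return x;                                // a * x == 1 (mod 2^64)
}

typedef struct { uint64_t F, B; } proc_rng;  // per-processor forward/backward states

static unsigned pick_enqueue( proc_rng * p, unsigned n ) {
	p->F = A * p->F + C;                     // run F forward
	return (unsigned)( p->F % n );
}
static void enqueue_done( proc_rng * p ) {   // called only on a successful enqueue
	p->B = p->F;
}
static unsigned pick_dequeue( proc_rng * p, unsigned n ) {
	unsigned q = (unsigned)( p->B % n );     // sub-queue a recent enqueue used
	p->B = inv_mod_2_64( A ) * ( p->B - C ); // then step B backwards
	return q;
}
\end{lstlisting}
Because $B$ is overwritten with $F$ only on success, running $B$ backwards retraces exactly the sub-queues this \proc last enqueued to.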
    200200The result is that each \proc tends to dequeue \ats that it has itself enqueued.
    201 When most subqueues are empty, this technique increases the odds of finding \ats at very low cost, while also offering an improvement on locality in many cases.
     201When most sub-queues are empty, this technique increases the odds of finding \ats at a very low cost, while also offering an improvement on locality in many cases.
    202202
    203203Tests showed this approach performs better than relaxed-FIFO in many cases.
    204204However, it is still not competitive with work-stealing algorithms.
    205205The fundamental problem is that the constant randomness limits how much locality the scheduler offers.
    206 This becomes problematic both because the scheduler is likely to get cache misses on internal data-structures and because migrations become frequent.
     206This becomes problematic both because the scheduler is likely to get cache misses on internal data structures and because migrations become frequent.
    207207Therefore, the attempt to modify the relaxed-FIFO algorithm to behave more like work stealing did not pan out.
    208208The alternative is to do it the other way around.
     
    210210\section{Work Stealing++}\label{helping}
     211211To add stronger fairness guarantees to work stealing, a few changes are needed.
    212 First, the relaxed-FIFO algorithm has fundamentally better fairness because each \proc always monitors all subqueues.
     212First, the relaxed-FIFO algorithm has fundamentally better fairness because each \proc always monitors all sub-queues.
    213213Therefore, the work-stealing algorithm must be prepended with some monitoring.
    214 Before attempting to dequeue from a \proc's subqueue, the \proc must make some effort to ensure other subqueues are not being neglected.
     214Before attempting to dequeue from a \proc's sub-queue, the \proc must make some effort to ensure other sub-queues are not being neglected.
    215215To make this possible, \procs must be able to determine which \at has been on the ready queue the longest.
     216216Second, as in the relaxed-FIFO approach, timestamps are needed for each \at to make this determination.
     
    219219        \centering
    220220        \input{base.pstex_t}
    221         \caption[Base \CFA design]{Base \CFA design \smallskip\newline A pool of subqueues offers the sharding, two per \glspl{proc}.
    222         Each \gls{proc} can access all of the subqueues.
     221        \caption[Base \CFA design]{Base \CFA design \smallskip\newline A pool of sub-queues offers the sharding, two per \proc.
     222        Each \gls{proc} can access all of the sub-queues.
    223223        Each \at is timestamped when enqueued.}
    224224        \label{fig:base}
     
    226226
    227227Figure~\ref{fig:base} shows the algorithm structure.
    228 This structure is similar to classic work-stealing except the subqueues are placed in an array so \procs can access them in constant time.
     228This structure is similar to classic work-stealing except the sub-queues are placed in an array so \procs can access them in constant time.
    229229Sharding width can be adjusted based on contention.
    230230Note, as an optimization, the TS of a \at is stored in the \at in front of it, so the first TS is in the array and the last \at has no TS.
    231231This organization keeps the highly accessed front TSs directly in the array.
    232 When a \proc attempts to dequeue a \at, it first picks a random remote subqueue and compares its timestamp to the timestamps of its local subqueue(s).
     232When a \proc attempts to dequeue a \at, it first picks a random remote sub-queue and compares its timestamp to the timestamps of its local sub-queue(s).
    233233The oldest waiting \at is dequeued to provide global fairness.
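The layout and the helping check might be sketched as follows; the types and field names are illustrative rather than the runtime's actual declarations:
\begin{lstlisting}[language=C]
#include <stddef.h>
#include <stdint.h>

struct thread_desc;
typedef struct {
	struct thread_desc * next;       // successor in the FIFO
	uint64_t ts;                     // timestamp of that successor
} ts_link;                           // each thread embeds one of these

typedef struct {
	ts_link head;                    // head.ts is the first thread's TS,
	struct thread_desc * tail;       //   kept in the array for cheap reads
} tq_t;

// help the remote sub-queue only when its head thread is older
static int should_help( tq_t * local, tq_t * remote ) {
	return remote->head.next != NULL
	    && ( local->head.next == NULL || remote->head.ts < local->head.ts );
}
\end{lstlisting}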
    234234
    235 However, this na\"ive implemented has performance problems.
     235However, this na\"ive implementation has performance problems.
    236236First, it is necessary to have some damping effect on helping.
     237237Random effects like cache misses and preemption can add spurious but short bursts of latency, negating the attempt to help.
    238 These bursts can cause increased migrations and make this work stealing approach slowdown to the level of relaxed-FIFO.
     238These bursts can cause increased migrations and make this work-stealing approach slow down to the level of relaxed-FIFO.
    239239
    240240\begin{figure}
    241241        \centering
    242242        \input{base_avg.pstex_t}
    243         \caption[\CFA design with Moving Average]{\CFA design with Moving Average \smallskip\newline A moving average is added to each subqueue.}
     243        \caption[\CFA design with Moving Average]{\CFA design with Moving Average \smallskip\newline A moving average is added to each sub-queue.}
    244244        \label{fig:base-ma}
    245245\end{figure}
    246246
    247 A simple solution to this problem is to use an exponential moving average\cite{wiki:ma} (MA) instead of a raw timestamps, shown in Figure~\ref{fig:base-ma}.
    248 Note, this is more complex because the \at at the head of a subqueue is still waiting, so its wait time has not ended.
    249 Therefore, the exponential moving average is actually an exponential moving average of how long each dequeued \at has waited.
    250 To compare subqueues, the timestamp at the head must be compared to the current time, yielding the best-case wait-time for the \at at the head of the queue.
     247A simple solution to this problem is to use an exponential moving average\cite{wiki:ma} (MA) instead of a raw timestamp, as shown in Figure~\ref{fig:base-ma}.
     248Note that this is more complex because the \at at the head of a sub-queue is still waiting, so its wait time has not ended.
     249Therefore, the exponential moving average is an average of how long each dequeued \at has waited.
     250To compare sub-queues, the timestamp at the head must be compared to the current time, yielding the best-case wait time for the \at at the head of the queue.
     251251This new wait time is averaged with the stored average.
    252 To further limit migration, a bias can be added to a local subqueue, where a remote subqueue is helped only if its moving average is more than $X$ times the local subqueue's average.
     252To further limit \glslink{atmig}{migrations}, a bias can be added to a local sub-queue, where a remote sub-queue is helped only if its moving average is more than $X$ times the local sub-queue's average.
    253253Tests for this approach indicate the choice of the weight for the moving average or the bias is not important, \ie weights and biases of similar \emph{magnitudes} have similar effects.
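A sketch of this heuristic; the weight @W@ and the bias are illustrative values rather than the tuned parameters:
\begin{lstlisting}[language=C]
#include <stdint.h>

typedef struct {
	uint64_t head_ts;                // timestamp of the thread at the head
	double ma;                       // moving average of completed waits
} q_stats;

static double queue_age( q_stats * q, uint64_t now ) {
	double head_wait = (double)( now - q->head_ts ); // best-case wait of the head
	return 0.5 * ( head_wait + q->ma );              // averaged with the stored MA
}
static int should_help_ma( q_stats * loc, q_stats * rem, uint64_t now, double bias ) {
	return queue_age( rem, now ) > bias * queue_age( loc, now );
}
static void on_dequeue( q_stats * q, uint64_t waited ) { // fold in a finished wait
	const double W = 0.125;                              // illustrative weight
	q->ma = W * (double)waited + ( 1.0 - W ) * q->ma;
}
\end{lstlisting}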
    254254
    255255With these additions to work stealing, scheduling can be made as fair as the relaxed-FIFO approach, avoiding the majority of unnecessary migrations.
    256 Unfortunately, the work to achieve fairness has a performance cost, especially when the workload is inherently fair, and hence, there is only short-term or no starvation.
    257 The problem is that the constant polling, \ie reads, of remote subqueues generally entail a cache miss because the TSs are constantly being updated, \ie, writes.
    258 To make things worst, remote subqueues that are very active, \ie \ats are frequently enqueued and dequeued from them, lead to higher chances that polling will incur a cache-miss.
    259 Conversely, the active subqueues do not benefit much from helping since starvation is already a non-issue.
    260 This puts this algorithm in the awkward situation of paying for a cost that is largely unnecessary.
    261 The good news is that this problem can be mitigated
    262 
    263 \subsection{Redundant Timestamps}\ref{relaxedtimes}
    264 The problem with polling remote subqueues is that correctness is critical.
    265 There must be a consensus among \procs on which subqueues hold which \ats, as the \ats are in constant motion.
    266 Furthermore, since timestamps are use for fairness, it is critical to have consensus on which \at is the oldest.
    267 However, when deciding if a remote subqueue is worth polling, correctness is less of a problem.
    268 Since the only requirement is that a subqueue is eventually polled, some data staleness is acceptable.
     256Unfortunately, the work to achieve fairness has a performance cost, especially when the workload is inherently fair, and hence, there is only short-term unfairness or no starvation.
     257The problem is that the constant polling, \ie reads, of remote sub-queues generally entails cache misses because the TSs are constantly being updated, \ie, writes.
      258To make things worse, remote sub-queues that are very active, \ie \ats are frequently enqueued and dequeued from them, lead to higher chances that polling incurs a cache miss.
     259Conversely, the active sub-queues do not benefit much from helping since starvation is already a non-issue.
     260This puts this algorithm in the awkward situation of paying for a largely unnecessary cost.
     261The good news is that this problem can be mitigated.
     262
     263\subsection{Redundant Timestamps}\label{relaxedtimes}
     264The problem with polling remote sub-queues is that correctness is critical.
     265There must be a consensus among \procs on which sub-queues hold which \ats, as the \ats are in constant motion.
     266Furthermore, since timestamps are used for fairness, it is critical to have a consensus on which \at is the oldest.
     267However, when deciding if a remote sub-queue is worth polling, correctness is less of a problem.
     268Since the only requirement is that a sub-queue is eventually polled, some data staleness is acceptable.
    269269This leads to a situation where stale timestamps are only problematic in some cases.
    270 Furthermore, stale timestamps can be desirable since lower freshness requirements mean less cache invalidations.
     270Furthermore, stale timestamps can be desirable since lower freshness requirements mean fewer cache invalidations.
    271271
    272272Figure~\ref{fig:base-ts2} shows a solution with a second array containing a copy of the timestamps and average.
    273 This copy is updated \emph{after} the subqueue's critical sections using relaxed atomics.
     273This copy is updated \emph{after} the sub-queue's critical sections using relaxed atomics.
    274274\Glspl{proc} now check if polling is needed by comparing the copy of the remote timestamp instead of the actual timestamp.
    275275The result is that since there is no fencing, the writes can be buffered in the hardware and cause fewer cache invalidations.
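Using C11 relaxed atomics, the publishing and polling sides might be sketched as follows; @ts_copy@ is an illustrative name for the shadow array:
\begin{lstlisting}[language=C]
#include <stdatomic.h>
#include <stdint.h>

#define NQUEUES 64                       // illustrative
_Atomic uint64_t ts_copy[NQUEUES];       // shadow copy of the head timestamps

// after a sub-queue's critical section: relaxed store, no fence, so the
// write may sit in the store buffer and cause fewer invalidations
void publish_ts( unsigned i, uint64_t head_ts ) {
	atomic_store_explicit( &ts_copy[i], head_ts, memory_order_relaxed );
}
// deciding whether to poll: a stale value is tolerable because the
// sub-queue is eventually polled anyway
uint64_t peek_ts( unsigned i ) {
	return atomic_load_explicit( &ts_copy[i], memory_order_relaxed );
}
\end{lstlisting}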
     
    279279        \input{base_ts2.pstex_t}
    280280        \caption[\CFA design with Redundant Timestamps]{\CFA design with Redundant Timestamps \smallskip\newline An array is added containing a copy of the timestamps.
    281         These timestamps are written to with relaxed atomics, so there is no order among concurrent memory accesses, leading to fewer cache invalidations.}
     281        These timestamps are written-to with relaxed atomics, so there is no order among concurrent memory accesses, leading to fewer cache invalidations.}
    282282        \label{fig:base-ts2}
    283283\end{figure}
     
    285285The correctness argument is somewhat subtle.
    286286The data used for deciding whether or not to poll a queue can be stale as long as it does not cause starvation.
    287 Therefore, it is acceptable if stale data makes queues appear older than they really are but appearing fresher can be a problem.
    288 For the timestamps, this means missing writes to the timestamp is acceptable since they make the head \at look older.
     287Therefore, it is acceptable if stale data makes queues appear older than they are but appearing fresher can be a problem.
     288For the timestamps, this means it is acceptable to miss writes to the timestamp since they make the head \at look older.
    289289For the moving average, as long as the operations are just atomic reads/writes, the average is guaranteed to yield a value that is between the oldest and newest values written.
    290 Therefore, this unprotected read of the timestamp and average satisfy the limited correctness that is required.
     290Therefore, this unprotected read of the timestamp and average satisfies the limited correctness that is required.
    291291
    292292With redundant timestamps, this scheduling algorithm achieves both the fairness and performance requirements on most machines.
    293293The problem is that the cost of polling and helping is not necessarily consistent across each \gls{hthrd}.
    294 For example, on machines with a CPU containing multiple hyperthreads and cores and multiple CPU sockets, cache misses can be satisfied from the caches on same (local) CPU, or by a CPU on a different (remote) socket.
      294For example, on machines with CPUs containing multiple hyperthreads and cores, and multiple CPU sockets, cache misses can be satisfied from the caches on the same (local) CPU, or by a CPU on a different (remote) socket.
    295295Cache misses satisfied by a remote CPU have significantly higher latency than from the local CPU.
    296296However, these delays are not specific to systems with multiple CPUs.
     
    313313In Figure~\ref{fig:cache-share}, all cache misses are either private to a CPU or shared with another CPU.
    314314This means latency due to cache misses is fairly consistent.
    315 In contrast, in Figure~\ref{fig:cache-noshare} misses in the L2 cache can be satisfied by either instance of L3 cache.
     315In contrast, in Figure~\ref{fig:cache-noshare} misses in the L2 cache can be satisfied by either instance of the L3 cache.
    316316However, the memory-access latency to the remote L3 is higher than the memory-access latency to the local L3.
    317317The impact of these different designs on this algorithm is that scheduling only scales well on architectures with a wide L3 cache, similar to Figure~\ref{fig:cache-share}, and less well on architectures with many narrower L3 cache instances, similar to Figure~\ref{fig:cache-noshare}.
    318 Hence, as the number of L3 instances grow, so too does the chance that the random helping causes significant cache latency.
    319 The solution is for the scheduler be aware of the cache topology.
     318Hence, as the number of L3 instances grows, so too does the chance that the random helping causes significant cache latency.
     319The solution is for the scheduler to be aware of the cache topology.
    320320
    321321\subsection{Per CPU Sharding}
     
    323323Unfortunately, there is no portable way to discover cache topology, and it is outside the scope of this thesis to solve this problem.
    324324This work uses the cache topology information from Linux's @/sys/devices/system/cpu@ directory.
    325 This leaves the challenge of matching \procs to cache structure, or more precisely identifying which subqueues of the ready queue are local to which subcomponents of the cache structure.
    326 Once a matching is generated, the helping algorithm is changed to add bias so that \procs more often help subqueues local to the same cache substructure.\footnote{
     325This leaves the challenge of matching \procs to cache structure, or more precisely identifying which sub-queues of the ready queue are local to which subcomponents of the cache structure.
     326Once a match is generated, the helping algorithm is changed to add bias so that \procs more often help sub-queues local to the same cache substructure.\footnote{
    327327Note that like other biases mentioned in this section, the actual bias value does not appear to need precise tuning.}
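As an illustration of the query involved, the L3 instance of a CPU can be read from sysfs as sketched below; this assumes a kernel new enough to expose the @id@ attribute (older kernels only provide @shared_cpu_list@), and that @index3@ is the unified L3:
\begin{lstlisting}[language=C]
#include <stdio.h>

int l3_instance_of( int cpu ) {          // returns -1 if the topology is hidden
	char path[128];
	int id = -1;
	snprintf( path, sizeof(path),
	          "/sys/devices/system/cpu/cpu%d/cache/index3/id", cpu );
	FILE * f = fopen( path, "r" );
	if ( f ) {
		if ( fscanf( f, "%d", &id ) != 1 ) id = -1;
		fclose( f );
	}
	return id;
}
\end{lstlisting}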
    328328
    329 The simplest approach for mapping subqueues to cache structure is to statically tie subqueues to CPUs.
    330 Instead of having each subqueue local to a specific \proc, the system is initialized with subqueues for each hardware hyperthread/core up front.
    331 Then \procs dequeue and enqueue by first asking which CPU id they are executing on, in order to identify which subqueues are the local ones.
     329The simplest approach for mapping sub-queues to cache structure is to statically tie sub-queues to CPUs.
     330Instead of having each sub-queue local to a specific \proc, the system is initialized with sub-queues for each hardware hyperthread/core up front.
     331Then \procs dequeue and enqueue by first asking which CPU id they are executing on, to identify which sub-queues are the local ones.
    332332\Glspl{proc} can get the CPU id from @sched_getcpu@ or @librseq@.
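A sketch of this static mapping, reusing the illustrative @subqueue_t@ from the earlier sketch:
\begin{lstlisting}[language=C]
#define _GNU_SOURCE
#include <sched.h>

extern subqueue_t queues_by_cpu[];       // one per hardware thread, created up front

subqueue_t * local_queue( void ) {
	int cpu = sched_getcpu();            // can be stale by the time it is used,
	return &queues_by_cpu[ cpu ];        //   which is tolerable: any queue is valid
}
\end{lstlisting}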
    333333
     334334This approach solves the performance problems on systems whose topologies have narrow L3 caches, similar to Figure \ref{fig:cache-noshare}.
    335335However, it can still cause some subtle fairness problems in systems with few \procs and many \glspl{hthrd}.
    336 In this case, the large number of subqueues and the bias against subqueues tied to different cache substructures make it unlikely that every subqueue is picked.
    337 To make things worst, the small number of \procs mean that few helping attempts are made.
    338 This combination of low selection and few helping attempts allow a \at to become stranded on a subqueue for a long time until it gets randomly helped.
    339 On a system with 2 \procs, 256 \glspl{hthrd} with narrow cache sharing, and a 100:1 bias, it can actually take multiple seconds for a \at to get dequeued from a remote queue.
    340 Therefore, a more dynamic matching of subqueues to cache instance is needed.
     336In this case, the large number of sub-queues and the bias against sub-queues tied to different cache substructures make it unlikely that every sub-queue is picked.
      337To make things worse, the small number of \procs means that few helping attempts are made.
     338This combination of low selection and few helping attempts allow a \at to become stranded on a sub-queue for a long time until it gets randomly helped.
     339On a system with 2 \procs, 256 \glspl{hthrd} with narrow cache sharing, and a 100:1 bias, it can take multiple seconds for a \at to get dequeued from a remote queue.
     340Therefore, a more dynamic match of sub-queues to cache instances is needed.
    341341
    342342\subsection{Topological Work Stealing}
    343343\label{s:TopologicalWorkStealing}
    344 Therefore, the approach used in the \CFA scheduler is to have per-\proc subqueues, but have an explicit data-structure track which cache substructure each subqueue is tied to.
     344The approach used in the \CFA scheduler is to have per-\proc sub-queues, but have an explicit data structure to track which cache substructure each sub-queue is tied to.
    345345This tracking requires some finesse because reading this data structure must lead to fewer cache misses than not having the data structure in the first place.
    346 A key element however is that, like the timestamps for helping, reading the cache instance mapping only needs to give the correct result \emph{often enough}.
     346A key element, however, is that, like the timestamps for helping, reading the cache instance mapping only needs to give the correct result \emph{often enough}.
     347347Therefore, the algorithm can be built as follows: before enqueueing or dequeuing a \at, each \proc queries the CPU id and the corresponding cache instance.
    348 Since subqueues are tied to \procs, each \proc can then update the cache instance mapped to the local subqueue(s).
    349 To avoid unnecessary cache line invalidation, the map is only written to if the mapping changes.
     348Since sub-queues are tied to \procs, each \proc can then update the cache instance mapped to the local sub-queue(s).
     349To avoid unnecessary cache line invalidation, the map is only written-to if the mapping changes.
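A sketch of this write-on-change map, again with illustrative names and building on @l3_instance_of@ and @sched_getcpu@ above:
\begin{lstlisting}[language=C]
#include <stdatomic.h>

extern _Atomic int cache_map[];          // sub-queue index -> cache instance

void refresh_map( unsigned my_queue, int my_instance ) {
	// write only when the value changes, so readers' copies of this
	// cache line are not needlessly invalidated
	if ( atomic_load_explicit( &cache_map[my_queue], memory_order_relaxed ) != my_instance )
		atomic_store_explicit( &cache_map[my_queue], my_instance, memory_order_relaxed );
}
\end{lstlisting}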
    350350
    351351This scheduler is used in the remainder of the thesis for managing CPU execution, but additional scheduling is needed to handle long-term blocking and unblocking, such as I/O.
  • doc/theses/thierry_delisle_PhD/thesis/text/eval_macro.tex

    rebf8ca5 r23a08aa0  
    22The previous chapter demonstrated the \CFA scheduler achieves its equivalent performance goal in small and controlled \at-scheduling scenarios.
    33The next step is to demonstrate performance stays true in more realistic and complete scenarios.
    4 Therefore, this chapter exercises both \at and I/O scheduling using two flavours of webservers that demonstrate \CFA performs competitively with production environments.
    5 
    6 Webservers are chosen because they offer fairly simple applications that perform complex I/O, both network and disk, and are useful as standalone products.
    7 Furthermore, webservers are generally amenable to parallelization since their workloads are mostly homogeneous.
    8 Therefore, webservers offer a stringent performance benchmark for \CFA.
    9 Indeed, existing webservers have close to optimal performance, while the homogeneity of the workload means fairness may not be a problem.
    10 As such, these experiments should highlight the overhead tue to any \CFA fairness cost in realistic scenarios.
     4Therefore, this chapter exercises both \at and I/O scheduling using two flavours of web servers that demonstrate \CFA performs competitively compared to web servers used in production environments.
     5
     6Web servers are chosen because they offer fairly simple applications that perform complex I/O, both network and disk, and are useful as standalone products.
     7Furthermore, web servers are generally amenable to parallelization since their workloads are mostly homogeneous.
     8Therefore, web servers offer a stringent performance benchmark for \CFA.
     9Indeed, existing web servers have close to optimal performance, while the homogeneity of the workload means fairness may not be a problem.
     10As such, these experiments should highlight the overhead due to any \CFA fairness cost in realistic scenarios.
    1111
    1212\section{Memcached}
    1313Memcached~\cite{memcached} is an in-memory key-value store used in many production environments, \eg \cite{atikoglu2012workload}.
    14 In fact, the Memcached server is so popular there exists a full-featured front-end for performance testing, called @mutilate@~\cite{GITHUB:mutilate}.
    15 Experimenting on Memcached allows for a simple test of the \CFA runtime as a whole, exercising the scheduler, the idle-sleep mechanism, as well the \io subsystem for sockets.
    16 Note, this experiment does not exercise the \io subsystem with regards to disk operations because Memcached is an in-memory server.
     14The Memcached server is so popular there exists a full-featured front-end for performance testing, called @mutilate@~\cite{GITHUB:mutilate}.
     15Experimenting on Memcached allows for a simple test of the \CFA runtime as a whole, exercising the scheduler, the idle-sleep mechanism, as well as the \io subsystem for sockets.
     16Note that this experiment does not exercise the \io subsystem with regard to disk operations because Memcached is an in-memory server.
    1717
    1818\subsection{Benchmark Environment}
     
    2424Each node has 2 Intel(R) Xeon(R) CPU E5-2620 v2 running at 2.10GHz.
    2525\item
    26 These CPUs have 6 cores per CPUs and 2 \glspl{hthrd} per core, for a total of 24 \glspl{hthrd}.
    27 \item
    28 The CPUs each have 384 KB, 3 MB and 30 MB of L1, L2 and L3 caches respectively.
    29 \item
    30 Each node is connected to the network through a Mellanox 10 Gigabit Ethernet port.
     26Each CPU has 6 cores and 2 \glspl{hthrd} per core, for a total of 24 \glspl{hthrd}.
     27\item
     28A CPU has 384 KB, 3 MB and 30 MB of L1, L2 and L3 caches, respectively.
     29\item
     30The compute nodes are connected to the network through a Mellanox 10 Gigabit Ethernet port.
    3131\item
    3232Network routing is performed by a Mellanox SX1012 10/40 Gigabit Ethernet switch.
     
    3535\subsection{Memcached threading}\label{memcd:thrd}
    3636Memcached can be built to use multiple threads in addition to its @libevent@ subsystem to handle requests.
    37 When enabled, the threading implementation operates as follows~\cite{https://docs.oracle.com/cd/E17952_01/mysql-5.6-en/ha-memcached-using-threads.html}:
     37When enabled, the threading implementation operates as follows~\cite[\S~16.2.2.8]{MemcachedThreading}:
    3838\begin{itemize}
    3939\item
     
    4848For UDP connections, all the threads listen to a single UDP socket for incoming requests.
    4949Threads that are not currently dealing with another request ignore the incoming packet.
    50 One of the remaining, nonbusy, threads reads the request and sends the response.
    51 This implementation can lead to increased CPU load as threads wake from sleep to potentially process the request.
    52 \end{itemize}
    53 Here, Memcached is based on an event-based webserver architecture~\cite{Pai99Flash}, using \gls{kthrd}ing to run multiple largely independent event engines, and if needed, spinning up additional kernel threads to handle blocking I/O.
    54 Alternative webserver architecture are:
     50One of the remaining, non-busy, threads reads the request and sends the response.
     51This implementation can lead to increased CPU \gls{load} as threads wake from sleep to potentially process the request.
     52\end{itemize}
     53Here, Memcached is based on an event-based web server architecture~\cite{Pai99Flash}, using \gls{kthrd}ing to run multiple largely independent event engines, and if needed, spinning up additional kernel threads to handle blocking I/O.
     54Alternative web server architectures are:
    5555\begin{itemize}
    5656\item
     
    7474 \item \emph{vanilla}: the official release of Memcached, version~1.6.9.
    7575 \item \emph{fibre}: a modification of vanilla using the thread-per-connection model on top of the libfibre runtime.
    76  \item \emph{cfa}: a modification of the fibre webserver that replaces the libfibre runtime with \CFA.
     76 \item \emph{cfa}: a modification of the fibre web server that replaces the libfibre runtime with \CFA.
    7777\end{itemize}
    7878
     
    8080This experiment is done by having the clients establish 15,360 total connections, which persist for the duration of the experiment.
     8181The clients then send read and write queries with only 3\% writes (updates), attempting to follow a desired query rate, and the server responds to the desired rate as best it can.
    82 Figure~\ref{fig:memcd:rate:qps} shows the 3 server versions at different client rates, ``Target \underline{Q}ueries \underline{P}er \underline{S}econd'', and the actual rate, ``Actual QPS'', for all three webservers.
    83 
    84 Like the experimental setup in Chapter~\ref{microbench}, each experiment is run 15 times, and for each client rate, the measured webserver rate is plotted.
     82Figure~\ref{fig:memcd:rate:qps} shows the 3 server versions at different client rates, ``Target \underline{Q}ueries \underline{P}er \underline{S}econd'', and the actual rate, ``Actual QPS'', for all three web servers.
     83
     84Like the experimental setup in Chapter~\ref{microbench}, each experiment is run 15 times, and for each client rate, the measured web server rate is plotted.
     8585The solid line represents the median while the dashed and dotted lines represent the maximum and minimum, respectively.
    86 For rates below 500K queries per seconds, all three webservers match the client rate.
    87 Beyond 500K, the webservers cannot match the client rate.
    88 During this interval, vanilla Memcached achieves the highest webserver throughput, with libfibre and \CFA slightly lower but very similar throughput.
    89 Overall the performance of all three webservers is very similar, especially considering that at 500K the servers have reached saturation, which is discussed more in the next section.
     86For rates below 500K queries per second, all three web servers match the client rate.
     87Beyond 500K, the web servers cannot match the client rate.
     88During this interval, vanilla Memcached achieves the highest web server throughput, with libfibre and \CFA slightly lower but very similar throughput.
     89Overall the performance of all three web servers is very similar, especially considering that at 500K the servers have reached saturation, which is discussed more in the next section.
    9090
    9191\begin{figure}
    9292        \centering
    9393        \resizebox{0.83\linewidth}{!}{\input{result.memcd.rate.qps.pstex_t}}
    94         \caption[Memcached Benchmark: Throughput]{Memcached Benchmark: Throughput\smallskip\newline Desired vs Actual query rate for 15,360 connections. Target QPS is the query rate that the clients are attempting to maintain and Actual QPS is the rate at which the server is able to respond.}
     94        \caption[Memcached Benchmark: Throughput]{Memcached Benchmark: Throughput\smallskip\newline Desired vs Actual query rate for 15,360 connections. Target QPS is the query rate that the clients are attempting to maintain and Actual QPS is the rate at which the server can respond.}
    9595        \label{fig:memcd:rate:qps}
    9696%\end{figure}
     
    9999        \centering
    100100        \resizebox{0.83\linewidth}{!}{\input{result.memcd.rate.99th.pstex_t}}
    101         \caption[Memcached Benchmark : 99th Percentile Lantency]{Memcached Benchmark : 99th Percentile Lantency\smallskip\newline 99th Percentile of the response latency as a function of \emph{desired} query rate for 15,360 connections. }
     101        \caption[Memcached Benchmark: 99th Percentile Latency]{Memcached Benchmark: 99th Percentile Latency\smallskip\newline 99th Percentile of the response latency as a function of \emph{desired} query rate for 15,360 connections. }
    102102        \label{fig:memcd:rate:tail}
    103103\end{figure}
    104104
    105105\subsection{Tail Latency}
    106 Another popular performance metric is \newterm{tail} latency, which indicates some notion of fairness among requests across the experiment, \ie do some requests wait longer than other requests for service.
     106Another popular performance metric is \newterm{tail} latency, which indicates some notion of fairness among requests across the experiment, \ie do some requests wait longer than other requests for service?
     107107Since many web applications rely on a combination of different queries made in parallel, the latency of the slowest response, \ie tail latency, can dictate the perceived performance.
    108108Figure~\ref{fig:memcd:rate:tail} shows the 99th percentile latency results for the same Memcached experiment.
    109109
    110110Again, each experiment is run 15 times with the median, maximum and minimum plotted with different lines.
    111 As expected, the latency starts low and increases as the server gets close to saturation, at which point, the latency increases dramatically because the webservers cannot keep up with the connection rate so client requests are disproportionally delayed.
    112 Because of this dramatic increase, the Y axis is presented using log scale.
    113 Note that the graph shows \emph{target} query rate, the actual response rate is given in Figure~\ref{fig:memcd:rate:qps} as this is the same underlying experiment.
    114 
    115 For all three servers, the saturation point is reached before 500K queries per second, which is when throughput starts to decline among the webservers.
    116 In this experiment, all three webservers are much more distinguishable than the throughput experiment.
    117 Vanilla Memcached achieves the lowest latency until 600K, after which all the webservers are struggling to respond to client requests.
     111As expected, the latency starts low and increases as the server gets close to saturation, at which point, the latency increases dramatically because the web servers cannot keep up with the connection rate so client requests are disproportionally delayed.
     112Because of this dramatic increase, the Y-axis is presented using a log scale.
     113Note that the graph shows the \emph{target} query rate, the actual response rate is given in Figure~\ref{fig:memcd:rate:qps} as this is the same underlying experiment.
     114
     115For all three servers, the saturation point is reached before 500K queries per second, which is when throughput starts to decline among the web servers.
     116In this experiment, all three web servers are much more distinguishable than in the throughput experiment.
     117Vanilla Memcached achieves the lowest latency until 600K, after which all the web servers are struggling to respond to client requests.
    118118\CFA begins to decline at 600K, indicating some bottleneck after saturation.
    119 Overall, all three webservers achieve micro-second latencies and the increases in latency mostly follow each other.
     119Overall, all three web servers achieve microsecond latencies and the increases in latency mostly follow each other.
    120120
    121121\subsection{Update rate}
    122 Since Memcached is effectively a simple database, the information that is cached can be written to concurrently by multiple queries.
     122Since Memcached is effectively a simple database, the cache information can be written to concurrently by multiple queries.
    123123And since writes can significantly affect performance, it is interesting to see how varying the update rate affects performance.
    124124Figure~\ref{fig:memcd:updt} shows the results for the same experiment as the throughput and latency experiment but increasing the update percentage to 5\%, 10\% and 50\%, respectively, versus the original 3\% update percentage.
    125125
    126126\begin{figure}
     127        \hspace{-15pt}
    127128        \subfloat[][\CFA: Throughput]{
    128129                \resizebox{0.5\linewidth}{!}{
     
    132133        }
    133134        \subfloat[][\CFA: Latency]{
    134                 \resizebox{0.5\linewidth}{!}{
     135                \resizebox{0.52\linewidth}{!}{
    135136                        \input{result.memcd.forall.lat.pstex_t}
    136137                }
     
    138139        }
    139140
     141        \hspace{-15pt}
    140142        \subfloat[][LibFibre: Throughput]{
    141143                \resizebox{0.5\linewidth}{!}{
     
    145147        }
    146148        \subfloat[][LibFibre: Latency]{
    147                 \resizebox{0.5\linewidth}{!}{
     149                \resizebox{0.52\linewidth}{!}{
    148150                        \input{result.memcd.fibre.lat.pstex_t}
    149151                }
     
    151153        }
    152154
     155        \hspace{-15pt}
    153156        \subfloat[][Vanilla: Throughput]{
    154157                \resizebox{0.5\linewidth}{!}{
     
    158161        }
    159162        \subfloat[][Vanilla: Latency]{
    160                 \resizebox{0.5\linewidth}{!}{
     163                \resizebox{0.52\linewidth}{!}{
    161164                        \input{result.memcd.vanilla.lat.pstex_t}
    162165                }
    163166                \label{fig:memcd:updt:vanilla:lat}
    164167        }
    165         \caption[Throughput and Latency results at different update rates (percentage of writes).]{Throughput and Latency results at different update rates (percentage of writes).\smallskip\newline Description}
     168        \caption[Throughput and Latency results at different update rates (percentage of writes).]{Throughput and Latency results at different update rates (percentage of writes).\smallskip\newline On the left, throughput as Desired vs Actual query rate.
     169        Target QPS is the query rate that the clients are attempting to maintain and Actual QPS is the rate at which the server can respond.
     170        On the right, tail latency, \ie 99th Percentile of the response latency as a function of \emph{desired} query rate.
      171        For throughput, higher is better; for tail latency, lower is better.
      172        Each series represents 15 independent runs; the dashed lines are the maximums of each series, the solid lines are the medians, and the dotted lines are the minimums.
      173        All runs have 15,360 client connections.}
    166174        \label{fig:memcd:updt}
    167175\end{figure}
     
    175183\section{Static Web-Server}
    176184The Memcached experiment does not exercise two key aspects of the \io subsystem: accept\-ing new connections and interacting with disks.
    177 On the other hand, a webserver servicing static web-pages does stress both accepting connections and disk \io by accepting tens of thousands of client requests per second where these requests return static data serviced from the file-system cache or disk.\footnote{
    178 Webservers servicing dynamic requests, which read from multiple locations and construct a response, are not as interesting since creating the response takes more time and does not exercise the runtime in a meaningfully different way.}
    179 The static webserver experiment compares NGINX~\cite{nginx} with a custom \CFA-based webserver developed for this experiment.
     185On the other hand, a web server servicing static web pages does stress both accepting connections and disk \io by accepting tens of thousands of client requests per second where these requests return static data serviced from the file-system cache or disk.\footnote{
      186Web servers servicing dynamic requests, which read from multiple locations and construct a response, are not as interesting since creating the response takes more time and does not exercise the runtime in a meaningfully different way.}
     187The static web server experiment compares NGINX~\cite{nginx} with a custom \CFA-based web server developed for this experiment.
    180188
    181189\subsection{NGINX threading}
    182 Like memcached, NGINX can be makde to use multiple \glspl{kthrd}.
    183 It has a very similar architecture to the memcached architecture decscribed in Section~\ref{memcd:thrd}, where multiple \glspl{kthrd} each run a mostly independent network logic.
    184 While it does not necessarily use a dedicated listening thread, each connection is arbitrarily assigned to one of the \newterm{worker} threads.
    185 Each worker threads handles multiple connections exclusively, effectively dividing the connections into distinct sets.
    186 Again, this is effectively the \emph{event-based server} approach.
    187 
    188 \cit{https://www.nginx.com/blog/inside-nginx-how-we-designed-for-performance-scale/}
    189 
    190 
    191 \subsection{\CFA webserver}
    192 The \CFA webserver is a straightforward thread-per-connection webserver, where a fixed number of \ats are created upfront.
     190NGINX is a high-performance, \emph{full-service}, event-driven web server.
     191It can handle both static and dynamic web content, as well as serve as a reverse proxy and a load balancer~\cite{reese2008nginx}.
     192This wealth of capabilities comes with a variety of potential configurations, dictating available features and performance.
     193The NGINX server runs a master process that performs operations such as reading configuration files, binding to ports, and controlling worker processes.
     194When running as a static web server, it uses an event-driven architecture to service incoming requests.
     195Incoming connections are assigned a \emph{stackless} HTTP state machine and worker processes can handle thousands of these state machines.
     196For the following experiment, NGINX is configured to use @epoll@ to listen for events on these state machines and have each worker process independently accept new connections.
     197Because of the realities of Linux, see Subsection~\ref{ononblock}, NGINX also maintains a pool of auxiliary threads to handle blocking \io.
     198The configuration can set the number of worker processes desired, as well as the size of the auxiliary pool.
     199However, for the following experiments, NGINX is configured to let the master process decide the appropriate number of threads.
     200
     201\subsection{\CFA web server}
     202The \CFA web server is a straightforward thread-per-connection web server, where a fixed number of \ats are created upfront.
    193203Each \at calls @accept@, through @io_uring@, on the listening port and handles the incoming connection once accepted.
    194204Most of the implementation is fairly straightforward;
     195205however, the inclusion of file \io revealed an @io_uring@ problem that required an unfortunate workaround.
    196206
    197 Normally, webservers use @sendfile@~\cite{MAN:sendfile} to send files over a socket because it performs a direct move in the kernel from the file-system cache to the NIC, eliminating reading/writing the file into the webserver.
    198 While @io_uring@ does not support @sendfile@, it does supports @splice@~\cite{MAN:splice}, which is strictly more powerful.
    199 However, because of how Linux implements file \io, see Subsection~\ref{ononblock}, @io_uring@ must delegate splice calls to worker threads inside the kernel.
     207Normally, web servers use @sendfile@~\cite{MAN:sendfile} to send files over a socket because it performs a direct move in the kernel from the file-system cache to the NIC, eliminating reading/writing the file into the web server.
     208While @io_uring@ does not support @sendfile@, it does support @splice@~\cite{MAN:splice}, which is strictly more powerful.
     209However, because of how Linux implements file \io, see Subsection~\ref{ononblock}, @io_uring@ must delegate splice calls to worker threads \emph{inside} the kernel.
    200210As of Linux 5.13, @io_uring@ had no mechanism to restrict the number of worker threads, and therefore, when tens of thousands of splice requests are made, it correspondingly creates tens of thousands of internal \glspl{kthrd}.
    201211Such a high number of \glspl{kthrd} slows Linux significantly.
    202 Rather than abandon the experiment, the \CFA webserver was switched to @sendfile@.
    203 
    204 With a blocking @sendfile@ the \CFA achieves acceptable performance until saturation is reached.
    205 At saturation, latency increases so some client connections timeout.
     212Rather than abandon the experiment, the \CFA web server was switched to @sendfile@.
     213
     214Starting with \emph{blocking} @sendfile@, \CFA achieves acceptable performance until saturation is reached.
     215At saturation, latency increases and client connections begin to timeout.
    206216As these clients close their connection, the server must close its corresponding side without delay so the OS can reclaim the resources used by these connections.
    207217Indeed, until the server connection is closed, the connection lingers in the CLOSE-WAIT TCP state~\cite{rfc:tcp} and the TCP buffers are preserved.
    208 However, this poses a problem using nonblocking @sendfile@ calls:
     218However, this poses a problem using blocking @sendfile@ calls:
    209219when @sendfile@ blocks, the \proc rather than the \at blocks, preventing other connections from closing their sockets.
    210220The call can block if there is insufficient memory, which can be caused by having too many connections in the CLOSE-WAIT state.\footnote{
    211221\lstinline{sendfile} can always block even in nonblocking mode if the file to be sent is not in the file-system cache, because Linux does not provide nonblocking disk I/O.}
    212 This effect results in a negative feedback where more timeouts lead to more @sendfile@ calls running out of resources.
    213 
    214 Normally, this is address by using @select@/@epoll@ to wait for sockets to have sufficient resources.
    215 However, since @io_uring@ respects nonblocking semantics, marking all sockets as non-blocking effectively circumvents the @io_uring@ subsystem entirely:
    216 all calls would simply immediately return @EAGAIN@ and all asynchronicity would be lost.
    217 
    218 For this reason, the \CFA webserver sets and resets the @O_NONBLOCK@ flag before and after any calls to @sendfile@.
     222This effect results in a negative feedback loop where more timeouts lead to more @sendfile@ calls running out of resources.
     223
     224Normally, this problem is addressed by using @select@/@epoll@ to wait for sockets to have sufficient resources.
     225However, since @io_uring@ does not support @sendfile@ but does respect non\-blocking semantics, marking all sockets as non-blocking effectively circumvents the @io_uring@ subsystem entirely:
     226all calls simply immediately return @EAGAIN@ and all asynchronicity is lost.
     227
     228Switching the entire \CFA runtime to @epoll@ for this experiment is unrealistic and does not help in the evaluation of the \CFA runtime.
     229For this reason, the \CFA web server sets and resets the @O_NONBLOCK@ flag before and after any calls to @sendfile@.
    219230However, when the nonblocking @sendfile@ returns @EAGAIN@, the \CFA server cannot block the \at because its I/O subsystem uses @io_uring@.
    220 Therefore, the \at must spin performing the @sendfile@ and yield if the call returns @EAGAIN@.
    221 Normally @epoll@ would also be used when these calls to @sendfile@ return @EAGAIN@, but since this would not help in the evaluation of the \CFA runtime, the \CFA webserver simply yields and retries in these cases.
    222 
    223 Interestingly, Linux 5.15 @io_uring@ introduces the ability to limit the number of worker threads that are created, through the @IORING_REGISTER_IOWQ_MAX_WORKERS@ option.
    224 Presumably, this limit could prevent the explosion of \glspl{kthrd} which justified using @sendfile@ over @io_uring@ and @splice@.
      231Therefore, the \at spins performing the @sendfile@, yielding if the call returns @EAGAIN@ and then retrying.
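The retry loop is sketched below, assuming an illustrative helper name and eliding error handling; it marks the socket nonblocking, retries the @sendfile@, and yields on @EAGAIN@ so the underlying \proc is never blocked.
\begin{cfa}
ssize_t sendfile_yield( int sock_fd, int file_fd, off_t * off, size_t len ) {
	int flags = fcntl( sock_fd, F_GETFL );
	fcntl( sock_fd, F_SETFL, flags | O_NONBLOCK );	// set nonblocking
	ssize_t ret;
	for () {
		ret = sendfile( sock_fd, file_fd, off, len );
		if ( ret != -1 || errno != EAGAIN ) break;	// success or a real error
		yield();	// let other user threads run, then retry
	}
	fcntl( sock_fd, F_SETFL, flags );	// reset the original flags
	return ret;
}
\end{cfa}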
     232
     233Interestingly, Linux 5.15 @io_uring@ introduces the ability to limit the number of worker threads that are created through the @IORING_REGISTER_IOWQ_MAX_WORKERS@ option.
     234Presumably, this limit would prevent the explosion of \glspl{kthrd}, which justified using @sendfile@ over @io_uring@ and @splice@.
    225235However, recall from Section~\ref{iouring} that @io_uring@ maintains two pools of workers: bounded workers and unbounded workers.
    226 In the particular case of the webserver, we would want the unbounded workers to handle accepts and reads on socket and bounded workers to handle reading the files from disk.
    227 This would allow fine grained countrol over the number of workers needed for each operation type and would presumably lead to good performance.
     236For a web server, the unbounded workers should handle accepts and reads on sockets, and the bounded workers should handle reading files from disk.
     237This setup allows fine-grained control over the number of workers needed for each operation type and presumably leads to good performance.
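With liburing~2.1 or later, these limits can be set per ring; the following sketch assumes the liburing helper and the worker counts are purely illustrative.
\begin{cfa}
// values[0] caps the bounded (disk) workers, values[1] the unbounded (socket) workers
unsigned int values[2] = { 8, 64 };	// illustrative limits
io_uring_register_iowq_max_workers( &ring, values );
\end{cfa}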
     238
    228239However, @io_uring@ must contend with another reality of Linux: the versatility of @splice@.
    229 Indeed, @splice@ can be used both for reading and writing, to or from any type of file descriptor.
    230 This makes it more ambiguous which pool @io_uring@ should delegate @splice@ calls to.
    231 In the case of splicing from a socket to pipe, @splice@ will behave like an unbounded operation, but when splicing from a regular file to a pipe, @splice@ becomes a bounded operation.
    232 To make things more complicated, @splice@ can read from a pipe and write out to a regular file.
     240Indeed, @splice@ can be used both for reading and writing to or from any type of file descriptor.
     241This generality makes it ambiguous which pool @io_uring@ should delegate @splice@ calls to.
     242In the case of splicing from a socket to a pipe, @splice@ behaves like an unbounded operation, but when splicing from a regular file to a pipe, @splice@ becomes a bounded operation.
     243To make things more complicated, @splice@ can read from a pipe and write to a regular file.
    233244In this case, the read is an unbounded operation but the write is a bounded one.
    234245This leaves @io_uring@ in a difficult situation where it can be very difficult to delegate splice operations to the appropriate type of worker.
    235 Since there is little to no context available to @io_uring@, I believe it makes the decision to always delegate @splice@ operations to the unbounded workers.
    236 This is unfortunate for this specific experiment, since it prevents the webserver from limiting the number of calls to @splice@ happening in parallel without affecting the performance of @read@ or @accept@.
     246Since there is little or no context available to @io_uring@, it seems to always delegate @splice@ operations to the unbounded workers.
     247This decision is unfortunate for this specific experiment since it prevents the web server from limiting the number of parallel calls to @splice@ without affecting the performance of @read@ or @accept@.
    237248For this reason, the @sendfile@ approach described above is still the most performant solution in Linux 5.15.
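To make the bounded/unbounded ambiguity concrete, the following sketch shows the pair of @splice@ calls in a zero-copy file transfer, with illustrative descriptor names: the first call reads from disk and is therefore bounded, while the second waits on the socket and is therefore unbounded.
\begin{cfa}
int pfd[2];
pipe( pfd );
ssize_t in  = splice( file_fd, 0p, pfd[1], 0p, len, SPLICE_F_MOVE );	// file to pipe: bounded
ssize_t out = splice( pfd[0], 0p, sock_fd, 0p, in, SPLICE_F_MOVE );	// pipe to socket: unbounded
\end{cfa}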
    238249
    239 Note that it could be possible to workaround this problem, for example by creating more @io_uring@ instances so @splice@ operations can be issued to a different instance than the @read@ and @accept@ operations.
    240 However, I do not believe this solution is appropriate in general, it simply replaces a hack in the webserver with a different, equivalent hack.
     250One possible workaround is to create more @io_uring@ instances so @splice@ operations can be issued to a different instance than the @read@ and @accept@ operations.
     251However, I do not believe this solution is appropriate in general;
     252it simply replaces my current web server hack with a different, equivalent hack.
    241253
    242254\subsection{Benchmark Environment}
    243 Unlike the Memcached experiment, the webserver experiment is run on a heterogeneous environment.
     255Unlike the Memcached experiment, the web server experiment is run on a heterogeneous environment.
    244256\begin{itemize}
    245257\item
    246258The server runs Ubuntu 20.04.4 LTS on top of Linux Kernel 5.13.0-52.
    247259\item
    248 It has an AMD Opteron(tm) Processor 6380 running at 2.5GHz.
      260The server computer has four AMD Opteron\texttrademark{} 6380 processors, each with 16 cores running at 2.5GHz, for a total of 64 \glspl{hthrd}.
     261\item
     262The computer is booted with only 8 CPUs enabled, which is sufficient to achieve line rate.
    249263\item
     250264Each CPU has 64 KiB, 256 KiB and 8 MiB of L1, L2 and L3 caches, respectively.
    251265\item
    252 The computer is booted with only 8 CPUs enabled, which is sufficient to achieve line rate.
    253 \item
    254266The computer is booted with only 25GB of memory to restrict the file-system cache.
    255267\end{itemize}
     
    257269\begin{itemize}
    258270\item
    259 A client runs a 2.6.11-1 SMP Linux kernel, which permits each client load-generator to run on a separate CPU.
     271A client runs a 2.6.11-1 SMP Linux kernel, which permits each client load generator to run on a separate CPU.
    260272\item
     261273It has two 2.8 GHz Xeon CPUs and four one-gigabit Ethernet cards.
    262274\item
    263 \todo{switch}
     275Network routing is performed by an HP 2530 10 Gigabit Ethernet switch.
    264276\item
    265277A client machine runs two copies of the workload generator.
    266278\end{itemize}
    267279The clients and network are sufficiently provisioned to drive the server to saturation and beyond.
    268 Hence, any server effects are attributable solely to the runtime system and webserver.
    269 Finally, without restricting the server hardware resources, it is impossible to determine if a runtime system or the webserver using it has any specific design restrictions, \eg using space to reduce time.
    270 Trying to determine these restriction with large numbers of processors or memory simply means running equally large experiments, which takes longer and are harder to set up.
     280Hence, any server effects are attributable solely to the runtime system and web server.
     281Finally, without restricting the server hardware resources, it is impossible to determine if a runtime system or the web server using it has any specific design restrictions, \eg using space to reduce time.
     282Trying to determine these restrictions with large numbers of processors or memory simply means running equally large experiments, which take longer and are harder to set up.
    271283
    272284\subsection{Throughput}
    273 To measure webserver throughput, the server computer is loaded with 21,600 files, sharded across 650 directories, occupying about 2.2GB of disk, distributed over the server's RAID-5 4-drives to achieve high throughput for disk I/O.
      285To measure web server throughput, the server computer is loaded with 21,600 files, sharded across 650 directories, occupying about 2.2GB of disk, distributed over the server's 4-drive RAID-5 array to achieve high throughput for disk I/O.
    274286The clients run httperf~\cite{httperf} to request a set of static files.
    275 The httperf load-generator is used with session files to simulate a large number of users and to implement a partially open-loop system.
     287The httperf load generator is used with session files to simulate a large number of users and to implement a partially open-loop system.
    276288This permits httperf to produce overload conditions, generate multiple requests from persistent HTTP/1.1 connections, and include both active and inactive off periods to model browser processing times and user think times~\cite{Barford98}.
    277289
    278290The experiments are run with 16 clients, each running a copy of httperf (one copy per CPU), requiring a set of 16 log files with requests conforming to a Zipf distribution.
    279 This distribution is representative of users accessing static data through a web-browser.
    280 Each request reads a file name from its trace, establishes a connection, performs an HTTP get-request for the file name, receive the file data, close the connection, and repeat the process.
     291This distribution is representative of users accessing static data through a web browser.
     292Each request reads a file name from its trace, establishes a connection, performs an HTTP GET request for the file name, receives the file data, closes the connection, and repeats the process.
    281293Some trace elements have multiple file names that are read across a persistent connection.
    282 A client times-out if the server does not complete a request within 10 seconds.
     294A client times out if the server does not complete a request within 10 seconds.
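For reference, a Zipf distribution makes the probability of requesting the $i$-th most popular of $N$ files proportional to $1/i^{s}$ for some skew parameter $s$ (commonly near 1), \ie
\[
	P(i) = \frac{1/i^{s}}{\sum_{j=1}^{N} 1/j^{s}},
\]
so a small set of popular files receives most of the requests, while the remaining files form a long, rarely accessed tail.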
    283295
    284296An experiment consists of running a server with request rates ranging from 10,000 to 70,000 requests per second;
    285297each rate takes about 5 minutes to complete.
    286 There is 20 seconds idle time between rates and between experiments to allow connections in the TIME-WAIT state to clear.
     298There are 20 seconds of idle time between rates and between experiments to allow connections in the TIME-WAIT state to clear.
    287299Server throughput is measured both at peak and after saturation (\ie after peak).
     288300Peak indicates the level of client requests the server can handle, and after-peak indicates whether the server degrades gracefully.
    289 Throughput is measured by aggregating the results from httperf of all the clients.
     301Throughput is measured by aggregating the results from httperf for all the clients.
    290302
    291303This experiment can be done for two workload scenarios by reconfiguring the server with different amounts of memory: 25 GB and 2.5 GB.
     
    305317\end{table}
    306318
    307 Figure~\ref{fig:swbsrv} shows the results comparing \CFA to NGINX in terms of throughput.
    308 These results are fairly straightforward.
    309 Both servers achieve the same throughput until around 57,500 requests per seconds.
    310 Since the clients are asking for the same files, the fact that the throughput matches exactly is expected as long as both servers are able to serve the desired rate.
    311 Once the saturation point is reached, both servers are still very close.
    312 NGINX achieves slightly better throughput.
    313 However, Figure~\ref{fig:swbsrv:err} shows the rate of errors, a gross approximation of tail latency, where \CFA achieves notably fewer errors once the machine reaches saturation.
    314 This suggest that \CFA is slightly more fair and NGINX may slightly sacrifice some fairness for improved throughput.
    315 It demonstrate that the \CFA webserver described above is able to match the performance of NGINX up-to and beyond the saturation point of the machine.
    316 
    317319\begin{figure}
     320        \centering
    318321        \subfloat[][Throughput]{
    319322                \resizebox{0.85\linewidth}{!}{\input{result.swbsrv.25gb.pstex_t}}
     
    325328                \label{fig:swbsrv:err}
    326329        }
    327         \caption[Static Webserver Benchmark : Throughput]{Static Webserver Benchmark : Throughput\smallskip\newline Throughput vs request rate for short lived connections connections.}
      330        \caption[Static Web Server Benchmark: Throughput]{Static Web Server Benchmark: Throughput\smallskip\newline Throughput vs request rate for short-lived connections.}
    328331        \label{fig:swbsrv}
    329332\end{figure}
    330333
     334Figure~\ref{fig:swbsrv} shows the results comparing \CFA to NGINX in terms of throughput.
     335These results are fairly straightforward.
     336Both servers achieve the same throughput until around 57,500 requests per second.
     337Since the clients are asking for the same files, the fact that the throughput matches exactly is expected as long as both servers are able to serve the request rate.
     338Once the saturation point is reached, both servers are still very close.
     339NGINX achieves slightly better throughput.
     340However, Figure~\ref{fig:swbsrv:err} shows the rate of errors, a gross approximation of tail latency, where \CFA achieves notably fewer errors once the servers reach saturation.
     341This suggests \CFA is slightly fairer with less throughput, while NGINX sacrifices fairness for more throughput.
     342This experiment demonstrates that the \CFA web server is able to match the performance of NGINX up to and beyond the saturation point of the machine.
     343
    331344\subsection{Disk Operations}
    332 The throughput was made using a server with 25gb of memory, this was sufficient to hold the entire fileset in addition to all the code and data needed to run the webserver and the rest of the machine.
    333 Previous work like \cit{Cite Ashif's stuff} demonstrate that an interesting follow-up experiment is to rerun the same throughput experiment but allowing significantly less memory on the machine.
    334 If the machine is constrained enough, it will force the OS to evict files from the file cache and cause calls to @sendfile@ to have to read from disk.
    335 However, in this configuration, the problem with @splice@ and @io_uring@ rears its ugly head again.
     345With 25GB of memory, the entire experimental file-set plus the web server and OS fit in memory.
     346If memory is constrained, the OS must evict files from the file cache, which causes @sendfile@ to read from disk.\footnote{
     347For the in-memory experiments, the file-system cache was warmed by running an experiment three times before measuring started to ensure all files are in the file-system cache.}
      348Web servers can behave very differently once file I/O begins and increases.
     349Hence, prior work~\cite{Harji10} suggests running both kinds of experiments to test overall web server performance.
     350
     351However, after reducing memory to 2.5GB, the problem with @splice@ and @io_uring@ rears its ugly head again.
    336352Indeed, in the in-memory configuration, replacing @splice@ with calls to @sendfile@ works because the bounded side basically never blocks.
    337353Like @splice@, @sendfile@ is in a situation where the read side requires bounded blocking, \eg reading from a regular file, while the write side requires unbounded blocking, \eg blocking until the socket is available for writing.
    338 The unbounded side can be handled by yielding when it returns @EAGAIN@ like mentioned above, but this trick does not work for the bounded side.
     354The unbounded side can be handled by yielding when it returns @EAGAIN@, as mentioned above, but this trick does not work for the bounded side.
    339355The only solution for the bounded side is to spawn more threads and let these handle the blocking.
    340356
    341 Supporting this case in the webserver would require creating more \procs or creating a dedicated thread-pool.
    342 However, since what I am to evaluate in this thesis is the runtime of \CFA, I decided to forgo experiments on low memory server.
    343 The implementation of the webserver itself is simply too impactful to be an interesting evaluation of the underlying runtime.
     357Supporting this case in the web server would require creating more \procs or creating a dedicated thread pool.
     358However, I felt this kind of modification moves too far away from my goal of evaluating the \CFA runtime, \ie it begins writing another runtime system;
     359hence, I decided to forgo experiments on low-memory performance.
  • doc/theses/thierry_delisle_PhD/thesis/text/eval_micro.tex

    rebf8ca5 r23a08aa0  
    44This chapter presents five different experimental setups for evaluating the basic features of the \CFA, libfibre~\cite{libfibre}, Go, and Tokio~\cite{Tokio} schedulers.
    55All of these systems have a \gls{uthrding} model.
    6 The goal in this chapter is show the \CFA scheduler obtains equivalent performance to other less fair schedulers through the different experiments.
    7 Note, only the code of the \CFA tests is shown;
    8 all tests in the other systems are functionally identical and available online~\cite{SchedulingBenchmarks}.
     6The goal of this chapter is to show that the \CFA scheduler obtains equivalent performance to other, less fair, schedulers through the different experiments.
     7Note that only the code of the \CFA tests is shown;
     8all tests in the other systems are functionally identical and available online~\cite{GITHUB:SchedulingBenchmarks}.
    99
    1010\section{Benchmark Environment}\label{microenv}
     
    1313\begin{description}
    1414\item[AMD] is a server with two AMD EPYC 7662 CPUs and 256GB of DDR4 RAM.
    15 The EPYC CPU has 64 cores with 2 \glspl{hthrd} per core, for 128 \glspl{hthrd} per socket with 2 sockets for a total of 256 \glspl{hthrd}.
      15The EPYC CPU has 64 cores with 2 \glspl{hthrd} per core, giving 128 \glspl{hthrd} per socket and, with 2 sockets, a total of 256 \glspl{hthrd}.
    1616Each CPU has 4 MB, 64 MB and 512 MB of L1, L2 and L3 caches, respectively.
    17 Each L1 and L2 instance are only shared by \glspl{hthrd} on a given core, but each L3 instance is shared by 4 cores, therefore 8 \glspl{hthrd}.
     17Each L1 and L2 instance is only shared by \glspl{hthrd} on a given core, but each L3 instance is shared by 4 cores, therefore 8 \glspl{hthrd}.
    1818The server runs Ubuntu 20.04.2 LTS on top of Linux Kernel 5.8.0-55.
    1919
     
    2525\end{description}
    2626
    27 For all benchmarks, @taskset@ is used to limit the experiment to 1 NUMA node with no hyper threading.
     27For all benchmarks, @taskset@ is used to limit the experiment to 1 NUMA node with no hyperthreading.
    2828If more \glspl{hthrd} are needed, then 1 NUMA node with hyperthreading is used.
    2929If still more \glspl{hthrd} are needed, then the experiment is limited to as few NUMA nodes as needed.
     
    3232On AMD, the same algorithm is used, but the machine only has 2 sockets.
    3333So hyperthreading\footnote{
    34 Hyperthreading normally refers specifically to the technique used by Intel, however it is often used generically to refer to any equivalent feature.}
    35 is used when the \proc count reach 65 and 193.
    36 
    37 The limited sharing of the last-level cache on the AMD machine is markedly different than the Intel machine.
      34Hyperthreading normally refers specifically to the technique used by Intel; however, it is often used generically to refer to any equivalent feature.}
     35is used when the \proc count reaches 65 and 193.
     36
     37The limited sharing of the last-level cache on the AMD machine is markedly different from the Intel machine.
    3838Indeed, while on both architectures L2 cache misses that are served by L3 caches on a different CPU incur a significant latency, on the AMD it is also the case that cache misses served by a different L3 instance on the same CPU also incur high latency.
    3939
     
    4242Each experiment is run 15 times varying the number of processors depending on the two different computers.
    4343All experiments gather throughput data and secondary data for scalability or latency.
    44 The data is graphed using a solid and two dashed lines representing the median, maximum and minimum result respectively, where the minimum/maximum lines are referred to as the \emph{extremes}.\footnote{
     44The data is graphed using a solid, a dashed, and a dotted line, representing the median, maximum and minimum results respectively, where the minimum/maximum lines are referred to as the \emph{extremes}.\footnote{
    4545An alternative display is to use error bars with min/max as the bottom/top for the bar.
    4646However, this approach is not truly an error bar around a mean value and I felt the connected lines are easier to read.}
     
    4848
    4949For each experiment, four graphs are generated showing traditional throughput on the top row and \newterm{scalability} or \newterm{latency} on the bottom row (peek ahead to Figure~\ref{fig:cycle:jax}).
    50 Scalability uses the same data as throughput but the Y axis is calculated as the number of \procs over the throughput.
      50Scalability uses the same data as throughput, but the Y-axis is the number of \procs divided by the throughput.
    5151In this representation, perfect scalability should appear as a horizontal line, \eg, if doubling the number of \procs doubles the throughput, then the relation stays the same.
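Concretely, for $P$ \procs, the quantity plotted on the scalability graphs is
\[
	\frac{P}{\mathit{throughput}(P)},
\]
\ie the average time each \proc takes to complete one operation, so lower is better and a horizontal line indicates linear scaling.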
    5252
    53 The left column shows results for 100 cycles per \proc, enough cycles to always keep every \proc busy.
    54 The right column shows results for 1 cycle per \proc, where the ready queues are expected to be near empty most of the time.
    55 The distinction between 100 and 1 cycles is meaningful because the idle sleep subsystem is expected to matter only in the right column, where spurious effects can cause a \proc to run out of work temporarily.
     53The left column shows results for hundreds of \ats per \proc, enough to always keep every \proc busy.
     54The right column shows results for very few \ats per \proc, where the ready queues are expected to be near empty most of the time.
     55The distinction between many and few \ats is meaningful because the idle sleep subsystem is expected to matter only in the right column, where spurious effects can cause a \proc to run out of work temporarily.
    5656
    5757\section{Cycle}
     
    6262Hence, systems that perform this optimization have an artificial performance benefit because the yield becomes a \emph{nop}.
    6363For this reason, I designed a different push/pop benchmark, called \newterm{Cycle Benchmark}.
    64 This benchmark arranges a number of \ats into a ring, as seen in Figure~\ref{fig:cycle}, where the ring is a circular singly-linked list.
    65 At runtime, each \at unparks the next \at before parking itself.
    66 Unparking the next \at pushes that \at onto the ready queue while the ensuing park leads to a \at being popped from the ready queue.
     64This benchmark arranges several \ats into a ring, as seen in Figure~\ref{fig:cycle}, where the ring is a circular singly-linked list.
     65At runtime, each \at unparks the next \at before \glslink{atblock}{parking} itself.
     66Unparking the next \at pushes that \at onto the ready queue while the ensuing \park leads to a \at being popped from the ready queue.
    6767
    6868\begin{figure}
    6969        \centering
    7070        \input{cycle.pstex_t}
    71         \caption[Cycle benchmark]{Cycle benchmark\smallskip\newline Each \at unparks the next \at in the cycle before parking itself.}
     71        \caption[Cycle benchmark]{Cycle benchmark\smallskip\newline Each \at unparks the next \at in the cycle before \glslink{atblock}{parking} itself.}
    7272        \label{fig:cycle}
    7373\end{figure}
    7474
    7575Therefore, the underlying runtime cannot rely on the number of ready \ats staying constant over the duration of the experiment.
    76 In fact, the total number of \ats waiting on the ready queue is expected to vary because of the race between the next \at unparking and the current \at parking.
     76In fact, the total number of \ats waiting on the ready queue is expected to vary because of the race between the next \at \glslink{atsched}{unparking} and the current \at \glslink{atblock}{parking}.
    7777That is, the runtime cannot anticipate that the current task immediately parks.
    7878As well, the size of the cycle is also decided based on this race, \eg a small cycle may see the chain of unparks go full circle before the first \at parks because of time-slicing or multiple \procs.
    7979If this happens, the scheduler push and pop are avoided and the results of the experiment are skewed.
    80 (Note, an unpark is like a V on a semaphore, so the subsequent park (P) may not block.)
    81 Every runtime system must handle this race and cannot optimized away the ready-queue pushes and pops.
    82 To prevent any attempt of silently omitting ready-queue operations, the ring of \ats is made big enough so the \ats have time to fully park before being unparked again.
     80(Note, an \unpark is like a V on a semaphore, so the subsequent \park (P) may not block.)
     81Every runtime system must handle this race and cannot optimize away the ready-queue pushes and pops.
      82To prevent any attempt at silently omitting ready-queue operations, the ring of \ats is made big enough so the \ats have time to fully \park before being unparked again.
    8383Finally, to further mitigate any underlying push/pop optimizations, especially on SMP machines, multiple rings are created in the experiment.
    8484
    8585Figure~\ref{fig:cycle:code} shows the pseudo code for this benchmark, where each cycle has 5 \ats.
    86 There is additional complexity to handle termination (not shown), which requires a binary semaphore or a channel instead of raw @park@/@unpark@ and carefully picking the order of the @P@ and @V@ with respect to the loop condition.
     86There is additional complexity to handle termination (not shown), which requires a binary semaphore or a channel instead of raw \park/\unpark and carefully picking the order of the @P@ and @V@ with respect to the loop condition.
    8787
    8888\begin{figure}
     
    9999}
    100100\end{cfa}
    101 \caption[Cycle Benchmark : Pseudo Code]{Cycle Benchmark : Pseudo Code}
     101\caption[Cycle Benchmark: Pseudo Code]{Cycle Benchmark: Pseudo Code}
    102102\label{fig:cycle:code}
    103 %\end{figure}
    104 
    105103\bigskip
    106 
    107 %\begin{figure}
    108104        \subfloat[][Throughput, 100 cycles per \proc]{
    109105                \resizebox{0.5\linewidth}{!}{
     
    131127                \label{fig:cycle:jax:low:ns}
    132128        }
    133         \caption[Cycle Benchmark on Intel]{Cycle Benchmark on Intel\smallskip\newline Throughput and scalability as a function of \proc count, 5 \ats per cycle, and different cycle count. For throughput, higher is better, for scalability, lower is better. Each series represent 15 independent runs, the dotted lines are maximums while the solid line is the medium.}
     129        \caption[Cycle Benchmark on Intel]{Cycle Benchmark on Intel\smallskip\newline Throughput and scalability as a function of \proc count, 5 \ats per cycle, and different cycle counts.
      130        For throughput, higher is better; for scalability, lower is better.
      131        Each series represents 15 independent runs; the dashed lines are the maximums of each series, the solid lines the medians, and the dotted lines the minimums.}
    134132        \label{fig:cycle:jax}
    135133\end{figure}
     
    161159                \label{fig:cycle:nasus:low:ns}
    162160        }
    163         \caption[Cycle Benchmark on AMD]{Cycle Benchmark on AMD\smallskip\newline Throughput and scalability as a function of \proc count, 5 \ats per cycle, and different cycle count. For throughput, higher is better, for scalability, lower is better. Each series represent 15 independent runs, the dotted lines are extremes while the solid line is the medium.}
     161        \caption[Cycle Benchmark on AMD]{Cycle Benchmark on AMD\smallskip\newline Throughput and scalability as a function of \proc count, 5 \ats per cycle, and different cycle counts.
      162        For throughput, higher is better; for scalability, lower is better.
      163        Each series represents 15 independent runs; the dashed lines are the maximums of each series, the solid lines the medians, and the dotted lines the minimums.}
    164164        \label{fig:cycle:nasus}
    165165\end{figure}
     
    167167\subsection{Results}
    168168
    169 For the Intel architecture, Figure~\ref{fig:cycle:jax}:
    170 \begin{itemize}
    171 \item
    172 For 100 cycles per \proc (first column), \CFA, Go and Tokio all obtain effectively the same throughput performance.
     169Figures~\ref{fig:cycle:jax} and \ref{fig:cycle:nasus} show the results for the cycle experiment on Intel and AMD, respectively.
     170Looking at the left column on Intel, Figures~\ref{fig:cycle:jax:ops} and \ref{fig:cycle:jax:ns} show the results for 100 cycles of 5 \ats for each \proc.
     171\CFA, Go and Tokio all obtain effectively the same throughput performance.
    173172Libfibre is slightly behind in this case but still scales decently.
    174 As a result of the \gls{kthrd} placement, additional \procs from 25 to 48 offer less performance improvement (flatting of the line) for all runtimes.
    175 As expected, this pattern repeats again between \proc count 72 and 96.
    176 \item
    177 For 1 cycle per \proc, \CFA and Tokio obtain very similar results overall, but Tokio shows more variations in the results.
    178 Go achieves slightly better performance.
     173As a result of the \gls{kthrd} placement, additional \procs from 25 to 48 offer less performance improvement for all runtimes, which can be seen as a flattening of the line.
     174This effect even causes a decrease in throughput in libfibre's case.
     175As expected, this pattern repeats between \proc count 72 and 96.
     176
     177Looking next at the right column on Intel, Figures~\ref{fig:cycle:jax:low:ops} and \ref{fig:cycle:jax:low:ns} show the results for 1 cycle of 5 \ats for each \proc.
     178\CFA and Tokio obtain very similar results overall, but Tokio shows more variations in the results.
      179Go achieves slightly better performance than \CFA and Tokio, but all three display significantly worse performance compared to the left column.
     180This decrease in performance is likely due to the additional overhead of the idle-sleep mechanism.
     181This can either be the result of \procs actually running out of work or simply additional overhead from tracking whether or not there is work available.
      182Indeed, unlike the left column, the ready queue is likely transiently empty, which triggers additional synchronization steps.
    179183Interestingly, libfibre achieves better performance with 1 cycle.
    180 \end{itemize}
    181 
    182 For the AMD architecture, Figure~\ref{fig:cycle:nasus}, the results show the same story as on the Intel, with close to double the performance overall but with slightly increased variation.
    183 The different performance improvements and plateaus are due to cache topology and appear at the expected \proc counts of 64, 128 and 192, for the same reasons as on Intel.
    184 \begin{itemize}
    185 \item
    186 For 100 cycles per \proc, unlike Intel, all 4 runtimes achieve very similar throughput and scalability.
    187 \item
    188 For 1 cycle per \proc, unlike on Intel, Tokio and Go have the same throughput performance, while \CFA is slightly slower.
    189 Again, the same performance increase for libfibre is visible.
    190 \end{itemize}
     184
      185Looking now at the AMD architecture, Figure~\ref{fig:cycle:nasus}, the results are overall similar to the Intel results, but with close to double the performance, slightly increased variation, and some differences in the details.
     186Note the maximum of the Y-axis on Intel and AMD differ significantly.
      187Looking at the left column on AMD, Figures~\ref{fig:cycle:nasus:ops} and \ref{fig:cycle:nasus:ns} show that all 4 runtimes achieve very similar throughput and scalability.
      188However, as the number of \procs grows, the results on AMD show notably more variability than on Intel.
      189The different performance improvements and plateaus are due to cache topology and appear at the expected \proc counts of 64, 128 and 192, for the same reasons as on Intel.
     190Looking next at the right column on AMD, Figures~\ref{fig:cycle:nasus:low:ops} and \ref{fig:cycle:nasus:low:ns}, Tokio and Go have the same throughput performance, while \CFA is slightly slower.
      191This result differs from Intel, where Tokio behaved like \CFA rather than like Go.
     192Again, the same performance increase for libfibre is visible when running fewer \ats.
    191193Note, I did not investigate the libfibre performance boost for 1 cycle in this experiment.
    192194
    193 The conclusion from both architectures is that all of the compared runtime have fairly equivalent performance for this micro-benchmark.
    194 Clearly, the pathological case with 1 \at per \proc, can affect fairness algorithms managing mostly idle processors, \eg \CFA, but only at high core counts.
    195 For this case, \emph{any} helping is likely to cause a cascade of \procs running out of work and attempting to steal.
    196 For this experiment, the \CFA scheduler has achieved the goal of obtaining equivalent performance to other less fair schedulers, except for very unusual workloads.
     195The conclusion from both architectures is that all of the compared runtimes have fairly equivalent performance for this micro-benchmark.
     196Clearly, the pathological case with 1 cycle per \proc can affect fairness algorithms managing mostly idle processors, \eg \CFA, but only at high core counts.
     197In this case, \emph{any} helping is likely to cause a cascade of \procs running out of work and attempting to steal.
     198For this experiment, the \CFA scheduler has achieved the goal of obtaining equivalent performance to other, less fair, schedulers.
    197199
    198200\section{Yield}
    199201
    200 For completion, the classic yield benchmark is included.
     202For completeness, the classic yield benchmark is included.
    201203Here, the throughput is dominated by the mechanism used to handle the @yield@ function.
    202204Figure~\ref{fig:yield:code} shows pseudo code for this benchmark, where the cycle @wait/next.wake@ is replaced by @yield@.
     
    216218}
    217219\end{cfa}
    218 \caption[Yield Benchmark : Pseudo Code]{Yield Benchmark : Pseudo Code}
     220\caption[Yield Benchmark: Pseudo Code]{Yield Benchmark: Pseudo Code}
    219221\label{fig:yield:code}
    220222%\end{figure}
     
    227229                \label{fig:yield:jax:ops}
    228230        }
    229         \subfloat[][Throughput, 1 \ats per \proc]{
     231        \subfloat[][Throughput, 1 \at per \proc]{
    230232                \resizebox{0.5\linewidth}{!}{
    231233                \input{result.yield.low.jax.ops.pstex_t}
     
    240242                \label{fig:yield:jax:ns}
    241243        }
    242         \subfloat[][Scalability, 1 \ats per \proc]{
     244        \subfloat[][Scalability, 1 \at per \proc]{
    243245                \resizebox{0.5\linewidth}{!}{
    244246                \input{result.yield.low.jax.ns.pstex_t}
     
    246248                \label{fig:yield:jax:low:ns}
    247249        }
    248         \caption[Yield Benchmark on Intel]{Yield Benchmark on Intel\smallskip\newline Throughput and scalability as a function of \proc count, using 1 \ats per \proc. For throughput, higher is better, for scalability, lower is better. Each series represent 15 independent runs, the dotted lines are extremes while the solid line is the medium.}
     250        \caption[Yield Benchmark on Intel]{Yield Benchmark on Intel\smallskip\newline Throughput and scalability as a function of \proc count.
      251        For throughput, higher is better; for scalability, lower is better.
      252        Each series represents 15 independent runs; the dashed lines are the maximums of each series, the solid lines the medians, and the dotted lines the minimums.}
    249253        \label{fig:yield:jax}
    250254\end{figure}
     
    252256\subsection{Results}
    253257
    254 Figures~\ref{fig:yield:jax} and~\ref{fig:yield:nasus} show the same throughput graphs as @cycle@ on Intel and AMD, respectively.
    255 Note, the Y-axis on the yield graph for Intel is twice as large as the Intel cycle-graph.
    256 A visual glance between the cycle and yield graphs confirms my claim that the yield benchmark is unreliable.
    257 
    258 For the Intel architecture, Figure~\ref{fig:yield:jax}:
    259 \begin{itemize}
    260 \item
     258Figures~\ref{fig:yield:jax} and \ref{fig:yield:nasus} show the results for the yield experiment on Intel and AMD, respectively.
     259Looking at the left column on Intel, Figures~\ref{fig:yield:jax:ops} and \ref{fig:yield:jax:ns} show the results for 100 \ats for each \proc.
      260Note that the Y-axis on this graph is twice as large as that of the Intel cycle graph.
      261A visual comparison of the left columns of the cycle and yield graphs confirms my claim that the yield benchmark is unreliable.
    261262\CFA has no special handling for @yield@, but this experiment requires less synchronization than the @cycle@ experiment.
    262 Hence, the @yield@ throughput and scalability graphs for both 100 and 1 cycles/tasks per processor have similar shapes to the corresponding @cycle@ graphs.
    263 The only difference is sightly better performance for @yield@ because of less synchronization.
    264 As for @cycle@, the cost of idle sleep also comes into play in a very significant way in Figure~\ref{fig:yield:jax:low:ns}, where the scaling is not flat.
    265 \item
    266 libfibre has special handling for @yield@ using the fact that the number of ready fibres does not change, and therefore, by-passing the idle-sleep mechanism entirely.
    267 Additionally, when only running 1 \at per \proc, libfibre optimizes further, and forgoes the context-switch entirely.
    268 Hence, libfibre behaves very differently in the cycle and yield benchmarks, with a 4 times increase in performance for 100 cycles/tasks and an 8 times increase for 1 cycle/task.
    269 \item
    270 Go has special handling for @yield@ by putting a yielding goroutine on a secondary global ready-queue, giving it lower priority.
     263Hence, the @yield@ throughput and scalability graphs have similar shapes to the corresponding @cycle@ graphs.
     264The only difference is slightly better performance for @yield@ because of less synchronization.
      265Libfibre has special handling for @yield@, exploiting the fact that the number of ready fibres does not change and thereby bypassing the idle-sleep mechanism entirely.
      266Hence, libfibre behaves very differently in the cycle and yield benchmarks, with a 4 times increase in performance in the left column.
     267Go has special handling for @yield@ by putting a yielding goroutine on a secondary global ready-queue, giving it a lower priority.
    271268The result is that multiple \glspl{hthrd} contend for the global queue and performance suffers drastically.
    272 Hence, Go behaves very differently in the cycle and yield benchmarks, with a complete performance collapse in @yield@ for both 100 and 1 cycles/tasks.
    273 \item
     269Hence, Go behaves very differently in the cycle and yield benchmarks, with a complete performance collapse in @yield@.
    274270Tokio has a similar performance collapse after 16 processors, and therefore, its special @yield@ handling is probably related to a Go-like scheduler problem and/or a \CFA idle-sleep problem.
    275271(I did not dig through the Rust code to ascertain the exact reason for the collapse.)
    276 \end{itemize}
     272Note that since there is no communication among \ats, locality problems are much less likely than for the cycle benchmark.
     273This lack of communication is probably why the plateaus due to topology are not present.
     274
     275Looking next at the right column on Intel, Figures~\ref{fig:yield:jax:low:ops} and \ref{fig:yield:jax:low:ns} show the results for 1 \at for each \proc.
     276As for @cycle@, \CFA's cost of idle sleep comes into play in a very significant way in Figure~\ref{fig:yield:jax:low:ns}, where the scaling is not flat.
     277This result is to be expected since fewer \ats mean \procs are more likely to run out of work.
     278On the other hand, when only running 1 \at per \proc, libfibre optimizes further and forgoes the context switch entirely.
      279This results in libfibre outperforming the other runtimes even more, achieving 8 times the throughput of @cycle@.
     280Finally, Go and Tokio's performance collapse is still the same with fewer \ats.
     281The only exception is Tokio running on 24 \procs, deepening the mystery of its yielding mechanism further.
    277282
    278283\begin{figure}
     
    302307                \label{fig:yield:nasus:low:ns}
    303308        }
    304         \caption[Yield Benchmark on AMD]{Yield Benchmark on AMD\smallskip\newline Throughput and scalability as a function of \proc count, using 1 \ats per \proc. For throughput, higher is better, for scalability, lower is better. Each series represent 15 independent runs, the dotted lines are extremes while the solid line is the medium.}
     309        \caption[Yield Benchmark on AMD]{Yield Benchmark on AMD\smallskip\newline Throughput and scalability as a function of \proc count.
      310        For throughput, higher is better; for scalability, lower is better.
      311        Each series represents 15 independent runs; the dashed lines are the maximums of each series, the solid lines the medians, and the dotted lines the minimums.}
    305312        \label{fig:yield:nasus}
    306313\end{figure}
    307314
    308 For the AMD architecture, Figure~\ref{fig:yield:nasus}, the results show the same story as on the Intel, with slightly increased variations.
    309 Also, some transition points on the X-axis differ because of the architectures, like at 16 versus 24 processors.
    310 
    311 It is difficult to draw conclusions for this benchmark when runtime system treat @yield@ so differently.
      315Looking now at the AMD architecture, Figure~\ref{fig:yield:nasus}, the results again show a story that is overall similar to Intel, with increased variation and some differences in the details.
      316Note that the maximums of the Y-axis on Intel and AMD differ less in @yield@ than in @cycle@.
      317Looking first at the left column, Figures~\ref{fig:yield:nasus:ops} and \ref{fig:yield:nasus:ns}, \CFA achieves throughput and scaling very similar to Intel.
     318Libfibre still outpaces all other runtimes, but it encounters a performance hit at 64 \procs.
      319This anomaly suggests some amount of communication between the \procs that the Intel machine is able to mask but the AMD machine is not, once hyperthreading is needed.
     320Go and Tokio still display the same performance collapse as on Intel.
     321Looking next at the right column on AMD, Figures~\ref{fig:yield:nasus:low:ops} and \ref{fig:yield:nasus:low:ns}, all runtime systems effectively behave the same as they did on the Intel machine.
      322At high \at counts, the only difference is libfibre's scaling, and this difference disappears in the right column.
      323This behaviour suggests that whatever communication issue libfibre encountered on the left is completely circumvented on the right.
     324
     325It is difficult to draw conclusions for this benchmark when runtime systems treat @yield@ so differently.
    312326The win for \CFA is its consistency between the cycle and yield benchmarks making it simpler for programmers to use and understand, \ie the \CFA semantics match with programmer intuition.
    313327
     
    315329\section{Churn}
    316330
    317 The Cycle and Yield benchmark represent an \emph{easy} scenario for a scheduler, \eg an embarrassingly parallel application.
    318 In these benchmarks, \ats can be easily partitioned over the different \procs upfront and none of the \ats communicate with each other.
    319 
    320 The Churn benchmark represents more chaotic executions, where there is more communication among \ats but no relationship between the last \proc on which a \at ran and blocked and the \proc that subsequently unblocks it.
     331The Cycle and Yield benchmarks represent an \emph{easy} scenario for a scheduler, \eg an embarrassingly parallel application.
     332In these benchmarks \ats can be easily partitioned over the different \procs upfront and none of the \ats communicate with each other.
     333
     334The Churn benchmark represents more chaotic executions, where there is more communication among \ats but no relationship between the last \proc on which a \at ran and blocked, and the \proc that subsequently unblocks it.
    321335With processor-specific ready-queues, when a \at is unblocked by a different \proc that means the unblocking \proc must either ``steal'' the \at from another processor or find it on a remote queue.
    322336This dequeuing results in either contention on the remote queue and/or \glspl{rmr} on the \at data structure.
    323 Hence, this benchmark has performance dominated by the cache traffic as \proc are constantly accessing the each other's data.
    324 In either case, this benchmark aims to measure how well a scheduler handles these cases, since both cases can lead to performance degradation if not handled correctly.
     337Hence, this benchmark has performance dominated by the cache traffic as \procs are constantly accessing each other's data.
     338In either case, this benchmark aims to measure how well a scheduler handles these cases since both cases can lead to performance degradation if not handled correctly.
    325339
    326340This benchmark uses a fixed-size array of counting semaphores.
    327 Each \at picks a random semaphore, @V@s it to unblock any waiting \at, and then @P@s (maybe blocks) the \ats on the semaphore.
     341Each \at picks a random semaphore, @V@s it to unblock any waiting \at, and then @P@s (maybe blocks) the \at on the semaphore.
    328342This creates a flow where \ats push each other out of the semaphores before being pushed out themselves.
    329 For this benchmark to work, the number of \ats must be equal or greater than the number of semaphores plus the number of \procs;
    330 \eg if there are 10 semaphores and 5 \procs, but only 3 \ats, all 3 \ats can block (P) on a random semaphore and now there is no \ats to unblock (V) them.
    331 Note, the nature of these semaphores mean the counter can go beyond 1, which can lead to nonblocking calls to @P@.
     343For this benchmark to work, the number of \ats must be equal to or greater than the number of semaphores plus the number of \procs;
     344\eg if there are 10 semaphores and 5 \procs, but only 3 \ats, all 3 \ats can block (P) on a random semaphore and now there are no \ats to unblock (V) them.
     345Note that the nature of these semaphores means the counter can go beyond 1, which can lead to nonblocking calls to @P@.
    332346Figure~\ref{fig:churn:code} shows pseudo code for this benchmark, where the @yield@ is replaced by @V@ and @P@.
    333347
     
    346360}
    347361\end{cfa}
    348 \caption[Churn Benchmark : Pseudo Code]{Churn Benchmark : Pseudo Code}
     362\caption[Churn Benchmark: Pseudo Code]{Churn Benchmark: Pseudo Code}
    349363\label{fig:churn:code}
    350364%\end{figure}
     
    364378        }
    365379
    366         \subfloat[][Latency, 100 \ats per \proc]{
     380        \subfloat[][Scalability, 100 \ats per \proc]{
    367381                \resizebox{0.5\linewidth}{!}{
    368382                        \input{result.churn.jax.ns.pstex_t}
     
    370384                \label{fig:churn:jax:ns}
    371385        }
    372         \subfloat[][Latency, 2 \ats per \proc]{
     386        \subfloat[][Scalability, 2 \ats per \proc]{
    373387                \resizebox{0.5\linewidth}{!}{
    374388                        \input{result.churn.low.jax.ns.pstex_t}
     
    376390                \label{fig:churn:jax:low:ns}
    377391        }
    378         \caption[Churn Benchmark on Intel]{\centering Churn Benchmark on Intel\smallskip\newline Throughput and latency of the Churn on the benchmark on the Intel machine. For throughput, higher is better, for scalability, lower is better. Each series represent 15 independent runs, the dotted lines are extremes while the solid line is the medium.}
     392        \caption[Churn Benchmark on Intel]{Churn Benchmark on Intel\smallskip\newline Throughput and scalability as a function of \proc count.
     393        For throughput, higher is better, for scalability, lower is better.
     394        Each series represent 15 independent runs, the dashed lines are the maximums of each series while the solid lines are the median and the dotted lines are the minimums.}
    379395        \label{fig:churn:jax}
    380396\end{figure}
     
    382398\subsection{Results}
    383399
    384 Figures~\ref{fig:churn:jax} and Figure~\ref{fig:churn:nasus} show the throughput on Intel and AMD respectively.
    385 
    386 The performance cost of crossing the cache boundaries is still visible at the same \proc count.
    387 
    388 Scalability is notably worst than the previous benchmarks since there is inherently more communication between processors.
    389 Indeed, once the number of \glspl{hthrd} goes beyond a single socket, performance ceases to improve.
     400Figures~\ref{fig:churn:jax} and Figure~\ref{fig:churn:nasus} show the results for the churn experiment on Intel and AMD, respectively.
     401Looking at the left column on Intel, Figures~\ref{fig:churn:jax:ops} and \ref{fig:churn:jax:ns} show the results for 100 \ats for each \proc, and all runtimes obtain fairly similar throughput for most \proc counts.
     402\CFA does very well on a single \proc but quickly loses its advantage over the other runtimes.
     403As expected, it scales decently up to 48 \procs, drops from 48 to 72 \procs, and then plateaus.
     404Tokio achieves very similar performance to \CFA, with the starting boost, scaling decently until 48 \procs, drops from 48 to 72 \procs, and starts increasing again to 192 \procs.
     405Libfibre obtains effectively the same results as Tokio with slightly less scaling, \ie the scaling curve is the same but with slightly lower values.
     406Finally, Go gets the most peculiar results, scaling worst than other runtimes until 48 \procs.
     407At 72 \procs, the results of the Go runtime vary significantly, sometimes scaling sometimes plateauing.
     408However, beyond this point Go keeps this level of variation but does not scale further in any of the runs.
     409
     410Throughput and scalability are notably worst for all runtimes than the previous benchmarks since there is inherently more communication between processors.
     411Indeed, none of the runtimes reach 40 million operations per second while in the cycle benchmark all but libfibre reached 400 million operations per second.
     412Figures~\ref{fig:churn:jax:ns} and \ref{fig:churn:jax:low:ns} show that for all \proc counts, all runtimes produce poor scaling.
     413However, once the number of \glspl{hthrd} goes beyond a single socket, at 48 \procs, scaling goes from bad to worst and performance completely ceases to improve.
     414At this point, the benchmark is dominated by inter-socket communication costs for all runtimes.
     415
    390416An interesting aspect to note here is that the runtimes differ in how they handle this situation.
    391 Indeed, when a \proc unparks a \at that was last run on a different \proc, the \at could be appended to the ready-queue local \proc or to the ready-queue of the remote \proc, which previously ran the \at.
    392 \CFA, Tokio and Go all use the approach of unparking to the local \proc while Libfibre unparks to the remote \proc.
    393 In this particular benchmark, the inherent chaos of the benchmark in addition to small memory footprint means neither approach wins over the other.
     417Indeed, when a \proc unparks a \at that was last run on a different \proc, the \at could be appended to the ready queue of the local \proc or to the ready queue of the remote \proc, which previously ran the \at.
     418\CFA, Tokio and Go all use the approach of \glslink{atsched}{unparking} to the local \proc, while Libfibre unparks to the remote \proc.
     419In this particular benchmark, the inherent chaos of the benchmark, in addition to the small memory footprint, means neither approach wins over the other.
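
To make the two policies concrete, the following is a minimal sketch in Go; all names here are hypothetical illustrations, not the actual API of any of the four runtimes.
\begin{cfa}
package main

import "fmt"

// Hypothetical sketch of the two unparking policies discussed above.
type Task struct {
	name     string
	lastProc *Proc // the processor this task last ran on
}

type Proc struct {
	ready []*Task // ready queue, simplified to a slice
}

// unparkLocal appends the task to the waker's own ready queue
// (the policy of CFA, Tokio and Go).
func unparkLocal(waker *Proc, t *Task) {
	waker.ready = append(waker.ready, t)
}

// unparkRemote appends the task to the ready queue of the processor
// it last ran on (the policy of libfibre).
func unparkRemote(t *Task) {
	t.lastProc.ready = append(t.lastProc.ready, t)
}

func main() {
	local, remote := &Proc{}, &Proc{}
	t := &Task{name: "worker", lastProc: remote}
	unparkLocal(local, t) // t becomes ready near the waker, away from its old cache state
	fmt.Println(len(local.ready), len(remote.ready)) // 1 0
}
\end{cfa}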
     420
     421Looking next at the right column on Intel, Figures~\ref{fig:churn:jax:low:ops} and \ref{fig:churn:jax:low:ns} show the results for 1 \at for each \proc, and many of the differences between the runtimes disappear.
     422\CFA outperforms other runtimes by a minuscule margin.
     423Libfibre follows very closely behind with basically the same performance and scaling.
     424Tokio maintains effectively the same curve shapes as \CFA and libfibre, but it incurs extra costs for all \proc counts.
      425While Go maintains overall similar results to the others, it again encounters significant variation at high \proc counts,
      426inexplicably resulting in super-linear scaling for some runs, \ie the scalability curves display a negative slope.
     427
     428Interestingly, unlike the cycle benchmark, running with fewer \ats does not produce drastically different results.
     429In fact, the overall throughput stays almost exactly the same on the left and right columns.
    394430
    395431\begin{figure}
     
    407443        }
    408444
    409         \subfloat[][Latency, 100 \ats per \proc]{
     445        \subfloat[][Scalability, 100 \ats per \proc]{
    410446                \resizebox{0.5\linewidth}{!}{
    411447                        \input{result.churn.nasus.ns.pstex_t}
     
    413449                \label{fig:churn:nasus:ns}
    414450        }
    415         \subfloat[][Latency, 2 \ats per \proc]{
     451        \subfloat[][Scalability, 2 \ats per \proc]{
    416452                \resizebox{0.5\linewidth}{!}{
    417453                        \input{result.churn.low.nasus.ns.pstex_t}
     
    419455                \label{fig:churn:nasus:low:ns}
    420456        }
    421         \caption[Churn Benchmark on AMD]{\centering Churn Benchmark on AMD\smallskip\newline Throughput and latency of the Churn on the benchmark on the AMD machine.
    422         For throughput, higher is better, for scalability, lower is better. Each series represent 15 independent runs, the dotted lines are extremes while the solid line is the medium.}
     457        \caption[Churn Benchmark on AMD]{Churn Benchmark on AMD\smallskip\newline Throughput and scalability as a function of \proc count.
     458        For throughput, higher is better, for scalability, lower is better.
      459        Each series represents 15 independent runs; the dashed lines are the maximums of each series, while the solid lines are the medians and the dotted lines are the minimums.}
    423460        \label{fig:churn:nasus}
    424461\end{figure}
    425462
    426 Like for the cycle benchmark, here all runtimes achieve fairly similar performance.
    427 Performance improves as long as all \procs fit on a single socket.
    428 Beyond that performance starts to suffer from increased caching costs.
    429 
    430 Indeed on Figures~\ref{fig:churn:jax:ops} and \ref{fig:churn:jax:ns} show that with 1 and 100 \ats per \proc, \CFA, libfibre, Go and Tokio achieve effectively equivalent performance for most \proc count.
    431 
    432 However, Figure~\ref{fig:churn:nasus} again shows a somewhat different story on AMD.
    433 While \CFA, libfibre, and Tokio achieve effectively equivalent performance for most \proc count, Go starts with better scaling at very low \proc counts but then performance quickly plateaus, resulting in worse performance at higher \proc counts.
    434 This performance difference is visible at both high and low \at counts.
    435 
    436 One possible explanation for this difference is that since Go has very few available concurrent primitives, a channel was used instead of a semaphore.
    437 On paper a semaphore can be replaced by a channel and with zero-sized objects passed along equivalent performance could be expected.
    438 However, in practice there can be implementation difference between the two.
    439 This is especially true if the semaphore count can get somewhat high.
    440 Note that this replacement is also made in the cycle benchmark, however in that context it did not seem to have a notable impact.
    441 
    442 As second possible explanation is that Go may sometimes use the heap when allocating variables based on the result of escape analysis of the code.
    443 It is possible that variables that should be placed on the stack are placed on the heap.
    444 This could cause extra pointer chasing in the benchmark, heightening locality effects.
    445 Depending on how the heap is structure, this could also lead to false sharing.
    446 
    447 The objective of this benchmark is to demonstrate that unparking \ats from remote \procs do not cause too much contention on the local queues.
    448 Indeed, the fact all runtimes achieve some scaling at lower \proc count demonstrate that migrations do not need to be serialized.
    449 Again these result demonstrate \CFA achieves satisfactory performance.
     463
      464Looking now at the results for the AMD architecture, Figure~\ref{fig:churn:nasus}, the story is somewhat different.
     465Looking at the left column first, Figures~\ref{fig:churn:nasus:ops} and \ref{fig:churn:nasus:ns}, \CFA, Libfibre and Tokio all produce decent scalability.
     466\CFA suffers particularly from larger variations at higher \proc counts, but largely outperforms the other runtimes.
      467Go still produces intriguing results in this case and, even more intriguingly, the results have fairly low variation.
     468
     469One possible explanation for Go's difference is that it has very few available concurrent primitives, so a channel is substituted for a semaphore.
     470On paper, a semaphore can be replaced by a channel, and with zero-sized objects passed through the channel, equivalent performance could be expected.
     471However, in practice, there are implementation differences between the two, \eg if the semaphore count can get somewhat high so objects accumulate in the channel.
     472Note that this substitution is also made in the cycle benchmark;
     473however, in that context, it did not have a notable impact.
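
As a sketch of this substitution, a counting semaphore can be built from a buffered Go channel of zero-sized elements; this is the standard Go idiom, not necessarily the exact code used in the benchmark.
\begin{cfa}
package main

// A counting semaphore built from a buffered channel of zero-sized
// elements; struct{}{} values occupy no space in the buffer.
type semaphore chan struct{}

// V: the send blocks if the channel is full, i.e., once the semaphore
// count reaches the buffer capacity, V is no longer free.
func (s semaphore) V() { s <- struct{}{} }

// P: the receive blocks until a V has deposited a value.
func (s semaphore) P() { <-s }

func main() {
	sem := make(semaphore, 100) // capacity bounds the maximum count
	sem.V()
	sem.P()
}
\end{cfa}
The buffer capacity is where the two primitives diverge: a true semaphore count is unbounded, while the channel eventually applies back-pressure on @V@, which matters exactly when the count gets high.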
     474
     475A second possible explanation is that Go may use the heap when allocating variables based on the result of the escape analysis of the code.
     476It is possible for variables that could be placed on the stack to instead be placed on the heap.
     477This placement could cause extra pointer chasing in the benchmark, heightening locality effects.
     478Depending on how the heap is structured, this could also lead to false sharing.
     479I did not investigate what causes these unusual results.
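
For illustration, a small Go example of the effect; the compiler's @-gcflags=-m@ diagnostic flag reports these escape-analysis decisions, \eg ``moved to heap''.
\begin{cfa}
package main

// escapes returns a pointer to a local variable, so the compiler must
// move x to the heap; "go build -gcflags=-m" reports "moved to heap: x".
func escapes() *int {
	x := 42
	return &x
}

// stays never leaks a reference to x, so x can live on the stack.
func stays() int {
	x := 42
	return x
}

func main() {
	_ = escapes()
	_ = stays()
}
\end{cfa}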
     480
      481Looking next at the right column, Figures~\ref{fig:churn:nasus:low:ops} and \ref{fig:churn:nasus:low:ns}, as for Intel, all runtimes obtain overall similar throughput between the left and right columns.
     482\CFA, Libfibre and Tokio all have very close results.
     483Go still suffers from poor scalability but is now unusual in a different way.
      484While it obtains effectively constant performance regardless of \proc count, this ``sequential'' performance is higher than that of the other runtimes for low \proc counts,
      485up to 32 \procs, after which the other runtimes manage to outscale Go.
     486
     487In conclusion, the objective of this benchmark is to demonstrate that \glslink{atsched}{unparking} \ats from remote \procs does not cause too much contention on the local queues.
      488Indeed, the fact that most runtimes achieve some scaling between various \proc counts demonstrates that migrations do not need to be serialized.
      489Again, these results demonstrate that \CFA achieves satisfactory performance compared to the other runtimes.
    450490
    451491\section{Locality}
     492
      493As mentioned in the churn benchmark, when \glslink{atsched}{unparking} a \at, it is possible to \unpark to either the local or the remote ready-queue.\footnote{
     494It is also possible to \unpark to a third unrelated ready-queue, but without additional knowledge about the situation, it is likely to degrade performance.}
     495The locality experiment includes two variations of the churn benchmark, where a data array is added.
     496In both variations, before @V@ing the semaphore, each \at calls a @work@ function which increments random cells inside the data array.
     497In the noshare variation, the array is not passed on and each thread continuously accesses its private array.
     498In the share variation, the array is passed to another thread via the semaphore's shadow queue (each blocking thread can save a word of user data in its blocking node), transferring ownership of the array to the woken thread.
     499Figure~\ref{fig:locality:code} shows the pseudo code for this benchmark.
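
As a sketch of the @work@ routine, in Go (the array size and the number of cells touched below are illustrative assumptions, not the parameters used in the experiment):
\begin{cfa}
package main

import "math/rand"

// work increments nCells randomly chosen cells of data, touching the
// corresponding cache lines on the executing processor.
func work(data []uint64, nCells int, rng *rand.Rand) {
	for i := 0; i < nCells; i++ {
		data[rng.Intn(len(data))]++
	}
}

func main() {
	rng := rand.New(rand.NewSource(42))
	data := make([]uint64, 1024) // the private or transferred array
	work(data, 16, rng)          // called before V'ing the semaphore
}
\end{cfa}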
     500
     501The objective here is to highlight the different decisions made by the runtime when \glslink{atsched}{unparking}.
      502Since each thread unparks a random semaphore, it is unlikely that a \at is unparked from the last \proc it ran on.
      503In the share variation, \glslink{atsched}{unparking} the \at on the local \proc is an appropriate choice since the data was last modified on that \proc.
      504In the noshare variation, \glslink{atsched}{unparking} the \at on the remote \proc it last ran on is an appropriate choice, since that is where its private array was last modified.
     505
     506The expectation for this benchmark is to see a performance inversion, where runtimes fare notably better in the variation which matches their \glslink{atsched}{unparking} policy.
      507This decision should lead to \CFA, Go and Tokio achieving better performance in the share variation while libfibre achieves better performance in the noshare variation.
     508Indeed, \CFA, Go and Tokio have the default policy of \glslink{atsched}{unparking} \ats on the local \proc, whereas libfibre has the default policy of \glslink{atsched}{unparking} \ats wherever they last ran.
    452509
    453510\begin{figure}
     
    493550\end{lrbox}
    494551
    495 \subfloat[Thread$_1$]{\label{f:CFibonacci}\usebox\myboxA}
     552\subfloat[Noshare]{\label{fig:locality:code:T1}\usebox\myboxA}
    496553\hspace{3pt}
    497554\vrule
    498555\hspace{3pt}
    499 \subfloat[Thread$_2$]{\label{f:CFAFibonacciGen}\usebox\myboxB}
    500 
    501 \caption[Locality Benchmark : Pseudo Code]{Locality Benchmark : Pseudo Code}
     556\subfloat[Share]{\label{fig:locality:code:T2}\usebox\myboxB}
     557
     558\caption[Locality Benchmark: Pseudo Code]{Locality Benchmark: Pseudo Code}
    502559\label{fig:locality:code}
    503560\end{figure}
    504561
    505 As mentioned in the churn benchmark, when unparking a \at, it is possible to either unpark to the local or remote ready-queue.
    506 \footnote{It is also possible to unpark to a third unrelated ready-queue, but without additional knowledge about the situation, there is little to suggest this would not degrade performance.}
    507 The locality experiment includes two variations of the churn benchmark, where an array of data is added.
    508 In both variations, before @V@ing the semaphore, each \at increment random cells inside the array.
    509 The @share@ variation then passes the array to the shadow-queue of the semaphore, transferring ownership of the array to the woken thread.
    510 In the @noshare@ variation the array is not passed on and each thread continuously accesses its private array.
    511 
    512 The objective here is to highlight the different decision made by the runtime when unparking.
    513 Since each thread unparks a random semaphore, it means that it is unlikely that a \at will be unparked from the last \proc it ran on.
    514 In the @share@ version, this means that unparking the \at on the local \proc is appropriate since the data was last modified on that \proc.
    515 In the @noshare@ version, the unparking the \at on the remote \proc is the appropriate approach.
    516 
    517 The expectation for this benchmark is to see a performance inversion, where runtimes will fare notably better in the variation which matches their unparking policy.
    518 This should lead to \CFA, Go and Tokio achieving better performance in @share@ while libfibre achieves better performance in @noshare@.
    519 Indeed, \CFA, Go and Tokio have the default policy of unpark \ats on the local \proc, where as libfibre has the default policy of unparks \ats wherever they last ran.
    520 
    521562\subsection{Results}
    522563
     564Figures~\ref{fig:locality:jax} and \ref{fig:locality:nasus} show the results for the locality experiment on Intel and AMD, respectively.
      565In both cases, the graphs on the left column show the results for the share variation and the graphs on the right column show the results for the noshare variation.
     566Looking at the left column on Intel, Figures~\ref{fig:locality:jax:share:ops} and \ref{fig:locality:jax:share:ns} show the results for the share variation.
     567\CFA and Tokio slightly outperform libfibre, as expected, based on their \ats placement approach.
     568\CFA and Tokio both \unpark locally and do not suffer cache misses on the transferred array.
     569Libfibre, on the other hand, unparks remotely, and as such the unparked \at is likely to miss on the shared data.
     570Go trails behind in this experiment, presumably for the same reasons that were observable in the churn benchmark.
     571Otherwise, the results are similar to the churn benchmark, with lower throughput due to the array processing.
     572As for most previous results, all runtimes suffer a performance hit after 48 \procs, which is the socket boundary, and climb again from 96 to 192 \procs.
     573
    523574\begin{figure}
    524575        \subfloat[][Throughput share]{
     
    547598                \label{fig:locality:jax:noshare:ns}
    548599        }
    549         \caption[Locality Benchmark on Intel]{Locality Benchmark on Intel\smallskip\newline Throughput and scalability as a function of \proc count. For throughput, higher is better, for scalability, lower is better. Each series represent 15 independent runs, the dotted lines are extremes while the solid line is the medium.}
     600        \caption[Locality Benchmark on Intel]{Locality Benchmark on Intel\smallskip\newline Throughput and scalability as a function of \proc count.
     601        For throughput, higher is better, for scalability, lower is better.
      602        Each series represents 15 independent runs; the dashed lines are the maximums of each series, while the solid lines are the medians and the dotted lines are the minimums.}
    550603        \label{fig:locality:jax}
    551604\end{figure}
     605
    552606\begin{figure}
    553607        \subfloat[][Throughput share]{
     
    576630                \label{fig:locality:nasus:noshare:ns}
    577631        }
    578         \caption[Locality Benchmark on AMD]{Locality Benchmark on AMD\smallskip\newline Throughput and scalability as a function of \proc count. For throughput, higher is better, for scalability, lower is better. Each series represent 15 independent runs, the dotted lines are extremes while the solid line is the medium.}
     632        \caption[Locality Benchmark on AMD]{Locality Benchmark on AMD\smallskip\newline Throughput and scalability as a function of \proc count.
     633        For throughput, higher is better, for scalability, lower is better.
      634        Each series represents 15 independent runs; the dashed lines are the maximums of each series, while the solid lines are the medians and the dotted lines are the minimums.}
    579635        \label{fig:locality:nasus}
    580636\end{figure}
    581637
    582 Figures~\ref{fig:locality:jax} and \ref{fig:locality:nasus} shows the results on Intel and AMD respectively.
    583 In both cases, the graphs on the left column show the results for the @share@ variation and the graphs on the right column show the results for the @noshare@.
    584 
    585 On Intel, Figure~\ref{fig:locality:jax} shows Go trailing behind the 3 other runtimes.
    586 On the left of the figure showing the results for the shared variation, where \CFA and Tokio slightly outperform libfibre as expected.
    587 And correspondingly on the right, we see the expected performance inversion where libfibre now outperforms \CFA and Tokio.
    588 Otherwise the results are similar to the churn benchmark, with lower throughput due to the array processing.
    589 Presumably the reason why Go trails behind are the same as in Figure~\ref{fig:churn:nasus}.
    590 
    591 Figure~\ref{fig:locality:nasus} shows the same experiment on AMD.
    592 \todo{why is cfa slower?}
    593 Again, we see the same story, where Tokio and libfibre swap places and Go trails behind.
     638Looking at the right column on Intel, Figures~\ref{fig:locality:jax:noshare:ops} and \ref{fig:locality:jax:noshare:ns} show the results for the noshare variation.
     639The graphs show the expected performance inversion where libfibre now outperforms \CFA and Tokio.
      640Indeed, in this case, unparking remotely means the unparked \at is less likely to suffer a cache miss on the array, which leaves the \at data structure and the remote queue as the only likely sources of cache misses.
     641Results show both are amortized fairly well in this case.
      642\CFA and Tokio both \unpark locally and, as a result, suffer a marginal performance degradation from the cache miss on the array.
     643
      644Looking at the results for the AMD architecture, Figure~\ref{fig:locality:nasus} shows results similar to those on Intel.
      645Again, the overall performance is higher and slightly more variation is visible.
     646Looking at the left column first, Figures~\ref{fig:locality:nasus:share:ops} and \ref{fig:locality:nasus:share:ns}, \CFA and Tokio still outperform libfibre, this time more significantly.
     647This advantage is expected from the AMD server with its smaller and narrower caches that magnify the costs of processing the array.
     648Go still has the same poor performance as on Intel.
     649
      650Finally, looking at the right column, Figures~\ref{fig:locality:nasus:noshare:ops} and \ref{fig:locality:nasus:noshare:ns}, the same performance inversion as on Intel is present between libfibre and \CFA/Tokio.
     651Go still has the same poor performance.
     652
     653Overall, this benchmark mostly demonstrates the two options available when \glslink{atsched}{unparking} a \at.
     654Depending on the workload, either of these options can be the appropriate one.
      655Since it is prohibitively difficult to dynamically detect which approach is appropriate, all runtimes must choose one of the two and live with the consequences.
     656
     657Once again, these experiments demonstrate that \CFA achieves equivalent performance to the other runtimes, in this case matching the faster Tokio rather than Go, which is trailing behind.
    594658
    595659\section{Transfer}
    596660The last benchmark is more of an experiment than a benchmark.
    597661It tests the behaviour of the schedulers for a misbehaved workload.
    598 In this workload, one of the \at is selected at random to be the leader.
     662In this workload, one \at is selected at random to be the leader.
    599663The leader then spins in a tight loop until it has observed that all other \ats have acknowledged its leadership.
    600664The leader \at then picks a new \at to be the next leader and the cycle repeats.
    601 The benchmark comes in two flavours for the non-leader \ats:
     665The benchmark comes in two variations for the non-leader \ats:
     602666once they acknowledge the leader, they either block on a semaphore or spin while yielding.
    603 
    604 The experiment is designed to evaluate the short-term load-balancing of a scheduler.
    605 Indeed, schedulers where the runnable \ats are partitioned on the \procs may need to balance the \ats for this experiment to terminate.
    606 This problem occurs because the spinning \at is effectively preventing the \proc from running any other \at.
    607 In the semaphore flavour, the number of runnable \ats eventually dwindles down to only the leader.
    608 This scenario is a simpler case to handle for schedulers since \procs eventually run out of work.
    609 In the yielding flavour, the number of runnable \ats stays constant.
    610 This scenario is a harder case to handle because corrective measures must be taken even when work is available.
    611 Note, runtime systems with preemption circumvent this problem by forcing the spinner to yield.
    612 
    613 In both flavours, the experiment effectively measures how long it takes for all \ats to run once after a given synchronization point.
    614 In an ideal scenario where the scheduler is strictly FIFO, every thread would run once after the synchronization and therefore the delay between leaders would be given by:
    615 $ \frac{CSL + SL}{NP - 1}$, where $CSL$ is the context switch latency, $SL$ is the cost for enqueueing and dequeuing a \at and $NP$ is the number of \procs.
    616 However, if the scheduler allows \ats to run many times before other \ats are able to run once, this delay will increase.
    617 The semaphore version is an approximation of the strictly FIFO scheduling, where none of the \ats \emph{attempt} to run more than once.
    618 The benchmark effectively provides the fairness guarantee in this case.
    619 In the yielding version however, the benchmark provides no such guarantee, which means the scheduler has full responsibility and any unfairness will be measurable.
    620 
    621 While this is a fairly artificial scenario, it requires only a few simple pieces.
    622 The yielding version of this simply creates a scenario where a \at runs uninterrupted in a saturated system, and starvation has an easily measured impact.
    623 However, \emph{any} \at that runs uninterrupted for a significant period of time in a saturated system could lead to this kind of starvation.
     667Figure~\ref{fig:transfer:code} shows pseudo code for this benchmark.
    624668
    625669\begin{figure}
     
    641685        // pick next leader
    642686        leader := threads[ prng() % len(threads) ]
    643         // wake every one
     687        // wake everyone
    644688        if ! exhaust {
    645689                for t in threads {
     
    660704}
    661705\end{cfa}
    662 \caption[Transfer Benchmark : Pseudo Code]{Transfer Benchmark : Pseudo Code}
     706\caption[Transfer Benchmark: Pseudo Code]{Transfer Benchmark: Pseudo Code}
    663707\label{fig:transfer:code}
    664708\end{figure}
    665709
     710The experiment is designed to evaluate the short-term load balancing of a scheduler.
     711Indeed, schedulers where the runnable \ats are partitioned on the \procs may need to balance the \ats for this experiment to terminate.
     712This problem occurs because the spinning \at is effectively preventing the \proc from running any other \at.
     713In the semaphore variation, the number of runnable \ats eventually dwindles to only the leader.
     714This scenario is a simpler case to handle for schedulers since \procs eventually run out of work.
     715In the yielding variation, the number of runnable \ats stays constant.
     716This scenario is a harder case to handle because corrective measures must be taken even when work is available.
     717Note that runtimes with preemption circumvent this problem by forcing the spinner to yield.
      718In \CFA, preemption was disabled as it only obfuscates the results.
     719I am not aware of a method to disable preemption in Go.
     720
     721In both variations, the experiment effectively measures how long it takes for all \ats to run once after a given synchronization point.
      722In an ideal scenario where the scheduler is strictly FIFO, every thread would run once after the synchronization and therefore the delay between leaders would be given by $(CSL + SL) / (NP - 1)$,
     723where $CSL$ is the context-switch latency, $SL$ is the cost for enqueueing and dequeuing a \at, and $NP$ is the number of \procs.
     724However, if the scheduler allows \ats to run many times before other \ats can run once, this delay increases.
     725The semaphore version is an approximation of strictly FIFO scheduling, where none of the \ats \emph{attempt} to run more than once.
     726The benchmark effectively provides the fairness guarantee in this case.
     727In the yielding version, however, the benchmark provides no such guarantee, which means the scheduler has full responsibility and any unfairness is measurable.
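
For concreteness, plugging in purely illustrative values, $CSL = SL = 1000$~ns and $NP = 192$, gives an ideal delay between leaders of
\begin{displaymath}
\frac{CSL + SL}{NP - 1} = \frac{2000~\mbox{ns}}{191} \approx 10~\mbox{ns},
\end{displaymath}
so measured delays that sit orders of magnitude above this ideal indicate that some \ats ran many times before others could run once.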
     728
      729While this is an artificial scenario, it requires only a few simple pieces that can all occur in real life.
      730The yielding version simply creates a scenario where a \at runs uninterrupted in a saturated system and starvation has an easily measured impact.
     731Hence, \emph{any} \at that runs uninterrupted for a significant time in a saturated system could lead to this kind of starvation.
     732
    666733\subsection{Results}
    667 \begin{figure}
     734
     735\begin{table}
     736\caption[Transfer Benchmark on Intel and AMD]{Transfer Benchmark on Intel and AMD\smallskip\newline Average measurement of how long it takes for all \ats to acknowledge the leader \at.
      737DNC stands for ``did not complete'', meaning that 5 seconds after a new leader was decided, some \ats still had not acknowledged it.
     738\label{fig:transfer:res}
     739\setlength{\extrarowheight}{2pt}
     740\setlength{\tabcolsep}{5pt}
    668741\begin{centering}
    669 \begin{tabular}{r | c c c c | c c c c }
    670 Machine   &                     \multicolumn{4}{c |}{Intel}                &          \multicolumn{4}{c}{AMD}                    \\
    671 Variation & \multicolumn{2}{c}{Park} & \multicolumn{2}{c |}{Yield} & \multicolumn{2}{c}{Park} & \multicolumn{2}{c}{Yield} \\
     742\begin{tabular}{r | c | c | c | c | c | c | c | c}
     743Machine   &                     \multicolumn{4}{c |}{Intel}                &          \multicolumn{4}{c}{AMD}             \\
     744\cline{2-9}
     745Variation & \multicolumn{2}{c|}{Park} & \multicolumn{2}{c |}{Yield} & \multicolumn{2}{c|}{Park} & \multicolumn{2}{c}{Yield} \\
     746\cline{2-9}
    672747\procs    &      2      &      192   &      2      &      192      &      2      &      256   &      2      &      256    \\
    673748\hline
     
    678753\end{tabular}
    679754\end{centering}
    680 \caption[Transfer Benchmark on Intel and AMD]{Transfer Benchmark on Intel and AMD\smallskip\newline Average measurement of how long it takes for all \ats to acknowledge the leader \at. DNC stands for ``did not complete'', meaning that after 5 seconds of a new leader being decided, some \ats still had not acknowledged the new leader.}
    681 \label{fig:transfer:res}
    682 \end{figure}
    683 
    684 Figure~\ref{fig:transfer:res} shows the result for the transfer benchmark with 2 \procs and all \procs, where each experiment runs 100 \at per \proc.
     755\end{table}
     756
      757Table~\ref{fig:transfer:res} shows the results for the transfer benchmark with 2 \procs and with all \procs on the machine, where each experiment runs 100 \ats per \proc.
    685758Note that the results here are only meaningful as a coarse measurement of fairness, beyond which small cost differences in the runtime and concurrent primitives begin to matter.
    686 As such, data points that are the on the same order of magnitude as each other should be basically considered equal.
    687 The takeaway of this experiment is the presence of very large differences.
    688 The semaphore variation is denoted ``Park'', where the number of \ats dwindles down as the new leader is acknowledged.
     759As such, data points within the same order of magnitude are considered equal.
     760That is, the takeaway of this experiment is the presence of very large differences.
     761The semaphore variation is denoted ``Park'', where the number of \ats dwindles as the new leader is acknowledged.
    689762The yielding variation is denoted ``Yield''.
    690 The experiment was only run for the extremes of the number of cores since the scaling per core behaves like previous experiments.
    691 This experiments clearly demonstrate that while the other runtimes achieve similar performance in previous benchmarks, here \CFA achieves significantly better fairness.
    692 The semaphore variation serves as a control group, where all runtimes are expected to transfer leadership fairly quickly.
    693 Since \ats block after acknowledging the leader, this experiment effectively measures how quickly \procs can steal \ats from the \proc running leader.
    694 Figure~\ref{fig:transfer:res} shows that while Go and Tokio are slower, all runtime achieve decent latency.
      763The experiment is only run with few and with many \procs, since scaling is not the focus of this experiment.
     764
     765The first two columns show the results for the semaphore variation on Intel.
      766While there are some differences in latencies, with \CFA consistently the fastest and Tokio the slowest, all runtimes achieve fairly close results.
      767Again, this experiment is meant to highlight major differences, so latencies within $10\times$ of each other are considered equal.
     768
     769Looking at the next two columns, the results for the yield variation on Intel, the story is very different.
      770\CFA achieves better latencies, presumably because the yield requires no synchronization.
     771Go does complete the experiment, but with drastically higher latency:
      772latency at 2 \procs is $350\times$ higher than \CFA's and $70\times$ higher at 192 \procs.
      773This difference is because Go has a classic work-stealing scheduler, but it adds coarse-grained preemption,
      774which interrupts the spinning leader after a period.
     775Neither Libfibre nor Tokio complete the experiment.
      776Both runtimes also use classical work-stealing scheduling without preemption, and therefore none of the work queues are ever emptied, so no load balancing occurs.
     777
      778Looking now at the results for the AMD architecture, the story is effectively the same.
      779The first two columns show all runtimes obtaining results well within $10\times$ of each other.
      780The next two columns again show \CFA producing low latencies, while Go still has notably higher latency; the difference is less drastic at 2 \procs, where Go is $15\times$ slower, as opposed to $100\times$ slower at 256 \procs.
     781Neither Libfibre nor Tokio complete the experiment.
     782
     783This experiment clearly demonstrates that \CFA achieves significantly better fairness.
     784The semaphore variation serves as a control, where all runtimes are expected to transfer leadership fairly quickly.
     785Since \ats block after acknowledging the leader, this experiment effectively measures how quickly \procs can steal \ats from the \proc running the leader.
     786Table~\ref{fig:transfer:res} shows that while Go and Tokio are slower using the semaphore, all runtimes achieve decent latency.
     787
    695788However, the yielding variation shows an entirely different picture.
    696 Since libfibre and Tokio have a traditional work-stealing scheduler, \procs that have \ats on their local queues will never steal from other \procs.
    697 The result is that the experiment simply does not complete for these runtime.
    698 Without \procs stealing from the \proc running the leader, the experiment will simply never terminate.
     789Since libfibre and Tokio have a traditional work-stealing scheduler, \procs that have \ats on their local queues never steal from other \procs.
     790The result is that the experiment simply does not complete for these runtimes.
     791Without \procs stealing from the \proc running the leader, the experiment cannot terminate.
    699792Go manages to complete the experiment because it adds preemption on top of classic work-stealing.
    700 However, since preemption is fairly costly it achieves significantly worst performance.
      793However, since preemption is fairly infrequent, it achieves significantly worse performance.
    701794In contrast, \CFA achieves equivalent performance in both variations, demonstrating very good fairness.
      795Interestingly, \CFA achieves better delays in the yielding version than in the semaphore version; however, that is likely because fairness is equivalent in both while the yielding version removes the cost of the semaphores and idle sleep.
     795Interestingly \CFA achieves better delays in the yielding version than the semaphore version, however, that is likely due to fairness being equivalent but removing the cost of the semaphores and idle sleep.
  • doc/theses/thierry_delisle_PhD/thesis/text/existing.tex

    rebf8ca5 r23a08aa0  
    55
    66In general, \emph{selecting} a scheduling algorithm depends on how much information is available to the scheduler.
    7 Workloads that are well-known, consistent, and homogeneous can benefit from a scheduler that is optimized to use this information, while ill-defined, inconsistent, heterogeneous workloads require general non-optimal algorithms.
     7Workloads that are well known, consistent, and homogeneous can benefit from a scheduler that is optimized to use this information, while ill-defined, inconsistent, heterogeneous workloads require general non-optimal algorithms.
    88A secondary aspect is how much information can be gathered versus how much information must be given as part of the scheduler input.
    99This information adds to the spectrum of scheduling algorithms, going from static schedulers that are well informed from the start, to schedulers that gather most of the information needed, to schedulers that can only rely on very limited information.
    10 Note, this description includes both information about each requests, \eg time to complete or resources needed, and information about the relationships among request, \eg whether or not some request must be completed before another request starts.
    11 
    12 Scheduling physical resources, \eg in an assembly line, is generally amenable to using well-informed scheduling, since information can be gathered much faster than the physical resources can be assigned and workloads are likely to stay stable for long periods of time.
     10Note, this description includes both information about each request, \eg time to complete or resources needed, and information about the relationships among requests, \eg whether some requests must be completed before another request starts.
     11
     12Scheduling physical resources, \eg in an assembly line, is generally amenable to using well-informed scheduling since information can be gathered much faster than the physical resources can be assigned and workloads are likely to stay stable for long periods.
     1313When a faster pace is needed and changes are much more frequent, gathering information on workloads, up-front or live, can become much more limiting, and more general schedulers are needed.
    1414
    1515\section{Naming Convention}
    16 Scheduling has been studied by various communities concentrating on different incarnation of the same problems.
    17 As a result, there are no standard naming conventions for scheduling that is respected across these communities.
     16Scheduling has been studied by various communities concentrating on different incarnations of the same problems.
     17As a result, there are no standard naming conventions for scheduling that are respected across these communities.
    1818This document uses the term \newterm{\Gls{at}} to refer to the abstract objects being scheduled and the term \newterm{\Gls{proc}} to refer to the concrete objects executing these \ats.
    1919
    2020\section{Static Scheduling}
    21 \newterm{Static schedulers} require \ats dependencies and costs be explicitly and exhaustively specified prior to scheduling.
     21\newterm{Static schedulers} require \ats dependencies and costs to be explicitly and exhaustively specified prior to scheduling.
    2222The scheduler then processes this input ahead of time and produces a \newterm{schedule} the system follows during execution.
    2323This approach is popular in real-time systems since the need for strong guarantees justifies the cost of determining and supplying this information.
    24 In general, static schedulers are less relevant to this project because they require input from the programmers that the programming language does not have as part of its concurrency semantic.
     24In general, static schedulers are less relevant to this project because they require input from the programmers that the \CFA programming language does not have as part of its concurrency semantics.
    2525Specifying this information explicitly adds a significant burden to the programmer and reduces flexibility.
    2626For this reason, the \CFA scheduler does not require this information.
    2727
    2828\section{Dynamic Scheduling}
    29 \newterm{Dynamic schedulers} determine \ats dependencies and costs during scheduling, if at all.
    30 Hence, unlike static scheduling, \ats dependencies are conditional and detected at runtime.
    31 This detection takes the form of observing new \ats(s) in the system and determining dependencies from their behaviour, including suspending or halting a \ats that dynamically detects unfulfilled dependencies.
    32 Furthermore, each \ats has the responsibility of adding dependent \ats back into the system once dependencies are fulfilled.
     29\newterm{Dynamic schedulers} determine \at dependencies and costs during scheduling, if at all.
     30Hence, unlike static scheduling, \at dependencies are conditional and detected at runtime.
     31This detection takes the form of observing new \ats in the system and determining dependencies from their behaviour, including suspending or halting a \at that dynamically detects unfulfilled dependencies.
     32Furthermore, each \at has the responsibility of adding dependent \ats back into the system once dependencies are fulfilled.
    3333As a consequence, the scheduler often has an incomplete view of the system, seeing only \ats with no pending dependencies.
    3434
    3535\subsection{Explicitly Informed Dynamic Schedulers}
    36 While dynamic schedulers may not have an exhaustive list of dependencies for a \ats, some information may be available about each \ats, \eg expected duration, required resources, relative importance, \etc.
    37 When available, a scheduler can then use this information to direct the scheduling decisions. \cit{Examples of schedulers with more information}
    38 However, most programmers do not determine or even \emph{predict} this information;
    39 at best, the scheduler has only some imprecise information provided by the programmer, \eg, indicating a \ats takes approximately 3--7 seconds to complete, rather than exactly 5 seconds.
    40 Providing this kind of information is a significant programmer burden especially if the information does not scale with the number of \ats and their complexity.
    41 For example, providing an exhaustive list of files read by 5 \ats is an easier requirement then providing an exhaustive list of memory addresses accessed by 10,000 independent \ats.
     36While dynamic schedulers may not have an exhaustive list of dependencies for a \at, some information may be available about each \at, \eg expected duration, required resources, relative importance, \etc.
     37When available, a scheduler can then use this information to direct the scheduling decisions.
     38For example, when scheduling in a cloud computing context, \ats will commonly have extra information that was manually entered, \eg caps on compute time or \io usage.
     39However, in the context of user-level threading, most programmers do not determine or even \emph{predict} this information;
     40at best, the scheduler has only some imprecise information provided by the programmer, \eg, indicating a \at takes approximately 3--7 seconds to complete, rather than exactly 5 seconds.
     41Providing this kind of information is a significant programmer burden, especially if the information does not scale with the number of \ats and their complexity.
     42For example, providing an exhaustive list of files read by 5 \ats is an easier requirement than providing an exhaustive list of memory addresses accessed by 10,000 independent \ats.
    4243
    4344Since the goal of this thesis is to provide a scheduler as a replacement for \CFA's existing \emph{uninformed} scheduler, explicitly informed schedulers are less relevant to this project. Nevertheless, some strategies are worth mentioning.
     
    4546\subsubsection{Priority Scheduling}
    4647Common information used by schedulers to direct their algorithm is priorities.
    47 Each \ats is given a priority and higher-priority \ats are preferred to lower-priority ones.
    48 The simplest priority scheduling algorithm is to require that every \ats have a distinct pre-established priority and always run the available \ats with the highest priority.
     48Each \at is given a priority, and higher-priority \ats are preferred to lower-priority ones.
     49The simplest priority scheduling algorithm is to require that every \at have a distinct pre-established priority and always run the available \ats with the highest priority.
    4950Asking programmers to provide an exhaustive set of unique priorities can be prohibitive when the system has a large number of \ats.
    50 It can therefore be desirable for schedulers to support \ats with identical priorities and/or automatically setting and adjusting priorities for \ats.
     51It can therefore be desirable for schedulers to support \ats with identical priorities and/or automatically set and adjust priorities for \ats.
    5152Most common operating systems use some variant on priorities with overlaps and dynamic priority adjustments.
    5253For example, Microsoft Windows uses a pair of priorities~\cite{win:priority}, one specified by users out of ten possible options and one adjusted by the system.
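
As an illustration of this simplest algorithm, the following Go sketch uses a binary heap as a stand-in for whatever ready structure a real scheduler would use; all names are hypothetical.
\begin{cfa}
package main

import (
	"container/heap"
	"fmt"
)

// task is a schedulable item with a fixed priority; smaller numbers
// mean higher priority here, an arbitrary convention.
type task struct {
	name     string
	priority int
}

// taskHeap implements heap.Interface, ordering tasks by priority.
type taskHeap []task

func (h taskHeap) Len() int            { return len(h) }
func (h taskHeap) Less(i, j int) bool  { return h[i].priority < h[j].priority }
func (h taskHeap) Swap(i, j int)       { h[i], h[j] = h[j], h[i] }
func (h *taskHeap) Push(x interface{}) { *h = append(*h, x.(task)) }
func (h *taskHeap) Pop() interface{} {
	old := *h
	n := len(old)
	t := old[n-1]
	*h = old[:n-1]
	return t
}

func main() {
	ready := &taskHeap{{"background", 3}, {"interactive", 1}, {"batch", 2}}
	heap.Init(ready)
	for ready.Len() > 0 {
		t := heap.Pop(ready).(task)
		fmt.Println("run:", t.name) // always the highest-priority available task
	}
}
\end{cfa}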
    5354
    5455\subsection{Uninformed and Self-Informed Dynamic Schedulers}
    55 Several scheduling algorithms do not require programmers to provide additional information on each \ats, and instead make scheduling decisions based solely on internal state and/or information implicitly gathered by the scheduler.
      56Several scheduling algorithms do not require programmers to provide additional information on each \at, and instead make scheduling decisions based solely on internal state and/or information implicitly gathered by the scheduler.
    5657
    5758
    5859\subsubsection{Feedback Scheduling}
    59 As mentioned, schedulers may also gather information about each \ats to direct their decisions.
     60As mentioned, schedulers may also gather information about each \at to direct their decisions.
    6061This design effectively moves the scheduler into the realm of \newterm{Control Theory}~\cite{wiki:controltheory}.
    6162This information gathering does not generally involve programmers, and as such, does not increase programmer burden the same way explicitly provided information may.
    62 However, some feedback schedulers do allow programmers to offer additional information on certain \ats, in order to direct scheduling decisions.
    63 The important distinction being whether or not the scheduler can function without this additional information.
     63However, some feedback schedulers do allow programmers to offer additional information on certain \ats, to direct scheduling decisions.
     64The important distinction is whether the scheduler can function without this additional information.
    6465
    6566
    6667\section{Work Stealing}\label{existing:workstealing}
    67 One of the most popular scheduling algorithm in practice (see~\ref{existing:prod}) is work stealing.
    68 This idea, introduce by \cite{DBLP:conf/fpca/BurtonS81}, effectively has each worker process its local \ats first, but allows the possibility for other workers to steal local \ats if they run out of \ats.
    69 \cite{DBLP:conf/focs/Blumofe94} introduced the more familiar incarnation of this, where each workers has a queue of \ats and workers without \ats steal \ats from random workers\footnote{The Burton and Sleep algorithm had trees of \ats and steal only among neighbours.}.
    70 Blumofe and Leiserson also prove worst case space and time requirements for well-structured computations.
     68One of the most popular scheduling algorithms in practice (see~\ref{existing:prod}) is work stealing.
     69This idea, introduced by \cite{DBLP:conf/fpca/BurtonS81}, effectively has each worker process its local \ats first but allows the possibility for other workers to steal local \ats if they run out of \ats.
     70\cite{DBLP:conf/focs/Blumofe94} introduced the more familiar incarnation of this, where each worker has a queue of \ats and workers without \ats steal \ats from random workers\footnote{The Burton and Sleep algorithm has trees of \ats and steals only among neighbours.}.
     71Blumofe and Leiserson also prove worst-case space and time requirements for well-structured computations.
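
The following Go sketch shows the per-worker loop of this familiar incarnation; the mutex-protected queues are a simplifying assumption, as production schedulers use lock-free deques.
\begin{cfa}
package main

import (
	"math/rand"
	"sync"
)

// worker owns a local queue of tasks.
type worker struct {
	mu    sync.Mutex
	tasks []func()
}

func (w *worker) push(t func()) {
	w.mu.Lock()
	w.tasks = append(w.tasks, t)
	w.mu.Unlock()
}

// pop takes from the back of the local queue (the newest task).
func (w *worker) pop() func() {
	w.mu.Lock()
	defer w.mu.Unlock()
	if n := len(w.tasks); n > 0 {
		t := w.tasks[n-1]
		w.tasks = w.tasks[:n-1]
		return t
	}
	return nil
}

// steal takes from the front of a victim's queue (the oldest task).
func (w *worker) steal() func() {
	w.mu.Lock()
	defer w.mu.Unlock()
	if len(w.tasks) > 0 {
		t := w.tasks[0]
		w.tasks = w.tasks[1:]
		return t
	}
	return nil
}

// run processes local tasks first; when the local queue is empty it
// tries to steal, scanning victims starting from a random index.
func run(workers []*worker, id int, rng *rand.Rand) {
	self := workers[id]
	for {
		t := self.pop()
		if t == nil {
			start := rng.Intn(len(workers))
			for i := 0; i < len(workers) && t == nil; i++ {
				t = workers[(start+i)%len(workers)].steal()
			}
		}
		if t == nil {
			return // no work anywhere; a real runtime would idle-sleep
		}
		t()
	}
}

func main() {
	workers := []*worker{{}, {}}
	workers[0].push(func() {})
	run(workers, 1, rand.New(rand.NewSource(1))) // worker 1 starts empty and must steal
}
\end{cfa}
The split between the two ends of the queue is the classic design point of work-stealing deques: the owner takes the newest, hottest task while thieves take the oldest, keeping the two away from the same end of the queue.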
    7172
    7273Many variations of this algorithm have been proposed over the years~\cite{DBLP:journals/ijpp/YangH18}, both optimizations of existing implementations and approaches that account for new metrics.
     
    7677In general, fine granularity is better for load balancing and coarse granularity reduces communication overhead.
    7778The best performance generally means finding a middle ground between the two.
    78 Several methods can be employed, but I believe these are less relevant for threads, which are generally explicit and more coarse grained.
    79 
    80 \paragraph{Task Placement} Since modern computers rely heavily on cache hierarchies\cit{Do I need a citation for this}, migrating \ats from one core to another can be .  \cite{DBLP:journals/tpds/SquillanteL93}
    81 
    82 \todo{The survey is not great on this subject}
    83 
    84 \paragraph{Complex Machine Architecture} Another aspect that has been examined is how well work stealing is applicable to different machine architectures.
     79Several methods can be employed, but I believe these are less relevant for threads, which are generally explicit and more coarse-grained.
     80
     81\paragraph{Task Placement} Another aspect of work stealing that has been studied extensively is the mapping between \at and \proc.
     82In its simplest form, work stealing assumes that all \procs are interchangeable and therefore the mapping between \at and \proc is not interesting.
      83However, in real-life architectures there are contexts where different \procs can have different characteristics, which makes some mappings more interesting than others.
     84A common example where this is statically true is architectures with \glsxtrshort{numa}.
     85In these cases, it can be relevant to change the scheduler to be cognizant of the topology~\cite{vikranth2013topology,min2011hierarchical}.
      86Another example is energy usage, where the scheduler is modified to optimize for energy efficiency in addition to, or instead of, performance~\cite{ribic2014energy,torng2016asymmetry}.
     87
     88\paragraph{Complex Machine Architecture} Another aspect that has been examined is how applicable work stealing is to different machine architectures.
     89This is arguably strongly related to Task Placement but extends into more heterogeneous architectures.
      90As \CFA offers no particular support for heterogeneous architectures, this is also an area that is less relevant to this thesis,
      91although it could be an interesting avenue for future work.
    8592
    8693\subsection{Theoretical Results}
    87 There is also a large body of research on the theoretical aspects of work stealing. These evaluate, for example, the cost of migration~\cite{DBLP:conf/sigmetrics/SquillanteN91,DBLP:journals/pe/EagerLZ86}, how affinity affects performance~\cite{DBLP:journals/tpds/SquillanteL93,DBLP:journals/mst/AcarBB02,DBLP:journals/ipl/SuksompongLS16} and theoretical models for heterogeneous systems~\cite{DBLP:journals/jpdc/MirchandaneyTS90,DBLP:journals/mst/BenderR02,DBLP:conf/sigmetrics/GastG10}.
     94There is also a large body of research on the theoretical aspects of work stealing. These evaluate, for example, the cost of \glslink{atmig}{migration}~\cite{DBLP:conf/sigmetrics/SquillanteN91,DBLP:journals/pe/EagerLZ86}, how affinity affects performance~\cite{DBLP:journals/tpds/SquillanteL93,DBLP:journals/mst/AcarBB02,DBLP:journals/ipl/SuksompongLS16} and theoretical models for heterogeneous systems~\cite{DBLP:journals/jpdc/MirchandaneyTS90,DBLP:journals/mst/BenderR02,DBLP:conf/sigmetrics/GastG10}.
    8895\cite{DBLP:journals/jacm/BlellochGM99} examines the space bounds of work stealing and \cite{DBLP:journals/siamcomp/BerenbrinkFG03} shows that for under-loaded systems, the scheduler completes its computations in finite time, \ie is \newterm{stable}.
    89 Others show that work stealing is applicable to various scheduling contexts~\cite{DBLP:journals/mst/AroraBP01,DBLP:journals/anor/TchiboukdjianGT13,DBLP:conf/isaac/TchiboukdjianGTRB10,DBLP:conf/ppopp/AgrawalLS10,DBLP:conf/spaa/AgrawalFLSSU14}.
     96Others show that work stealing applies to various scheduling contexts~\cite{DBLP:journals/mst/AroraBP01,DBLP:journals/anor/TchiboukdjianGT13,DBLP:conf/isaac/TchiboukdjianGTRB10,DBLP:conf/ppopp/AgrawalLS10,DBLP:conf/spaa/AgrawalFLSSU14}.
    9097\cite{DBLP:conf/ipps/ColeR13} also studied how randomized work-stealing affects false sharing among \ats.
    9198
    92 However, as \cite{DBLP:journals/ijpp/YangH18} highlights, it is worth mentioning that this theoretical research has mainly focused on ``fully-strict'' computations, \ie workloads that can be fully represented with a direct acyclic graph.
    93 It is unclear how well these distributions represent workloads in real world scenarios.
      99However, as \cite{DBLP:journals/ijpp/YangH18} highlights, it is worth mentioning that this theoretical research has mainly focused on ``fully strict'' computations, \ie workloads that can be fully represented with a directed acyclic graph.
     100It is unclear how well these distributions represent workloads in real-world scenarios.
    94101
    95102\section{Preemption}
    96103One last aspect of scheduling is preemption since many schedulers rely on it for some of their guarantees.
    97104Preemption is the idea of interrupting \ats that have been running too long, effectively injecting suspend points into the application.
    98 There are multiple techniques to achieve this effect but they all aim to guarantee that the suspend points in a \ats are never further apart than some fixed duration.
    99 While this helps schedulers guarantee that no \ats unfairly monopolizes a worker, preemption can effectively be added to any scheduler.
    100 Therefore, the only interesting aspect of preemption for the design of scheduling is whether or not to require it.
     105There are multiple techniques to achieve this effect, but they all aim to guarantee that the suspend points in a \at are never further apart than some fixed duration.
     106While this helps schedulers guarantee that no \ats unfairly monopolize a worker, preemption can effectively be added to any scheduler.
     107Therefore, the only interesting aspect of preemption for the design of scheduling is whether to require it.
    101108
    102109\section{Production Schedulers}\label{existing:prod}
     
    104111While these schedulers do not necessarily represent the most recent advances in scheduling, they are what is generally accessible to programmers.
    105112As such, I believe these schedulers are at least as relevant as those presented in published work.
    106 Schedulers that operate in kernel space and in user space are considered, as both can offer relevant insight for this project.
      113Schedulers that operate in both kernel space and user space are considered, as both can offer relevant insight for this project.
    107114However, real-time schedulers are not considered, as these have constraints that are much stricter than what is needed for this project.
    108115
    109116\subsection{Operating System Schedulers}
    110 Operating System Schedulers tend to be fairly complex as they generally support some amount of real-time, aim to balance interactive and non-interactive \ats and support multiple users sharing hardware without requiring these users to cooperate.
     117Operating System Schedulers tend to be fairly complex as they generally support some amount of real time, aim to balance interactive and non-interactive \ats and support multiple users sharing hardware without requiring these users to cooperate.
    111118Here are more details on a few schedulers used in the common operating systems: Linux, FreeBSD, Microsoft Windows and Apple's OS X.
    112 The information is less complete for operating systems with closed source.
     119The information is less complete for closed source operating systems.
    113120
    114121\paragraph{Linux's CFS}
    115122The default scheduler used by Linux, the Completely Fair Scheduler~\cite{MAN:linux/cfs,MAN:linux/cfs2}, is a feedback scheduler based on CPU time.
    116123For each processor, it constructs a Red-Black tree of \ats waiting to run, ordering them by the amount of CPU time used.
    117 The \ats that has used the least CPU time is scheduled.
     124The \at that has used the least CPU time is scheduled.
    118125It also supports the concept of \newterm{Nice values}, which are effectively multiplicative factors on the CPU time used.
    119 The ordering of \ats is also affected by a group based notion of fairness, where \ats belonging to groups having used less CPU time are preferred to \ats belonging to groups having used more CPU time.
    120 Linux achieves load-balancing by regularly monitoring the system state~\cite{MAN:linux/cfs/balancing} and using some heuristic on the load, currently CPU time used in the last millisecond plus a decayed version of the previous time slots~\cite{MAN:linux/cfs/pelt}.
    121 
    122 \cite{DBLP:conf/eurosys/LoziLFGQF16} shows that Linux's CFS also does work stealing to balance the workload of each processors, but the paper argues this aspect can be improved significantly.
    123 The issues highlighted stem from Linux's need to support fairness across \ats \emph{and} across users\footnote{Enforcing fairness across users means that given two users, one with a single \ats and the other with one thousand \ats, the user with a single \ats does not receive one thousandth of the CPU time.}, increasing the complexity.
    124 
    125 Linux also offers a FIFO scheduler, a real-time scheduler, which runs the highest-priority \ats, and a round-robin scheduler, which is an extension of the FIFO-scheduler that adds fixed time slices. \cite{MAN:linux/sched}
     126The ordering of \ats is also affected by a group-based notion of fairness, where \ats belonging to groups having used less CPU time are preferred to \ats belonging to groups having used more CPU time.
     127Linux achieves load-balancing by regularly monitoring the system state~\cite{MAN:linux/cfs/balancing} and using some heuristic on the \gls{load}, currently CPU time used in the last millisecond plus a decayed version of the previous time slots~\cite{MAN:linux/cfs/pelt}.
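
As a sketch of the selection rule described above, the following Go code uses an ordered slice as a stand-in for the kernel's red-black tree; the weight values are illustrative, not Linux's actual nice-to-weight table.
\begin{cfa}
package main

import (
	"fmt"
	"sort"
)

// task tracks the CPU time a thread has used, weighted by a factor
// derived from its nice value.
type task struct {
	name     string
	cpuTime  float64 // CPU time used, in nanoseconds
	niceMult float64 // multiplicative factor from the nice value
}

func (t task) weighted() float64 { return t.cpuTime * t.niceMult }

// pick returns the task with the least weighted CPU time, mimicking
// CFS selecting the leftmost node of its red-black tree.
func pick(ready []task) task {
	sort.Slice(ready, func(i, j int) bool {
		return ready[i].weighted() < ready[j].weighted()
	})
	return ready[0]
}

func main() {
	ready := []task{
		{"editor", 5e6, 1.0}, // 5ms used at normal niceness
		{"batch", 2e6, 4.0},  // 2ms used, but niced down: counts 4x
	}
	fmt.Println("next:", pick(ready).name) // "editor": 5e6 < 8e6 weighted
}
\end{cfa}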
     128
     129\cite{DBLP:conf/eurosys/LoziLFGQF16} shows that Linux's CFS also does work stealing to balance the workload of each \proc, but the paper argues this aspect can be improved significantly.
     130The issues highlighted stem from Linux's need to support fairness across \ats \emph{and} across users\footnote{Enforcing fairness across users means that given two users, one with a single \at and the other with one thousand \ats, the user with a single \at does not receive one-thousandth of the CPU time.}, increasing the complexity.
     131
     132Linux also offers a FIFO scheduler, a real-time scheduler, which runs the highest-priority \ats, and a round-robin scheduler, which is an extension of the FIFO scheduler that adds fixed time slices. \cite{MAN:linux/sched}
    126133
    127134\paragraph{FreeBSD}
     128135The ULE scheduler used in FreeBSD~\cite{DBLP:conf/bsdcon/Roberson03} is a feedback scheduler similar to Linux's CFS.
    129136It uses different data structures and heuristics but also schedules according to some combination of CPU time used and niceness values.
    130 It also periodically balances the load of the system (according to a different heuristic), but uses a simpler work stealing approach.
     137It also periodically balances the load of the system (according to a different heuristic) but uses a simpler work stealing approach.
    131138
    132139\paragraph{Windows(OS)}
     133140Microsoft's operating-system scheduler~\cite{MAN:windows/scheduler} is a feedback scheduler with priorities.
    134141It supports 32 levels of priorities, some of which are reserved for real-time and privileged applications.
    135 It schedules \ats based on the highest priorities (lowest number) and how much CPU time each \ats has used.
     142It schedules \ats based on the highest priorities (lowest number) and how much CPU time each \at has used.
     136143The scheduler may also temporarily adjust priorities after certain events like the completion of I/O requests.
    137144
    138 In~\cite{russinovich2009windows}, Chapter 1 section ``Processes, Threads, and Jobs''\todo{Look up section number.} discusses the scheduling policy more in depth.
    139 Multicore scheduling is based on a combination of priorities and preferred \proc.
      145In~\cite{russinovich2009windows}, Chapter 1, Section 2.3, ``Processes, Threads, and Jobs'', discusses the scheduling policy in more depth.
     146Multicore scheduling is based on a combination of priorities and \proc preference.
    140147Each \at is assigned an initial processor using a round-robin policy, called the \at's \newterm{ideal} \proc.
    141148\Glspl{at} are distributed among the \procs according to their priority, preferring to match \ats to their ideal \proc and then to the last \proc they ran on.
    142 This approach is a variation of work stealing, where the stealing \proc restore the \at to its original \proc after running it, but mixed with priorities.
     149This approach is a variation of work stealing, where the stealing \proc restores the \at to its original \proc after running it, but mixed with priorities.
    143150
    144151\paragraph{Apple OS X}
     
    152159\end{displayquote}
    153160
    154 \todo{load balancing}
     161There is very little documentation on the internals of this scheduler.
     162However, the documentation does describe a feature set that is very similar to the Windows and Linux OS schedulers.
     163Presumably, this means that the internals are also fairly similar overall.
    155164
    156165\subsection{User-Level Schedulers}
    157 By comparison, user level schedulers tend to be simpler, gathering fewer metrics and avoid complex notions of fairness. Part of the simplicity is due to the fact that all \ats have the same user, and therefore cooperation is both feasible and probable.
      166By comparison, user-level schedulers tend to be simpler, gather fewer metrics and avoid complex notions of fairness. Part of the simplicity comes from the fact that all \ats have the same user, so cooperation is both feasible and probable.
    158167
    159168\paragraph{Go}\label{GoSafePoint}
    160169Go's scheduler uses a randomized work-stealing algorithm that has a global run-queue (\emph{GRQ}) and each processor (\emph{P}) has both a fixed-size run-queue (\emph{LRQ}) and a high-priority next ``chair'' holding a single element~\cite{GITHUB:go,YTUBE:go}.
    161 Preemption is present, but only at safe-points,~\cite{go:safepoints} which are inserted detection code at various frequent access boundaries.
      170Preemption is present, but only at safe points~\cite{go:safepoints}, which are detection code inserted at various frequent access boundaries.
    162171
     163172The algorithm is as follows:
     
    175184Erlang is a functional language that supports concurrency in the form of processes: threads that share no data.
    176185It uses a kind of round-robin scheduler, with a mix of work sharing and stealing to achieve load balancing~\cite{:erlang}, where under-loaded workers steal from other workers, but overloaded workers also push work to other workers.
    177 This migration logic is directed by monitoring logic that evaluates the load a few times per seconds.
     186This \glslink{atmig}{migration} logic is directed by monitoring logic that evaluates the load a few times per second.
    178187
    179188\paragraph{Intel\textregistered ~Threading Building Blocks}
     180189\newterm{Threading Building Blocks} (TBB) is Intel's task-parallelism~\cite{wiki:taskparallel} framework.
    181 It runs \newterm{jobs}, which are uninterruptable \ats that must always run to completion, on a pool of worker threads.
     190It runs \newterm{jobs}, which are uninterruptible \ats that must always run to completion, on a pool of worker threads.
    182191TBB's scheduler is a variation of randomized work-stealing that also supports higher-priority graph-like dependencies~\cite{MAN:tbb/scheduler}.
    183 It schedules \ats as follows (where \textit{t} is the last \ats completed):
     192It schedules \ats as follows (where \textit{t} is the last \at completed):
    184193\begin{displayquote}
    185194        \begin{enumerate}
    186195                \item The task returned by \textit{t}@.execute()@
     187196                \item The successor of \textit{t} if \textit{t} was its last completed predecessor.
    188                 \item A task popped from the end of the thread's own deque.
    189                 \item A task with affinity for the thread.
     197                \item A task popped from the end of the thread's own queue.
     198                \item A task with an affinity for the thread.
    190199                \item A task popped from approximately the beginning of the shared queue.
    191                 \item A task popped from the beginning of another randomly chosen thread's deque.
     200                \item A task popped from the beginning of another randomly chosen thread's queue.
    192201        \end{enumerate}
    193202
     
     208217While the documentation gives only limited insight into the scheduling and load-balancing approach, \cite{apple:gcd2} suggests a fairly classic design.
     209218Each \proc has a queue of \ats to run, called \newterm{blocks}, which is drained in \glsxtrshort{fifo} order.
    210 \todo{update: They seem to add the concept of dependent queues with clear ordering, where executing a block ends-up scheduling more blocks.
    211 In terms of semantics, these Dispatch Queues seem to be very similar to Intel\textregistered ~TBB \lstinline{execute()} and predecessor semantics.}
    212 
     219GCD also has secondary queues, called \newterm{Dispatch Queues}, with clear ordering, where executing a block ends up scheduling more blocks.
     220In terms of semantics, these Dispatch Queues seem to be very similar to Intel\textregistered ~TBB \lstinline{execute()} and predecessor semantics.
     221
      222The similarity of API and semantics between GCD and Intel\textregistered ~TBB suggests the underlying scheduling algorithms are similar.
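As a minimal sketch of these semantics (assuming a C compiler with Apple's blocks extension), a block submitted to a serial Dispatch Queue can itself schedule a follow-up block on the same queue:

\begin{lstlisting}
#include <dispatch/dispatch.h>
#include <stdio.h>

int main() {
	dispatch_queue_t q = dispatch_queue_create( "example.q", DISPATCH_QUEUE_SERIAL );
	dispatch_async( q, ^{
		puts( "step one" );                             // executing this block...
		dispatch_async( q, ^{ puts( "step two" ); } );  // ...schedules another block
	} );
	dispatch_main();  // hand the main thread over to GCD
}
\end{lstlisting}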
    213223
    214224\paragraph{LibFibre}
    215 LibFibre~\cite{DBLP:journals/pomacs/KarstenB20} is a light-weight user-level threading framework developed at the University of Waterloo.
    216 Similarly to Go, it uses a variation of work stealing with a global queue that is higher priority than stealing.
     225LibFibre~\cite{DBLP:journals/pomacs/KarstenB20} is a lightweight user-level threading framework developed at the University of Waterloo.
     226Similarly to Go, it uses a variation of work stealing with a global queue that has a higher priority than stealing.
    217227Unlike Go, it does not have the high-priority next ``chair'' and does not use randomized work-stealing.
  • doc/theses/thierry_delisle_PhD/thesis/text/front.tex

    rebf8ca5 r23a08aa0  
    3939                \vspace*{2.0cm}
    4040
    41                 Waterloo, Ontario, Canada, 2021 \\
    42 
    43                 \vspace*{1.0cm}
    44 
    45                 \copyright\ Thierry Delisle 2021 \\
     41                Waterloo, Ontario, Canada, 2022 \\
     42
     43                \vspace*{1.0cm}
     44
     45                \copyright\ Thierry Delisle 2022 \\
    4646        \end{center}
    4747\end{titlepage}
     
    6060\noindent
    6161        The following served on the Examining Committee for this thesis. The decision of the Examining Committee is by majority vote.
    62         \todo{External Examiners}
    63 \bigskip
    64 
    65 \noindent
    66 \begin{tabbing}
    67         Internal-External Member: \=  \kill % using longest text to define tab length
    68         External Examiner: \>  TBD \\
    69         \> TBD \\
     62\bigskip
     63
     64\noindent
     65\begin{tabbing}
     66        Internal-External Member: \=  \kill % using longest text to define tab length
     67        External Examiner: \>  Doug Lea \\
     68        \> Professor, Computer Science Department \\
     69        \> State University of New York at Oswego \\
    7070\end{tabbing}
    7171\bigskip
     
    9696\begin{tabbing}
    9797        Internal-External Member: \=  \kill % using longest text to define tab length
    98         Internal-External Member: \> TBD \\
    99         \> TBD \\
     98        Internal-External Member: \> Patrick Lam \\
     99        \> Associate Professor, Department of Electrical and Computer Engineering \\
    100100        \> University of Waterloo \\
    101101\end{tabbing}
     
    124124
     125125User-level threading (M:N) is gaining popularity over kernel-level threading (1:1) in many programming languages.
    126 The user threading approach is often a better mechanism to express complex concurrent applications by efficiently running 10,000+ threads on multi-core systems.
     126The user threading approach is often a better mechanism to express complex concurrent applications by efficiently running 10,000+ threads on multicore systems.
     127127Indeed, over-partitioning into small work units with user threading significantly eases load bal\-ancing, while simultaneously providing advanced synchronization and mutual exclusion capabilities.
    128128To manage these high levels of concurrency, the underlying runtime must efficiently schedule many user threads across a few kernel threads;
    129 which begs of the question of how many kernel threads are needed and should the number be dynamically reevaluated.
      129which begs the question of how many kernel threads are needed and whether the number should be dynamically reevaluated.
     130130Furthermore, scheduling must prevent kernel threads from blocking; otherwise, user-thread parallelism drops.
    131 When user-threading parallelism does drop, how and when should idle kernel-threads be put to sleep to avoid wasting CPU resources.
      131When user-threading parallelism does drop, how and when should idle \glspl{kthrd} be put to sleep to avoid wasting CPU resources?
    132132Finally, the scheduling system must provide fairness to prevent a user thread from monopolizing a kernel thread;
    133 otherwise other user threads can experience short/long term starvation or kernel threads can deadlock waiting for events to occur on busy kernel threads.
    134 
    135 This thesis analyses multiple scheduler systems, where each system attempts to fulfill the necessary requirements for user-level threading.
    136 The predominant technique for managing high levels of concurrency is sharding the ready-queue with one queue per kernel-thread and using some form of work stealing/sharing to dynamically rebalance workload shifts.
    137 Preventing kernel blocking is accomplish by transforming kernel locks and I/O operations into user-level operations that do not block the kernel thread or spin up new kernel threads to manage the blocking.
      133otherwise, other user threads can experience short/long-term starvation or kernel threads can deadlock waiting for events to occur on busy kernel threads.
     134
     135This thesis analyses multiple scheduler systems, where each system attempts to fulfill the requirements for user-level threading.
     136The predominant technique for managing high levels of concurrency is sharding the ready queue with one queue per \gls{kthrd} and using some form of work stealing/sharing to dynamically rebalance workload shifts.
     137Preventing kernel blocking is accomplished by transforming kernel locks and I/O operations into user-level operations that do not block the kernel thread or spin up new kernel threads to manage the blocking.
     138138Fairness is handled through preemption and/or ad hoc solutions, which leads to coarse-grained fairness with some pathological cases.
    139139
     
    146146The new scheduler also includes support for implicit nonblocking \io, allowing applications to have more user-threads blocking on \io operations than there are \glspl{kthrd}.
    147147The implementation is based on @io_uring@, a recent addition to the Linux kernel, and achieves the same performance and fairness as systems using @select@, @epoll@, \etc.
    148 To complete the scheduler, an idle sleep mechanism is implemented that significantly reduces wasted CPU cycles, which are then available outside of the application.
     148To complete the scheduler, an idle sleep mechanism is implemented that significantly reduces wasted CPU cycles, which are then available outside the application.
    149149
    150150\cleardoublepage
     
    179179\phantomsection    % allows hyperref to link to the correct page
    180180
     181% L I S T   O F   F I G U R E S
     182% -----------------------------
     183\addcontentsline{toc}{chapter}{List of Figures}
     184\listoffigures
     185\cleardoublepage
     186\phantomsection         % allows hyperref to link to the correct page
     187
    181188% L I S T   O F   T A B L E S
    182189% ---------------------------
     
    186193\phantomsection         % allows hyperref to link to the correct page
    187194
    188 % L I S T   O F   F I G U R E S
    189 % -----------------------------
    190 \addcontentsline{toc}{chapter}{List of Figures}
    191 \listoffigures
    192 \cleardoublepage
    193 \phantomsection         % allows hyperref to link to the correct page
    194 
    195195% GLOSSARIES (Lists of definitions, abbreviations, symbols, etc. provided by the glossaries-extra package)
    196196% -----------------------------
     
    199199\phantomsection         % allows hyperref to link to the correct page
    200200
    201 % TODOs and missing citations
    202 % -----------------------------
    203 \listofcits
    204 \listoftodos
    205 \cleardoublepage
    206 \phantomsection         % allows hyperref to link to the correct page
    207 
    208 
    209201% Change page numbering back to Arabic numerals
    210202\pagenumbering{arabic}
  • doc/theses/thierry_delisle_PhD/thesis/text/intro.tex

    rebf8ca5 r23a08aa0  
    22
    33\Gls{uthrding} (M:N) is gaining popularity over kernel-level threading (1:1) in many programming languages.
    4 The user threading approach is often a better mechanism to express complex concurrent applications by efficiently running 10,000+ threads on multi-core systems.
    5 Indeed, over-partitioning into small work-units with user threading significantly eases load bal\-ancing, while simultaneously providing advanced synchronization and mutual exclusion capabilities.
     4The user threading approach is often a better mechanism to express complex concurrent applications by efficiently running 10,000+ threads on multicore systems.
     5Indeed, over-partitioning into small work units with user threading significantly eases load bal\-ancing, while simultaneously providing advanced synchronization and mutual exclusion capabilities.
    66To manage these high levels of concurrency, the underlying runtime must efficiently schedule many user threads across a few kernel threads;
    7 which begs of the question of how many kernel threads are needed and should the number be dynamically reevaluated.
      7which begs the question of how many kernel threads are needed and whether the number should be dynamically reevaluated.
     88Furthermore, scheduling must prevent kernel threads from blocking; otherwise, user-thread parallelism drops.
     99When user-threading parallelism does drop, how and when should idle kernel threads be put to sleep to avoid wasting CPU resources?
    1010Finally, the scheduling system must provide fairness to prevent a user thread from monopolizing a kernel thread;
    11 otherwise other user threads can experience short/long term starvation or kernel threads can deadlock waiting for events to occur on busy kernel threads.
      11otherwise, other user threads can experience short/long-term starvation or kernel threads can deadlock waiting for events to occur on busy kernel threads.
    1212
    13 This thesis analyses multiple scheduler systems, where each system attempts to fulfill the necessary requirements for \gls{uthrding}.
    14 The predominant technique for managing high levels of concurrency is sharding the ready-queue with one queue per kernel-thread and using some form of work stealing/sharing to dynamically rebalance workload shifts.
    15 Preventing kernel blocking is accomplish by transforming kernel locks and I/O operations into user-level operations that do not block the kernel thread or spin up new kernel threads to manage the blocking.
    16 Fairness is handled through preemption and/or ad-hoc solutions, which leads to coarse-grained fairness with some pathological cases.
     13This thesis analyzes multiple scheduler systems, where each system attempts to fulfill the requirements for \gls{uthrding}.
     14The predominant technique for managing high levels of concurrency is sharding the ready queue with one queue per kernel thread and using some form of work stealing/sharing to dynamically rebalance workload shifts.
     15Preventing kernel blocking is accomplished by transforming kernel locks and I/O operations into user-level operations that do not block the kernel thread or spin up new kernel threads to manage the blocking.
     16Fairness is handled through preemption and/or ad hoc solutions, which leads to coarse-grained fairness with some pathological cases.
    1717
    18 After examining, testing and selecting specific approaches to these scheduling issues, a completely new scheduler was created and tested in the \CFA (C-for-all) user-threading runtime-system.
     18After examining, testing and selecting specific approaches to these scheduling issues, a completely new scheduler was created and tested in the \CFA (C-for-all) user-threading runtime system.
    1919The goal of the new scheduler is to offer increased safety and productivity without sacrificing performance.
    20 The quality of the new scheduler is demonstrated by comparing it with other user-threading work-stealing schedulers with the aim of showing equivalent or better performance while offering better fairness.
      20The quality of the new scheduler is demonstrated by comparing it with other user-threading work-stealing schedulers, with the aim of showing equivalent or better performance while offering better fairness.
    2121
    2222Chapter~\ref{intro} defines scheduling and its general goals.
    2323Chapter~\ref{existing} discusses how scheduler implementations attempt to achieve these goals, but all implementations optimize some workloads better than others.
    24 Chapter~\ref{cfaruntime} presents the relevant aspects of the \CFA runtime system that have a significant affect on the new scheduler design and implementation.
    25 Chapter~\ref{core} analyses different scheduler approaches, while looking for scheduler mechanisms that provide both performance and fairness.
     24Chapter~\ref{cfaruntime} presents the relevant aspects of the \CFA runtime system that have a significant effect on the new scheduler design and implementation.
     25Chapter~\ref{core} analyses different scheduler approaches while looking for scheduler mechanisms that provide both performance and fairness.
    2626Chapter~\ref{userio} covers the complex mechanisms that must be used to achieve nonblocking I/O to prevent the blocking of \glspl{kthrd}.
    2727Chapter~\ref{practice} presents the mechanisms needed to adjust the amount of parallelism, both manually and automatically.
     
    2929
    3030
    31 \section{Scheduling}
     31\section{Scheduling}\label{sched}
    3232Computer systems share multiple resources across many threads of execution, even on single-user computers like laptops or smartphones.
    33 On a computer system with multiple processors and work units (routines, coroutines, threads, programs, \etc), there exists the problem of mapping many different kinds of work units onto many different kinds of processors in an efficient manner, called \newterm{scheduling}.
     33On a computer system with multiple processors and work units (routines, coroutines, threads, programs, \etc), there exists the problem of mapping many different kinds of work units onto many different kinds of processors efficiently, called \newterm{scheduling}.
    3434Scheduling systems are normally \newterm{open}, meaning new work arrives from an external source or is randomly spawned from an existing work unit.
    3535In general, work units without threads, like routines and coroutines, are self-scheduling, while work units with threads, like tasks and programs, are scheduled.
     3636For scheduled work units, a scheduler takes a sequence of threads and attempts to run them to completion, subject to shared resource restrictions and utilization.
    37 A general-purpose dynamic-scheduler for an open system cannot anticipate work requests, so its performance is rarely optimal.
    38 Even with complete knowledge of arrive order and work, creating an optimal solution is a bin packing problem~\cite{wiki:binpak}.
     37In an open system, a general-purpose dynamic scheduler cannot anticipate work requests, so its performance is rarely optimal.
     38Even with complete knowledge of arrival order and work, creating an optimal solution is a bin packing problem~\cite{wiki:binpak}.
     3939However, optimal solutions are often not required: schedulers can produce excellent solutions, without needing optimality, by taking advantage of regularities in work patterns.
    4040
    41 Scheduling occurs at discreet points when there are transitions in a system.
    42 For example, a thread cycles through the following transitions during its execution.
     41Scheduling occurs at discrete points when there are transitions in a system.
     42For example, a \at cycles through the following transitions during its execution.
    4343\begin{center}
    4444\input{executionStates.pstex_t}
     
    4949entering the system (new $\rightarrow$ ready)
    5050\item
    51 scheduler assigns a thread to a computing resource, \eg CPU (ready $\rightarrow$ running)
     51scheduler assigns a \at to a computing resource, \eg CPU (ready $\rightarrow$ running)
    5252\item
    5353timer alarm for preemption (running $\rightarrow$ ready)
    5454\item
    55 long term delay versus spinning (running $\rightarrow$ blocked)
     55long-term delay versus spinning (running $\rightarrow$ blocked)
    5656\item
    5757completion of delay, \eg network or I/O completion (blocked $\rightarrow$ ready)
     
     5959normal completion or error, \eg segmentation fault (running $\rightarrow$ halted)
    6060\end{itemize}
    61 Key to scheduling is that a thread cannot bypass the ``ready'' state during a transition so the scheduler maintains complete control of the system, \ie no self-scheduling among threads.
     61Key to scheduling is that a \at cannot bypass the ``ready'' state during a transition so the scheduler maintains complete control of the system, \ie no self-scheduling among threads.
    6262
    6363When the workload exceeds the capacity of the processors, \ie work cannot be executed immediately, it is placed on a queue for subsequent service, called a \newterm{ready queue}.
     
    7171\end{tabular}
    7272\end{center}
    73 Beyond these two schedulers are a host of options, \eg adding an global shared queue to MQMS or adding multiple private queues with distinc characteristics.
     73Beyond these two schedulers are a host of options, \eg adding a global shared queue to MQMS or adding multiple private queues with distinct characteristics.
    7474
    7575Once there are multiple resources and ready queues, a scheduler is faced with three major optimization criteria:
     
    8484
    8585\noindent
    86 Essentially, all multi-processor computers have non-uniform memory access (NUMA), with one or more quantized steps to access data at different levels in the memory hierarchy.
     86Essentially, all multiprocessor computers have non-uniform memory access (NUMA), with one or more quantized steps to access data at different levels in the memory hierarchy.
    8787When a system has a large number of independently executing threads, affinity becomes difficult because of \newterm{thread churn}.
    88 That is, threads must be scheduled on different processors to obtain high processors utilization because the number of threads $\ggg$ processors.
     88That is, threads must be scheduled on different processors to obtain high processor utilization because the number of threads $\ggg$ processors.
    8989
    9090\item
    9191\newterm{contention}: safe access of shared objects by multiple processors requires mutual exclusion in some form, generally locking.\footnote{
     9292Lock-free data structures do not involve locking but incur similar costs to achieve mutual exclusion.}
    93 Mutual exclusion cost and latency increases significantly with the number of processors access\-ing a shared object.
     93Mutual exclusion cost and latency increase significantly with the number of processors access\-ing a shared object.
    9494\end{enumerate}
    9595
     
    116116
     117117Since \CFA attempts to improve the safety and productivity of C, the new scheduler presented in this thesis pursues the same goals.
    118 More specifically, safety and productivity for scheduling means supporting a wide range of workloads so that programmers can rely on progress guarantees (safety) and more easily achieve acceptable performance (productivity).
     118More specifically, safety and productivity for scheduling mean supporting a wide range of workloads so that programmers can rely on progress guarantees (safety) and more easily achieve acceptable performance (productivity).
    119119The new scheduler also includes support for implicit nonblocking \io, allowing applications to have more user-threads blocking on \io operations than there are \glspl{kthrd}.
    120 To complete the scheduler, an idle sleep mechanism is implemented that significantly reduces wasted CPU cycles, which are then available outside of the application.
     120To complete the scheduler, an idle sleep mechanism is implemented that significantly reduces wasted CPU cycles, which are then available outside the application.
    121121
    122 As a research project, this work builds exclusively on newer versions of the Linux operating-system and gcc/clang compilers.
     122As a research project, this work builds exclusively on newer versions of the Linux operating system and gcc/clang compilers.
    123123The new scheduler implementation uses several optimizations to successfully balance the cost of fairness against performance;
     124124some of these optimizations rely on interesting hardware features only present on modern CPUs.
    125 The \io implementation is based on the @io_uring@ kernel-interface, a recent addition to the Linux kernel, because it purports to handle nonblocking \emph{file} and network \io.
     125The \io implementation is based on the @io_uring@ kernel interface, a recent addition to the Linux kernel, because it purports to handle nonblocking \emph{file} and network \io.
    126126This decision allowed an interesting performance and fairness comparison with other threading systems using @select@, @epoll@, \etc.
    127127While the current \CFA release supports older versions of Linux ($\ge$~Ubuntu 16.04) and gcc/clang compilers ($\ge$~gcc 6.0), it is not the purpose of this project to find workarounds in these older systems to provide backwards compatibility.
     
    129129
    130130\section{Contributions}\label{s:Contributions}
    131 This work provides the following scheduling contributions for advanced \gls{uthrding} runtime-systems:
     131This work provides the following scheduling contributions for advanced \gls{uthrding} runtime systems:
    132132\begin{enumerate}[leftmargin=*]
    133133\item
     
     140140A mechanism for adding fairness on top of the MQMS algorithm through helping, used both for the scalable scheduling algorithm and the user-level \glsxtrshort{io}.
    141141\item
    142 An optimization of the helping-mechanism for load balancing to reduce scheduling costs.
     142An optimization of the helping mechanism for load balancing to reduce scheduling costs.
    143143\item
     144144An optimization of the alternative relaxed-list for load balancing to reduce scheduling costs in embarrassingly parallel cases.
  • doc/theses/thierry_delisle_PhD/thesis/text/io.tex

    rebf8ca5 r23a08aa0  
    11\chapter{User Level \io}\label{userio}
    2 As mentioned in Section~\ref{prev:io}, user-level \io requires multiplexing the \io operations of many \glspl{thrd} onto fewer \glspl{proc} using asynchronous \io operations.
    3 Different operating systems offer various forms of asynchronous operations and, as mentioned in Chapter~\ref{intro}, this work is exclusively focused on the Linux operating-system.
     2As mentioned in Section~\ref{prev:io}, user-level \io requires multiplexing the \io operations of many \ats onto fewer \glspl{proc} using asynchronous \io operations.
     3Different operating systems offer various forms of asynchronous operations and, as mentioned in Chapter~\ref{intro}, this work is exclusively focused on the Linux operating system.
    44
    55\section{Kernel Interface}
     
    1313In this context, ready means \emph{some} operation can be performed without blocking.
    1414It does not mean an operation returning \lstinline{EAGAIN} succeeds on the next try.
    15 For example, a ready read may only return a subset of requested bytes and the read must be issues again for the remaining bytes, at which point it may return \lstinline{EAGAIN}.}
    16 This mechanism is also crucial in determining when all \glspl{thrd} are blocked and the application \glspl{kthrd} can now block.
     15For example, a ready read may only return a subset of requested bytes and the read must be issued again for the remaining bytes, at which point it may return \lstinline{EAGAIN}.}
     16This mechanism is also crucial in determining when all \ats are blocked and the application \glspl{kthrd} can now block.
    1717
    1818There are three options to monitor file descriptors in Linux:\footnote{
    1919For simplicity, this section omits \lstinline{pselect} and \lstinline{ppoll}.
    20 The difference between these system calls and \lstinline{select} and \lstinline{poll}, respectively, is not relevant for this discussion.},
     20The difference between these system calls and \lstinline{select} and \lstinline{poll}, respectively, is not relevant for this discussion.}
    2121@select@~\cite{MAN:select}, @poll@~\cite{MAN:poll} and @epoll@~\cite{MAN:epoll}.
    2222All three of these options offer a system call that blocks a \gls{kthrd} until at least one of many file descriptors becomes ready.
     
     3030Hence, if one \gls{kthrd} is managing the select calls, other threads can only add to or remove from the manager's interest set through synchronized update calls.
    3131However, these changes are only reflected when the manager makes its next call to @select@.
    32 Note, it is possible for the manager thread to never unblock if its current interest set never changes, \eg the sockets/pipes/ttys it is waiting on never get data again.
     32Note, it is possible for the manager thread to never unblock if its current interest set never changes, \eg the sockets/pipes/TTYs it is waiting on never get data again.
    3333Often the I/O manager has a timeout, polls, or is sent a signal on changes to mitigate this problem.
    34 
    35 \begin{comment}
    36 From: Tim Brecht <brecht@uwaterloo.ca>
    37 Subject: Re: FD sets
    38 Date: Wed, 6 Jul 2022 00:29:41 +0000
    39 
    40 Large number of open files
    41 --------------------------
    42 
    43 In order to be able to use more than the default number of open file
    44 descriptors you may need to:
    45 
    46 o increase the limit on the total number of open files /proc/sys/fs/file-max
    47   (on Linux systems)
    48 
    49 o increase the size of FD_SETSIZE
    50   - the way I often do this is to figure out which include file __FD_SETSIZE
    51     is defined in, copy that file into an appropriate directory in ./include,
    52     and then modify it so that if you use -DBIGGER_FD_SETSIZE the larger size
    53     gets used
    54 
    55   For example on a RH 9.0 distribution I've copied
    56   /usr/include/bits/typesizes.h into ./include/i386-linux/bits/typesizes.h
    57 
    58   Then I modify typesizes.h to look something like:
    59 
    60   #ifdef BIGGER_FD_SETSIZE
    61   #define __FD_SETSIZE            32767
    62   #else
    63   #define __FD_SETSIZE            1024
    64   #endif
    65 
    66   Note that the since I'm moving and testing the userver on may different
    67   machines the Makefiles are set up to use -I ./include/$(HOSTTYPE)
    68 
    69   This way if you redefine the FD_SETSIZE it will get used instead of the
    70   default original file.
    71 \end{comment}
    7234
    7335\paragraph{\lstinline{poll}} is the next oldest option, and takes as input an array of structures containing the FD numbers rather than their position in an array of bits, allowing a more compact input for interest sets that contain widely spaced FDs.
    7436For small interest sets with densely packed FDs, the @select@ bit mask can take less storage, and hence, copy less information into the kernel.
    75 Furthermore, @poll@ is non-destructive, so the array of structures does not have to be re-initialize on every call.
    76 Like @select@, @poll@ suffers from the limitation that the interest set cannot be changed by other \gls{kthrd}, while a manager thread is blocked in @poll@.
     37Furthermore, @poll@ is non-destructive, so the array of structures does not have to be re-initialized on every call.
     38Like @select@, @poll@ suffers from the limitation that the interest set cannot be changed by other \glspl{kthrd}, while a manager thread is blocked in @poll@.
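As a brief illustration of this family of interfaces, the following sketch (assuming two open descriptors @fd1@ and @fd2@ and a hypothetical @handle@ routine) blocks in @poll@ until at least one descriptor is ready for reading:

\begin{lstlisting}
#include <poll.h>

struct pollfd fds[2] = {
	{ .fd = fd1, .events = POLLIN },      // interest set: readability of fd1...
	{ .fd = fd2, .events = POLLIN },      // ...and of fd2
};
int r = poll( fds, 2, -1 );               // block until at least one FD is ready
for ( int i = 0; r > 0 && i < 2; i += 1 )
	if ( fds[i].revents & POLLIN )        // non-destructive: fds survives the call
		handle( fds[i].fd );              // hypothetical handler
\end{lstlisting}

Note the interest set @fds@ is fixed for the duration of the call, which is precisely the limitation described above.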
    7739
     7840\paragraph{\lstinline{epoll}} follows @poll@, and places the interest set in the kernel rather than the application, where it is managed by an internal \gls{kthrd}.
     
    8446However, all three of these I/O systems have limitations.
    8547The @man@ page for @O_NONBLOCK@ mentions that ``[@O_NONBLOCK@] has no effect for regular files and block devices'', which means none of these three system calls are viable multiplexing strategies for these types of \io operations.
    86 Furthermore, @epoll@ has been shown to have problems with pipes and ttys~\cit{Peter's examples in some fashion}.
     48Furthermore, TTYs can also be tricky to use since they can take different forms based on how the command is executed.
     49For example, @epoll@ rejects FDs pointing to regular files or block devices, which includes @stdin@ when using shell redirections~\cite[\S~3.6]{MAN:bash}, but does not reject shell pipelines~\cite[\S~3.2.3]{MAN:bash}, which includes pipelines into @stdin@.
    8750Finally, none of these are useful solutions for multiplexing \io operations that do not have a corresponding file descriptor and can be awkward for operations using multiple file descriptors.
    8851
     
    9053An alternative to @O_NONBLOCK@ is the AIO interface.
    9154Its interface lets programmers enqueue operations to be performed asynchronously by the kernel.
    92 Completions of these operations can be communicated in various ways: either by spawning a new \gls{kthrd}, sending a Linux signal, or by polling for completion of one or more operation.
    93 For this work, spawning a new \gls{kthrd} is counter-productive but a related solution is discussed in Section~\ref{io:morethreads}.
    94 Using interrupts handlers can also lead to fairly complicated interactions between subsystems and has non-trivial cost.
     55Completions of these operations can be communicated in various ways: either by spawning a new \gls{kthrd}, sending a Linux signal, or polling for completion of one or more operations.
     56For this work, spawning a new \gls{kthrd} is counterproductive but a related solution is discussed in Section~\ref{io:morethreads}.
     57Using interrupt handlers can also lead to fairly complicated interactions between subsystems and has a non-trivial cost.
     9558This leaves polling for completion, which is similar to the previous system calls.
     9659AIO only supports read and write operations to file descriptors; however, it does not have the same limitation as @O_NONBLOCK@, \ie, the file descriptors can be regular files and block devices.
    9760It also supports batching multiple operations in a single system call.
    9861
    99 AIO offers two different approaches to polling: @aio_error@ can be used as a spinning form of polling, returning @EINPROGRESS@ until the operation is completed, and @aio_suspend@ can be used similarly to @select@, @poll@ or @epoll@, to wait until one or more requests have completed.
    100 For the purpose of \io multiplexing, @aio_suspend@ is the best interface.
     62AIO offers two different approaches to polling: @aio_error@ can be used as a spinning form of polling, returning @EINPROGRESS@ until the operation is completed, and @aio_suspend@ can be used similarly to @select@, @poll@ or @epoll@, to wait until one or more requests have been completed.
     63For \io multiplexing, @aio_suspend@ is the best interface.
    10164However, even if AIO requests can be submitted concurrently, @aio_suspend@ suffers from the same limitation as @select@ and @poll@, \ie, the interest set cannot be dynamically changed while a call to @aio_suspend@ is in progress.
    102 AIO also suffers from the limitation of specifying which requests have completed, \ie programmers have to poll each request in the interest set using @aio_error@ to identify the completed requests.
     65AIO also suffers from the limitation of specifying which requests have been completed, \ie programmers have to poll each request in the interest set using @aio_error@ to identify the completed requests.
    10366This limitation means that, like @select@ and @poll@ but not @epoll@, the time needed to examine polling results increases based on the total number of requests monitored, not the number of completed requests.
     10467Finally, AIO does not seem to be popular, which I believe is due in part to this poor polling interface.
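As a minimal sketch of this polling interface (assuming an open descriptor @fd@ and a buffer @buf@), a single asynchronous read is submitted, waited on with @aio_suspend@, and then identified with @aio_error@:

\begin{lstlisting}
#include <aio.h>
#include <errno.h>

struct aiocb cb = { .aio_fildes = fd, .aio_buf = buf, .aio_nbytes = sizeof(buf) };
aio_read( &cb );                          // enqueue the asynchronous read

const struct aiocb * list[1] = { &cb };
aio_suspend( list, 1, NULL );             // block until some request completes

if ( aio_error( &cb ) != EINPROGRESS ) {  // poll each request to find the one done
	ssize_t n = aio_return( &cb );        // fetch the result of the completed read
}
\end{lstlisting}

With many outstanding requests, it is this per-request @aio_error@ check that makes the cost of identifying completions grow with the size of the interest set rather than with the number of completions.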
     
    12487in
    12588``some kind of arbitrary \textit{queue up asynchronous system call} model''.
    126 This description is actually quite close to the interface described in the next section.
     89This description is quite close to the interface described in the next section.
    12790
    12891\subsection{\lstinline{io_uring}}
     
    13598In addition to supporting reads and writes to any file descriptor like AIO, it supports other operations like @open@, @close@, @fsync@, @accept@, @connect@, @send@, @recv@, @splice@, \etc.
    13699
    137 On top of these, @io_uring@ adds many extras like avoiding copies between the kernel and user-space using shared memory, allowing different mechanisms to communicate with device drivers, and supporting chains of requests, \ie, requests that automatically trigger followup requests on completion.
     100On top of these, @io_uring@ adds many extras like avoiding copies between the kernel and user space using shared memory, allowing different mechanisms to communicate with device drivers, and supporting chains of requests, \ie, requests that automatically trigger follow-up requests on completion.
    138101
    139102\subsection{Extra Kernel Threads}\label{io:morethreads}
    140 Finally, if the operating system does not offer a satisfactory form of asynchronous \io operations, an ad-hoc solution is to create a pool of \glspl{kthrd} and delegate operations to it to avoid blocking \glspl{proc}, which is a compromise for multiplexing.
    141 In the worst case, where all \glspl{thrd} are consistently blocking on \io, it devolves into 1-to-1 threading.
    142 However, regardless of the frequency of \io operations, it achieves the fundamental goal of not blocking \glspl{proc} when \glspl{thrd} are ready to run.
     103Finally, if the operating system does not offer a satisfactory form of asynchronous \io operations, an ad hoc solution is to create a pool of \glspl{kthrd} and delegate operations to it to avoid blocking \glspl{proc}, which is a compromise for multiplexing.
     104In the worst case, where all \ats are consistently blocking on \io, it devolves into 1-to-1 threading.
     105However, regardless of the frequency of \io operations, it achieves the fundamental goal of not blocking \glspl{proc} when \ats are ready to run.
    143106This approach is used by languages like Go~\cite{GITHUB:go}, frameworks like libuv~\cite{libuv}, and web servers like Apache~\cite{apache} and NGINX~\cite{nginx}, since it has the advantage that it can easily be used across multiple operating systems.
    144107This advantage is especially relevant for languages like Go, which offer a homogeneous \glsxtrshort{api} across all platforms.
    145 As opposed to C, which has a very limited standard api for \io, \eg, the C standard library has no networking.
      108This is in contrast to C, which has a very limited standard \glsxtrshort{api} for \io; \eg, the C standard library has no networking.
    146109
    147110\subsection{Discussion}
     
    155118\section{Event-Engine}
    156119An event engine's responsibility is to use the kernel interface to multiplex many \io operations onto few \glspl{kthrd}.
    157 In concrete terms, this means \glspl{thrd} enter the engine through an interface, the event engine then starts an operation and parks the calling \glspl{thrd}, returning control to the \gls{proc}.
    158 The parked \glspl{thrd} are then rescheduled by the event engine once the desired operation has completed.
      120In concrete terms, this means \ats enter the engine through an interface; the event engine then starts an operation and parks the calling \ats, returning control to the \gls{proc}.
     121The parked \ats are then rescheduled by the event engine once the desired operation has been completed.
    159122
    160123\subsection{\lstinline{io_uring} in depth}\label{iouring}
     
    171134        \centering
    172135        \input{io_uring.pstex_t}
    173         \caption[Overview of \lstinline{io_uring}]{Overview of \lstinline{io_uring} \smallskip\newline Two ring buffer are used to communicate with the kernel, one for completions~(right) and one for submissions~(left). The submission ring indexes into a pre-allocated array (denoted \emph{S}) instead.}
      136        \caption[Overview of \lstinline{io_uring}]{Overview of \lstinline{io_uring} \smallskip\newline Two ring buffers are used to communicate with the kernel, one for completions~(right) and one for submissions~(left). The submission ring indexes into a pre-allocated array of SQEs (denoted \emph{S}) instead of holding the entries directly.}
    174137        \label{fig:iouring}
    175138\end{figure}
     
    184147\item
    185148The SQE is filled according to the desired operation.
    186 This step is straight forward.
    187 The only detail worth mentioning is that SQEs have a @user_data@ field that must be filled in order to match submission and completion entries.
     149This step is straightforward.
     150The only detail worth mentioning is that SQEs have a @user_data@ field that must be filled to match submission and completion entries.
    188151\item
    189152The SQE is submitted to the submission ring by appending the index of the SQE to the ring following regular ring buffer steps: \lstinline{buffer[head] = item; head++}.
     
    207170
    208171The @io_uring_enter@ system call is protected by a lock inside the kernel.
    209 This protection means that concurrent call to @io_uring_enter@ using the same instance are possible, but there is no performance gained from parallel calls to @io_uring_enter@.
     172This protection means that concurrent calls to @io_uring_enter@ using the same instance are possible, but there is no performance gained from parallel calls to @io_uring_enter@.
    210173It is possible to do the first three submission steps in parallel;
    211174however, doing so requires careful synchronization.
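These steps are commonly wrapped by the @liburing@ helper library; the following sketch (assuming @liburing@ is available, with an open descriptor @fd@, a buffer @buf@ and an application-defined @request@ pointer) shows one submission and the matching completion:

\begin{lstlisting}
#include <liburing.h>

struct io_uring ring;
io_uring_queue_init( 64, &ring, 0 );                   // create the SQ/CQ ring pair

struct io_uring_sqe * sqe = io_uring_get_sqe( &ring ); // allocate an SQE
io_uring_prep_read( sqe, fd, buf, sizeof(buf), 0 );    // fill it for a read
io_uring_sqe_set_data( sqe, request );                 // user_data, to match the CQE
io_uring_submit( &ring );                              // append index + io_uring_enter

struct io_uring_cqe * cqe;
io_uring_wait_cqe( &ring, &cqe );                      // reap the completion
void * req = io_uring_cqe_get_data( cqe );             // same pointer as user_data
io_uring_cqe_seen( &ring, cqe );                       // mark the CQE consumed
\end{lstlisting}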
     
    216179This restriction means \io request bursts may have to be subdivided and submitted in chunks at a later time.
    217180
    218 An important detail to keep in mind is that just like ``The cloud is just someone else's computer''\cite{xkcd:cloud}, asynchronous operations are just operation using someone else's threads.
    219 Indeed, asynchronous operation can require computation time to complete, which means that if this time is not taken from the thread that triggered the asynchronous operation, it must be taken from some other threads.
      181An important detail to keep in mind is that just like ``The cloud is just someone else's computer''~\cite{xkcd:cloud}, asynchronous operations are just operations using someone else's threads.
      182Indeed, asynchronous operations can require computation time to complete, which means that if this time is not taken from the thread that triggered the asynchronous operation, it must be taken from some other thread.
    220183In this case, the @io_uring@ operations that cannot be handled directly in the system call must be delegated to some other \gls{kthrd}.
    221184To this end, @io_uring@ maintains multiple \glspl{kthrd} inside the kernel that are not exposed to the user.
    222 There are three kind of operations that can need the \glspl{kthrd}:
      185Three kinds of operations can need the \glspl{kthrd}:
    223186
    224187\paragraph{Operations using} @IOSQE_ASYNC@.
     
    228191This is also a fairly simple case. As mentioned earlier in this chapter, [@O_NONBLOCK@] has no effect for regular files and block devices.
    229192@io_uring@ must also take this reality into account by delegating operations on regular files and block devices.
    230 In fact @io_uring@ maintains a pool of \glspl{kthrd} dedicated to these operations, which are referred to as \newterm{bounded workers}.
     193In fact, @io_uring@ maintains a pool of \glspl{kthrd} dedicated to these operations, which are referred to as \newterm{bounded workers}.
    231194
    232195\paragraph{Unbounded operations that must be retried.}
     
    235198@io_uring@ maintains a separate pool for these operations.
    236199The \glspl{kthrd} in this pool are referred to as \newterm{unbounded workers}.
    237 Unbounded workers are also responsible of handling operations using @IOSQE_ASYNC@.
     200Unbounded workers are also responsible for handling operations using @IOSQE_ASYNC@.
    238201
    239202@io_uring@ implicitly spawns and joins both the bounded and unbounded workers based on its evaluation of the needs of the workload.
    240203This effectively encapsulates the work that is needed when using @epoll@.
    241 Indeed, @io_uring@ does not change Linux's underlying handling of \io opeartions, it simply offers an asynchronous \glsxtrshort{api} on top of the existing system.
     204Indeed, @io_uring@ does not change Linux's underlying handling of \io operations, it simply offers an asynchronous \glsxtrshort{api} on top of the existing system.
    242205
    243206
    244207\subsection{Multiplexing \io: Submission}
    245208
    246 The submission side is the most complicated aspect of @io_uring@ and the completion side effectively follows from the design decisions made in the submission side.
     209The submission side is the most complicated aspect of @io_uring@ and the completion side effectively follows from the design decisions made on the submission side.
    247210While there is freedom in designing the submission side, there are some realities of @io_uring@ that must be taken into account.
    248211It is possible to do the first steps of submission in parallel;
     
    255218As described in Chapter~\ref{practice}, this does not translate into constant CPU usage.}.
    256219Note that once an operation completes, there is nothing that ties it to the @io_uring@ instance that handled it.
    257 There is nothing preventing a new operation with, \eg the same file descriptors to a different @io_uring@ instance.
      220Nothing prevents a new operation, with, for example, the same file descriptor, from using a different @io_uring@ instance.
    258221
     259222A complicating aspect of submission is @io_uring@'s support for chains of operations, where the completion of an operation triggers the submission of the next operation in the chain.
     
     263226This feature can be supported simply by allowing arbitrary user code between allocation and submission.
    264227
    265 Similar to scheduling, sharding @io_uring@ instances can be done privately, \ie, one instance per \glspl{proc}, in decoupled pools, \ie, a pool of \glspl{proc} use a pool of @io_uring@ instances without one-to-one coupling between any given instance and any given \gls{proc}, or some mix of the two.
     228Similar to scheduling, sharding @io_uring@ instances can be done privately, \ie, one instance per \proc, in decoupled pools, \ie, a pool of \procs using a pool of @io_uring@ instances without one-to-one coupling between any given instance and any given \gls{proc}, or some mix of the two.
    266229These three sharding approaches are analyzed.
    267230
    268231\subsubsection{Private Instances}
    269232The private approach creates one ring instance per \gls{proc}, \ie one-to-one coupling.
    270 This alleviates the need for synchronization on the submissions, requiring only that \glspl{thrd} are not time-sliced during submission steps.
    271 This requirement is the same as accessing @thread_local@ variables, where a \gls{thrd} is accessing kernel-thread data, is time-sliced, and continues execution on another kernel thread but is now accessing the wrong data.
    272 This failure is the serially reusable problem~\cite{SeriallyReusable}.
    273 Hence, allocated SQEs must be submitted to the same ring on the same \gls{proc}, which effectively forces the application to submit SQEs in allocation order.\footnote{
    274 To remove this requirement, a \gls{thrd} needs the ability to ``yield to a specific \gls{proc}'', \ie, park with the guarantee it unparks on a specific \gls{proc}, \ie the \gls{proc} attached to the correct ring.}
     233This alleviates the need for synchronization on the submissions, requiring only that \ats are not time-sliced during submission steps.
     234This requirement is the same as accessing @thread_local@ variables, where a \at is accessing kernel-thread data, is time-sliced, and continues execution on another kernel thread but is now accessing the wrong data.
     235This failure is the \newterm{serially reusable problem}~\cite{SeriallyReusable}.
     236Hence, allocated SQEs must be submitted to the same ring on the same \gls{proc}, which effectively forces the application to submit SQEs in order of allocation.\footnote{
     237To remove this requirement, a \at needs the ability to ``yield to a specific \gls{proc}'', \ie, \park with the guarantee it unparks on a specific \gls{proc}, \ie the \gls{proc} attached to the correct ring.}
    275238From the subsystem's point of view, the allocation and submission are sequential, greatly simplifying both.
    276239In this design, allocation and submission form a partitioned ring buffer as shown in Figure~\ref{fig:pring}.
    277 Once added to the ring buffer, the attached \gls{proc} has a significant amount of flexibility with regards to when to perform the system call.
    278 Possible options are: when the \gls{proc} runs out of \glspl{thrd} to run, after running a given number of \glspl{thrd}, \etc.
     240Once added to the ring buffer, the attached \gls{proc} has a significant amount of flexibility with regard to when to perform the system call.
     241Possible options are: when the \gls{proc} runs out of \ats to run, after running a given number of \ats, \etc.
    279242
    280243\begin{figure}
    281244        \centering
    282245        \input{pivot_ring.pstex_t}
    283         \caption[Partitioned ring buffer]{Partitioned ring buffer \smallskip\newline Allocated sqes are appending to the first partition.
      246        \caption[Partitioned ring buffer]{Partitioned ring buffer \smallskip\newline Allocated SQEs are appended to the first partition.
    284247        When submitting, the partition is advanced.
    285248        The kernel considers the partition as the head of the ring.}
     
    288251
    289252This approach has the advantage that it does not require much of the synchronization needed in a shared approach.
    290 However, this benefit means \glspl{thrd} submitting \io operations have less flexibility: they cannot park or yield, and several exceptional cases are handled poorly.
    291 Instances running out of SQEs cannot run \glspl{thrd} wanting to do \io operations.
    292 In this case, the \io \gls{thrd} needs to be moved to a different \gls{proc}, and the only current way of achieving this is to @yield()@ hoping to be scheduled on a different \gls{proc} with free SQEs, which is not guaranteed.
     253However, this benefit means \ats submitting \io operations have less flexibility: they cannot \park or yield, and several exceptional cases are handled poorly.
     254Instances running out of SQEs cannot run \ats wanting to do \io operations.
     255In this case, the \io \at needs to be moved to a different \gls{proc}, and the only current way of achieving this is to @yield()@ hoping to be scheduled on a different \gls{proc} with free SQEs, which is not guaranteed.
    293256
    294257A more involved version of this approach tries to solve these problems using a pattern called \newterm{helping}.
    295 \Glspl{thrd} that cannot submit \io operations, either because of an allocation failure or migration to a different \gls{proc} between allocation and submission, create an \io object and add it to a list of pending submissions per \gls{proc} and a list of pending allocations, probably per cluster.
    296 While there is still the strong coupling between \glspl{proc} and @io_uring@ instances, these data structures allow moving \glspl{thrd} to a specific \gls{proc}, when the current \gls{proc} cannot fulfill the \io request.
    297 
    298 Imagine a simple scenario with two \glspl{thrd} on two \glspl{proc}, where one \gls{thrd} submits an \io operation and then sets a flag, while the other \gls{thrd} spins until the flag is set.
    299 Assume both \glspl{thrd} are running on the same \gls{proc}, and the \io \gls{thrd} is preempted between allocation and submission, moved to the second \gls{proc}, and the original \gls{proc} starts running the spinning \gls{thrd}.
    300 In this case, the helping solution has the \io \gls{thrd} append an \io object to the submission list of the first \gls{proc}, where the allocation was made.
    301 No other \gls{proc} can help the \gls{thrd} since @io_uring@ instances are strongly coupled to \glspl{proc}.
    302 However, the \io \gls{proc} is unable to help because it is executing the spinning \gls{thrd} resulting in a deadlock.
    303 While this example is artificial, in the presence of many \glspl{thrd}, it is possible for this problem to arise ``in the wild''.
     258\ats that cannot submit \io operations, either because of an allocation failure or \glslink{atmig}{migration} to a different \gls{proc} between allocation and submission, create an \io object and add it to a list of pending submissions per \gls{proc} and a list of pending allocations, probably per cluster.
      259While there is still a strong coupling between \glspl{proc} and @io_uring@ instances, these data structures allow moving \ats to a specific \gls{proc} when the current \gls{proc} cannot fulfill the \io request.
     260
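As a rough sketch, the helping data structures described above could take the following shape; every type and field name here is illustrative, not the actual runtime code.
\begin{cfa}
// Hypothetical layout of the helping lists; names are illustrative only.
struct proc_io {
	io_instance inst;                    // io_uring instance coupled to this processor
	queue(io_request) pending_submits;   // allocated here, but could not be submitted here
};
struct cluster_io {
	queue(io_request) pending_allocs;    // waiting for any instance with free SQEs
};
\end{cfa}
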
     261Imagine a simple scenario with two \ats on two \glspl{proc}, where one \at submits an \io operation and then sets a flag, while the other \at spins until the flag is set.
     262Assume both \ats are running on the same \gls{proc}, and the \io \at is preempted between allocation and submission, moved to the second \gls{proc}, and the original \gls{proc} starts running the spinning \at.
     263In this case, the helping solution has the \io \at append an \io object to the submission list of the first \gls{proc}, where the allocation was made.
     264No other \gls{proc} can help the \at since @io_uring@ instances are strongly coupled to \glspl{proc}.
     265However, the \io \gls{proc} is unable to help because it is executing the spinning \at resulting in a deadlock.
     266While this example is artificial, in the presence of many \ats, this problem can arise ``in the wild''.
    304267Furthermore, this pattern is difficult to reliably detect and avoid.
    305 Once in this situation, the only escape is to interrupted the spinning \gls{thrd}, either directly or via some regular preemption, \eg time slicing.
    306 Having to interrupt \glspl{thrd} for this purpose is costly, the latency can be large between interrupts, and the situation may be hard to detect.
     268Once in this situation, the only escape is to interrupt the spinning \at, either directly or via some regular preemption, \eg time slicing.
     269Having to interrupt \ats for this purpose is costly, the latency can be large between interrupts, and the situation may be hard to detect.
    307270Interrupts are needed here entirely because the \gls{proc} is tied to an instance it is not using.
    308 Therefore, a more satisfying solution is for the \gls{thrd} submitting the operation to notice that the instance is unused and simply go ahead and use it.
     271Therefore, a more satisfying solution is for the \at submitting the operation to notice that the instance is unused and simply go ahead and use it.
    309272This approach is presented shortly.
    310273
    311274\subsubsection{Public Instances}
    312275The public approach creates decoupled pools of @io_uring@ instances and processors, \ie without one-to-one coupling.
    313 \Glspl{thrd} attempting an \io operation pick one of the available instances and submit the operation to that instance.
    314 Since there is no coupling between @io_uring@ instances and \glspl{proc} in this approach, \glspl{thrd} running on more than one \gls{proc} can attempt to submit to the same instance concurrently.
     276\ats attempting an \io operation pick one of the available instances and submit the operation to that instance.
     277Since there is no coupling between @io_uring@ instances and \glspl{proc} in this approach, \ats running on more than one \gls{proc} can attempt to submit to the same instance concurrently.
    315278Because @io_uring@ effectively sets the amount of sharding needed to avoid contention on its internal locks, performance in this approach is based on two aspects:
    316279\begin{itemize}
     
    319282\item
    320283The scheme to route \io requests to specific @io_uring@ instances does not introduce contention.
    321 This aspect has an oversized importance because it comes into play before the sharding of instances, and as such, all \glspl{hthrd} can contend on the routing algorithm.
     284This aspect has oversized importance because it comes into play before the sharding of instances, and as such, all \glspl{hthrd} can contend on the routing algorithm.
    322285\end{itemize}
    323286
    324287Allocation in this scheme is fairly easy.
    325 Free SQEs, \ie, SQEs that are not currently being used to represent a request, can be written to safely and have a field called @user_data@ that the kernel only reads to copy to @cqe@s.
    326 Allocation also requires no ordering guarantee as all free SQEs are interchangeable.
     288Free SQEs, \ie, SQEs that are not currently being used to represent a request, can be written-to safely and have a field called @user_data@ that the kernel only reads to copy to CQEs.
     289Allocation also does not require ordering guarantees as all free SQEs are interchangeable.
    327290The only added complexity is that the number of SQEs is fixed, which means allocation can fail.
    328291
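As a minimal sketch, allocation can be a simple consume from a ring of free indexes; this assumes a single allocating thread per instance, and all names are illustrative.
\begin{cfa}
// Sketch: allocate one free SQE index; fails when the fixed pool is exhausted.
io_uring_sqe * try_alloc( instance & inst ) {
	__u32 head = inst.free.head;
	if ( head == __atomic_load_n( &inst.free.tail, __ATOMIC_ACQUIRE ) )
		return 0p;                       // no free SQEs: allocation fails
	__u32 idx = inst.free.ring[ head & inst.mask ];  // free SQEs are interchangeable
	__atomic_store_n( &inst.free.head, head + 1, __ATOMIC_RELEASE );
	return &inst.sqes[ idx ];
}
\end{cfa}
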
    329 Allocation failures need to be pushed to a routing algorithm: \glspl{thrd} attempting \io operations must not be directed to @io_uring@ instances without sufficient SQEs available.
    330 Furthermore, the routing algorithm should block operations up-front, if none of the instances have available SQEs.
    331 
    332 Once an SQE is allocated, \glspl{thrd} insert the \io request information, and keep track of the SQE index and the instance it belongs to.
    333 
    334 Once an SQE is filled in, it is added to the submission ring buffer, an operation that is not thread-safe, and then the kernel must be notified using the @io_uring_enter@ system call.
    335 The submission ring buffer is the same size as the pre-allocated SQE buffer, therefore pushing to the ring buffer cannot fail because it would mean a \lstinline{sqe} multiple times in the ring buffer, which is undefined behaviour.
    336 However, as mentioned, the system call itself can fail with the expectation that it can be retried once some submitted operations complete.
     292Allocation failures need to be pushed to a routing algorithm: \ats attempting \io operations must not be directed to @io_uring@ instances without sufficient SQEs available.
     293Furthermore, the routing algorithm should block operations upfront if none of the instances have available SQEs.
     294
     295Once an SQE is allocated, \ats insert the \io request information and keep track of the SQE index and the instance it belongs to.
     296
     297Once an SQE is filled in, it is added to the submission ring buffer, an operation that is not thread safe, and then the kernel must be notified using the @io_uring_enter@ system call.
      298The submission ring buffer is the same size as the pre-allocated SQE buffer, therefore pushing to the ring buffer cannot fail because failure would mean the same SQE appears multiple times in the ring buffer, which is undefined behaviour.
     299However, as mentioned, the system call itself can fail with the expectation that it can be retried once some submitted operations are complete.
    337300
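The push-and-notify step can be sketched as follows, using the ring fields from @linux/io_uring.h@ through illustrative shorthands; this code assumes a single submitter and reduces error handling to the @EBUSY@ case discussed above.
\begin{cfa}
// Sketch: append one SQE index to the SQ ring, then notify the kernel.
void submit_one( instance & inst, __u32 sqe_idx ) {
	__u32 tail = *inst.sq.tail;                     // only the submitter writes the tail
	inst.sq.array[ tail & *inst.sq.mask ] = sqe_idx;
	__atomic_store_n( inst.sq.tail, tail + 1, __ATOMIC_RELEASE );  // publish to the kernel
	int ret = syscall( __NR_io_uring_enter, inst.fd, 1, 0, 0, 0p, 0 );
	if ( ret < 0 && errno == EBUSY ) { /* retry once completions drain */ }
}
\end{cfa}
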
    338301Since multiple SQEs can be submitted to the kernel at once, it is important to strike a balance between batching and latency.
    339 Operations that are ready to be submitted should be batched together in few system calls, but at the same time, operations should not be left pending for long period of times before being submitted.
    340 Balancing submission can be handled by either designating one of the submitting \glspl{thrd} as the being responsible for the system call for the current batch of SQEs or by having some other party regularly submitting all ready SQEs, \eg, the poller \gls{thrd} mentioned later in this section.
    341 
    342 Ideally, when multiple \glspl{thrd} attempt to submit operations to the same @io_uring@ instance, all requests should be batched together and one of the \glspl{thrd} is designated to do the system call on behalf of the others, called the \newterm{submitter}.
     302Operations that are ready to be submitted should be batched together in few system calls, but at the same time, operations should not be left pending for long periods before being submitted.
     303Balancing submission can be handled by either designating one of the submitting \ats as the \at responsible for the system call for the current batch of SQEs or by having some other party regularly submit all ready SQEs, \eg, the poller \at mentioned later in this section.
     304
     305Ideally, when multiple \ats attempt to submit operations to the same @io_uring@ instance, all requests should be batched together and one of the \ats is designated to do the system call on behalf of the others, called the \newterm{submitter}.
     343306However, in practice, \io requests must be handled promptly, so there is a need to guarantee everything missed by the current submitter is seen by the next one.
    344 Indeed, as long as there is a ``next'' submitter, \glspl{thrd} submitting new \io requests can move on, knowing that some future system call includes their request.
    345 Once the system call is done, the submitter must also free SQEs so that the allocator can reused them.
    346 
    347 Finally, the completion side is much simpler since the @io_uring@ system-call enforces a natural synchronization point.
    348 Polling simply needs to regularly do the system call, go through the produced CQEs and communicate the result back to the originating \glspl{thrd}.
    349 Since CQEs only own a signed 32 bit result, in addition to the copy of the @user_data@ field, all that is needed to communicate the result is a simple future~\cite{wiki:future}.
     307Indeed, as long as there is a ``next'' submitter, \ats submitting new \io requests can move on, knowing that some future system call includes their request.
     308Once the system call is done, the submitter must also free SQEs so that the allocator can reuse them.
     309
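One way to realize this ``next submitter'' guarantee is a counter-plus-flag scheme, sketched below with illustrative names (@ready@, @lock@, and the helper @flush_and_enter@ are hypothetical); the actual runtime mechanism may differ.
\begin{cfa}
// Sketch: every submitter counts its work first; whoever holds the flag drains
// the count and rechecks after releasing, so no increment is ever missed.
void submitted( instance & inst ) {
	__atomic_fetch_add( &inst.ready, 1, __ATOMIC_SEQ_CST );
	while ( __atomic_load_n( &inst.ready, __ATOMIC_SEQ_CST ) != 0 ) {
		if ( __atomic_test_and_set( &inst.lock, __ATOMIC_ACQUIRE ) )
			return;                          // the current submitter rechecks for us
		__u32 n = __atomic_exchange_n( &inst.ready, 0, __ATOMIC_SEQ_CST );
		if ( n > 0 ) flush_and_enter( inst, n );  // batched system call
		__atomic_clear( &inst.lock, __ATOMIC_RELEASE );
	}
}
\end{cfa}
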
     310Finally, the completion side is much simpler since the @io_uring@ system call enforces a natural synchronization point.
     311Polling simply needs to regularly do the system call, go through the produced CQEs and communicate the result back to the originating \ats.
     312Since CQEs only own a signed 32-bit result, in addition to the copy of the @user_data@ field, all that is needed to communicate the result is a simple future~\cite{wiki:future}.
    350313If the submission side does not designate submitters, polling can also submit all SQEs as it is polling events.
    351 A simple approach to polling is to allocate a \gls{thrd} per @io_uring@ instance and simply let the poller \glspl{thrd} poll their respective instances when scheduled.
     314A simple approach to polling is to allocate a \at per @io_uring@ instance and simply let the poller \ats poll their respective instances when scheduled.
    352315
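One polling pass can then be sketched as follows; @future_t@ and @fulfil@ stand in for whatever future abstraction is used, while the completion-ring fields follow @linux/io_uring.h@.
\begin{cfa}
// Sketch: drain the completion ring and fulfil the matching futures.
void poll_one( instance & inst ) {
	__u32 head = *inst.cq.head;
	__u32 tail = __atomic_load_n( inst.cq.tail, __ATOMIC_ACQUIRE );
	for ( ; head != tail; head++ ) {
		io_uring_cqe & cqe = inst.cq.cqes[ head & *inst.cq.mask ];
		future_t * f = (future_t *)(uintptr_t)cqe.user_data;  // copied verbatim from the SQE
		fulfil( *f, cqe.res );                                // the signed 32-bit result
	}
	__atomic_store_n( inst.cq.head, head, __ATOMIC_RELEASE );  // return entries to the kernel
}
\end{cfa}
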
     353316With the pool of shared instances approach, the big advantage is that it is fairly flexible.
    354 It does not impose restrictions on what \glspl{thrd} submitting \io operations can and cannot do between allocations and submissions.
     317It does not impose restrictions on what \ats submitting \io operations can and cannot do between allocations and submissions.
     355318It can also gracefully handle running out of resources, \ie SQEs, or the kernel returning @EBUSY@.
    356 The down side to this approach is that many of the steps used for submitting need complex synchronization to work properly.
    357 The routing and allocation algorithm needs to keep track of which ring instances have available SQEs, block incoming requests if no instance is available, prevent barging if \glspl{thrd} are already queued up waiting for SQEs and handle SQEs being freed.
     319The downside to this approach is that many of the steps used for submitting need complex synchronization to work properly.
     320The routing and allocation algorithm needs to keep track of which ring instances have available SQEs, block incoming requests if no instance is available, prevent barging if \ats are already queued up waiting for SQEs and handle SQEs being freed.
    358321The submission side needs to safely append SQEs to the ring buffer, correctly handle chains, make sure no SQE is dropped or left pending forever, notify the allocation side when SQEs can be reused, and handle the kernel returning @EBUSY@.
    359 All this synchronization has a significant cost, and compared to the private-instance approach, this synchronization is entirely overhead.
      322All this synchronization has a significant cost and, compared to the private-instance approach, it is entirely overhead.
    360323
    361324\subsubsection{Instance borrowing}
    362325Both of the prior approaches have undesirable aspects that stem from tight or loose coupling between @io_uring@ and \glspl{proc}.
    363326The first approach suffers from tight coupling causing problems when a \gls{proc} does not benefit from the coupling.
    364 The second approach suffers from loose coupling causing operations to have synchronization overhead, which tighter coupling avoids.
      327The second approach suffers from loose coupling causing operations to have synchronization overhead, which tighter coupling avoids.
    365328When \glspl{proc} are continuously issuing \io operations, tight coupling is valuable since it avoids synchronization costs.
    366329However, in unlikely failure cases or when \glspl{proc} are not using their instances, tight coupling is no longer advantageous.
     
    370333
    371334In this approach, each cluster, see Figure~\ref{fig:system}, owns a pool of @io_uring@ instances managed by an \newterm{arbiter}.
    372 When a \gls{thrd} attempts to issue an \io operation, it ask for an instance from the arbiter and issues requests to that instance.
    373 This instance is now bound to the \gls{proc} the \gls{thrd} is running on.
    374 This binding is kept until the arbiter decides to revoke it, taking back the instance and reverting the \gls{proc} to its initial state with respect to \io.
      335When a \at attempts to issue an \io operation, it asks for an instance from the arbiter and issues requests to that instance.
     336This instance is now bound to the \gls{proc} the \at is running on.
     337This binding is kept until the arbiter decides to revoke it, taking back the instance and reverting the \gls{proc} to its initial \io state.
    375338This tight coupling means that synchronization can be minimal since only one \gls{proc} can use the instance at a time, akin to the private instances approach.
    376339However, it differs in that revocation by the arbiter means this approach does not suffer from the deadlock scenario described above.
     
    380343        \item The current \gls{proc} does not hold an instance.
    381344        \item The current instance does not have sufficient SQEs to satisfy the request.
    382         \item The current \gls{proc} has a wrong instance, this happens if the submitting \gls{thrd} context-switched between allocation and submission, called \newterm{external submissions}.
      345        \item The current \gls{proc} has a wrong instance; this happens if the submitting \at context-switched between allocation and submission, a case called \newterm{external submissions}.
    383346\end{enumerate}
    384347However, even when the arbiter is not directly needed, \glspl{proc} need to make sure that their instance ownership is not being revoked, which is accomplished by a lock-\emph{less} handshake.\footnote{
    385 Note the handshake is not lock \emph{free} since it lacks the proper progress guarantee.}
     348Note the handshake is not lock-\emph{free} since it lacks the proper progress guarantee.}
    386349A \gls{proc} raises a local flag before using its borrowed instance and checks if the instance is marked as revoked or if the arbiter has raised its flag.
    387350If not, it proceeds, otherwise it delegates the operation to the arbiter.
     
    389352
    390353Correspondingly, before revoking an instance, the arbiter marks the instance and then waits for the \gls{proc} using it to lower its local flag.
    391 Only then does it reclaim the instance and potentially assign it to an other \gls{proc}.
     354Only then does it reclaim the instance and potentially assign it to another \gls{proc}.
    392355
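This handshake can be sketched as follows, with illustrative @in_use@ and @revoked@ flags and a @Pause()@ spin hint; the real code is more involved.
\begin{cfa}
// Processor side: pin the borrowed instance or fall back to the arbiter.
bool try_pin( processor & p ) {
	__atomic_store_n( &p.io.in_use, true, __ATOMIC_SEQ_CST );
	if ( p.io.inst == 0p || __atomic_load_n( &p.io.inst->revoked, __ATOMIC_SEQ_CST ) ) {
		__atomic_store_n( &p.io.in_use, false, __ATOMIC_RELEASE );
		return false;                    // delegate the operation to the arbiter
	}
	return true;                         // instance pinned until in_use is lowered
}
// Arbiter side: mark the instance revoked, then wait out the current user.
void revoke( io_instance & inst, processor & owner ) {
	__atomic_store_n( &inst.revoked, true, __ATOMIC_SEQ_CST );
	while ( __atomic_load_n( &owner.io.in_use, __ATOMIC_ACQUIRE ) ) Pause();  // lock-less, not lock-free
}
\end{cfa}
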
    393356The arbiter maintains four lists around which it makes its decisions:
     
    406369
    407370\paragraph{Pending Allocations} are handled by the arbiter when it has available instances and can directly hand over the instance and satisfy the request.
    408 Otherwise, it must hold onto the list of threads until SQEs are made available again.
     371Otherwise, it must hold on to the list of threads until SQEs are made available again.
     409372This handling is more complex when an allocation requires multiple SQEs, since the arbiter must decide between satisfying requests in FIFO order or favouring requests that need fewer SQEs.
    410373
    411374While an arbiter has the potential to solve many of the problems mentioned above, it also introduces a significant amount of complexity.
    412 Tracking which processors are borrowing which instances and which instances have SQEs available ends-up adding a significant synchronization prelude to any I/O operation.
     375Tracking which processors are borrowing which instances and which instances have SQEs available ends up adding a significant synchronization prelude to any I/O operation.
    413376Any submission must start with a handshake that pins the currently borrowed instance, if available.
     414377An attempt to allocate is then made, but the arbiter can concurrently be attempting to allocate from the same instance on a different \gls{hthrd}.
     415378Once the allocation is completed, the submission must check that the instance is still borrowed before attempting to flush.
    416379These synchronization steps turn out to have a similar cost to the multiple shared-instances approach.
    417 Furthermore, if the number of instances does not match the number of processors actively submitting I/O, the system can fall into a state where instances are constantly being revoked and end-up cycling the processors, which leads to significant cache deterioration.
     380Furthermore, if the number of instances does not match the number of processors actively submitting I/O, the system can fall into a state where instances are constantly being revoked and end up cycling the processors, which leads to significant cache deterioration.
    418381For these reasons, this approach, which sounds promising on paper, does not improve on the private instance approach in practice.
    419 
    420 \subsubsection{Private Instances V2}
    421 
    422 % Verbs of this design
    423 
    424 % Allocation: obtaining an sqe from which to fill in the io request, enforces the io instance to use since it must be the one which provided the sqe. Must interact with the arbiter if the instance does not have enough sqe for the allocation. (Typical allocation will ask for only one sqe, but chained sqe must be allocated from the same context so chains of sqe must be allocated in bulks)
    425 
    426 % Submission: simply adds the sqe(s) to some data structure to communicate that they are ready to go. This operation can't fail because there are as many spots in the submit buffer than there are sqes. Must interact with the arbiter only if the thread was moved between the allocation and the submission.
    427 
    428 % Flushing: Taking all the sqes that were submitted and making them visible to the kernel, also counting them in order to figure out what to_submit should be. Must be thread-safe with submission. Has to interact with the Arbiter if there are external submissions. Can't simply use a protected queue because adding to the array is not safe if the ring is still available for submitters. Flushing must therefore: check if there are external pending requests if so, ask the arbiter to flush otherwise use the fast flush operation.
    429 
    430 % Collect: Once the system call is done, it returns how many sqes were consumed by the system. These must be freed for allocation. Must interact with the arbiter to notify that things are now ready.
    431 
    432 % Handle: process all the produced cqe. No need to interact with any of the submission operations or the arbiter.
    433 
    434 
    435 % alloc():
    436 %       proc.io->in_use = true, __ATOMIC_ACQUIRE
    437 %       if cltr.io.flag || !proc.io || proc.io->flag:
    438 %               return alloc_slow(cltr.io, proc.io)
    439 
    440 %       a = alloc_fast(proc.io)
    441 %       if a:
    442 %               proc.io->in_use = false, __ATOMIC_RELEASE
    443 %               return a
    444 
    445 %       return alloc_slow(cltr.io)
    446 
    447 % alloc_fast()
    448 %       left = proc.io->submit_q.free.tail - proc.io->submit_q.free.head
    449 %       if num_entries - left < want:
    450 %               return None
    451 
    452 %       a = ready[head]
    453 %       head = head + 1, __ATOMIC_RELEASE
    454 
    455 % alloc_slow()
    456 %       cltr.io.flag = true, __ATOMIC_ACQUIRE
    457 %       while(proc.io && proc.io->in_use) pause;
    458 
    459 
    460 
    461 % submit(a):
    462 %       proc.io->in_use = true, __ATOMIC_ACQUIRE
    463 %       if cltr.io.flag || proc.io != alloc.io || proc.io->flag:
    464 %               return submit_slow(cltr.io)
    465 
    466 %       submit_fast(proc.io, a)
    467 %       proc.io->in_use = false, __ATOMIC_RELEASE
    468 
    469 % polling()
    470 %       loop:
    471 %               yield
    472 %               flush()
    473 %               io_uring_enter
    474 %               collect
    475 %               handle()
    476382
    477383\section{Interface}
    478384The last important part of the \io subsystem is its interface.
    479 There are multiple approaches that can be offered to programmers, each with advantages and disadvantages.
    480 The new \io subsystem can replace the C runtime API or extend it, and in the later case, the interface can go from very similar to vastly different.
     385Multiple approaches can be offered to programmers, each with advantages and disadvantages.
     386The new \io subsystem can replace the C runtime API or extend it, and in the latter case, the interface can go from very similar to vastly different.
    481387The following sections discuss some useful options using @read@ as an example.
    482 The standard Linux interface for C is :
     388The standard Linux interface for C is:
    483389\begin{cfa}
    484390ssize_t read(int fd, void *buf, size_t count);
     
     492398However, this approach also entails a plethora of subtle technical challenges, which generally boil down to making a perfect replacement.
    493399If the \CFA interface replaces only \emph{some} of the calls to glibc, then this can easily lead to esoteric concurrency bugs.
    494 Since the gcc ecosystems does not offer a scheme for perfect replacement, this approach was rejected as being laudable but infeasible.
     400Since the gcc ecosystem does not offer a scheme for perfect replacement, this approach was rejected as being laudable but infeasible.
    495401
    496402\subsection{Synchronous Extension}
     
     503409It comes with the caveat that any code attempting to use it must be recompiled, which is a problem considering the number of existing legacy C binaries.
    504410However, it has the advantage of implementation simplicity.
    505 Finally, there is a certain irony to using a blocking synchronous interfaces for a feature often referred to as ``non-blocking'' \io.
     411Finally, there is a certain irony to using a blocking synchronous interface for a feature often referred to as ``non-blocking'' \io.
    506412
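For illustration, such an extension could simply mirror the C signature under a distinguishable name; @cfa_read@ below is purely hypothetical and only shows the shape the interface could take.
\begin{cfa}
// Same semantics as read(), but blocks only the calling user-level thread
// while the operation proceeds through the io_uring subsystem underneath.
ssize_t cfa_read( int fd, void * buf, size_t count );
\end{cfa}
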
    507413\subsection{Asynchronous Extension}
     
    531437This offers more flexibility to users wanting to fully utilize all of the @io_uring@ features.
    532438However, it is not the most user-friendly option.
    533 It obviously imposes a strong dependency between user code and @io_uring@ but at the same time restricting users to usages that are compatible with how \CFA internally uses @io_uring@.
     439It obviously imposes a strong dependency between user code and @io_uring@ but at the same time restricts users to usages that are compatible with how \CFA internally uses @io_uring@.
  • doc/theses/thierry_delisle_PhD/thesis/text/practice.tex

    rebf8ca5 r23a08aa0  
    1616} // delete 4 kernel threads
    1717\end{cfa}
    18 Dynamically allocated processors can be deleted an any time, \ie their lifetime exceeds the block of creation.
     18Dynamically allocated processors can be deleted at any time, \ie their lifetime exceeds the block of creation.
    1919The consequence is that the scheduler and \io subsystems must know when these \procs come in and out of existence and roll them into the appropriate scheduling algorithms.
    2020
    2121\section{Manual Resizing}
    2222Manual resizing is expected to be a rare operation.
    23 Programmers normally create/delete processors on a clusters at startup/teardown.
     23Programmers normally create/delete processors on a cluster at startup/teardown.
    2424Therefore, dynamically changing the number of \procs is an appropriate moment to allocate or free resources to match the new state.
    2525As such, all internal scheduling arrays that are sized based on the number of \procs need to be @realloc@ed.
    26 This requirement also means any references into these arrays, \eg pointers or indexes, may need to be updated if elements are moved for compaction or for any other reason.
     26This requirement also means any references into these arrays, \eg pointers or indexes, may need to be updated if elements are moved for compaction or any other reason.
    2727
    2828There are no performance requirements, within reason, for resizing since it is expected to be rare.
     2929However, this operation has strict correctness requirements since races between resizing and idle sleep can easily lead to deadlocks.
    30 It should also avoid as much as possible any effect on performance when the number of \procs remain constant.
     30It should also avoid as much as possible any effect on performance when the number of \procs remains constant.
     3131This latter requirement prohibits naive solutions, like simply adding a global lock to the ready-queue arrays.
    3232
    3333\subsection{Read-Copy-Update}
    3434One solution is to use the Read-Copy-Update pattern~\cite{wiki:rcu}.
    35 In this pattern, resizing is done by creating a copy of the internal data structures, \eg see Figure~\ref{fig:base-ts2}, updating the copy with the desired changes, and then attempt an Indiana Jones Switch to replace the original with the copy.
     35In this pattern, resizing is done by creating a copy of the internal data structures, \eg see Figure~\ref{fig:base-ts2}, updating the copy with the desired changes, and then attempting an Indiana Jones Switch to replace the original with the copy.
    3636This approach has the advantage that it may not need any synchronization to do the switch.
    3737However, there is a race where \procs still use the original data structure after the copy is switched.
    38 This race not only requires adding a memory-reclamation scheme, it also requires that operations made on the stale original version are eventually moved to the copy.
     38This race not only requires adding a memory-reclamation scheme, but it also requires that operations made on the stale original version are eventually moved to the copy.
    3939
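A sketch of the copy-and-swap step, with illustrative names, might look as follows; reclamation of the original is the hard part and is elided.
\begin{cfa}
// Sketch: build an updated copy, then atomically publish it.
ready_queues * resize( cluster & cl, unsigned nprocs ) {
	ready_queues * old = cl.queues;
	ready_queues * cpy = copy_resized( *old, nprocs );     // hypothetical deep copy
	__atomic_store_n( &cl.queues, cpy, __ATOMIC_RELEASE ); // the Indiana Jones Switch
	return old;  // must outlive all procs still reading it
}
\end{cfa}
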
    4040Specifically, the original data structure must be kept until all \procs have witnessed the change.
     
    4242If all operations need synchronization, then the overall cost of this technique is likely to be similar to an uncontended lock approach.
    4343In addition to the classic challenge of memory reclamation, transferring the original data to the copy before reclaiming it poses additional challenges.
    44 Especially merging subqueues while having a minimal impact on fairness and locality.
    45 
    46 For example, given a linked-list, having a node enqueued onto the original and new list is not necessarily a problem depending on the chosen list structure.
     44Especially merging sub-queues while having a minimal impact on fairness and locality.
     45
     46For example, given a linked list, having a node enqueued onto the original and new list is not necessarily a problem depending on the chosen list structure.
    4747If the list supports arbitrary insertions, then inconsistencies in the tail pointer do not break the list;
    4848however, ordering may not be preserved.
     
    5555A simpler approach is to use a \newterm{Readers-Writer Lock}~\cite{wiki:rwlock}, where the resizing requires acquiring the lock as a writer while simply enqueueing/dequeuing \ats requires acquiring the lock as a reader.
     5656Using a Readers-Writer lock solves the problem of dynamically resizing and leaves the challenge of finding or building a lock with sufficiently good read-side performance.
    57 Since this approach is not a very complex challenge and an ad-hoc solution is perfectly acceptable, building a Readers-Writer lock was the path taken.
     57Since this approach is not a very complex challenge and an ad hoc solution is perfectly acceptable, building a Readers-Writer lock was the path taken.
    5858
    5959To maximize reader scalability, readers should not contend with each other when attempting to acquire and release a critical section.
    60 To achieve this goal requires each reader to have its own memory to mark as locked and unlocked.
    61 The read acquire possibly waits for a writer to finish the critical section and then acquires a reader's local spinlock.
    62 The write acquires the global lock, guaranteeing mutual exclusion among writers, and then acquires each of the local reader locks.
    63 Acquiring all the local read locks guarantees mutual exclusion among the readers and the writer, while the wait on the read side prevents readers from continuously starving the writer.
     60Achieving this goal requires that each reader have its own memory to mark as locked and unlocked.
     61The read-acquire possibly waits for a writer to finish the critical section and then acquires a reader's local spinlock.
     62The writer acquires the global lock, guaranteeing mutual exclusion among writers, and then acquires each of the local reader locks.
     63Acquiring all the local read-locks guarantees mutual exclusion among the readers and the writer, while the wait on the read side prevents readers from continuously starving the writer.
    6464Figure~\ref{f:SpecializedReadersWriterLock} shows the outline for this specialized readers-writer lock.
     6565The lock is nonblocking, so both readers and writers spin while the lock is held.
    66 This very wide sharding strategy means that readers have very good locality, since they only ever need to access two memory location.
     66This very wide sharding strategy means that readers have very good locality since they only ever need to access two memory locations.
    6767
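Paraphrasing the outline referenced above, one possible realization is the following sketch; the structure layout and @Pause()@ are illustrative.
\begin{cfa}
// Sketch: one spinlock slot per reader plus a global writer flag; on the fast
// path a reader touches only the writer flag and its own slot.
void read_lock( rwlock & l, unsigned id ) {
	while ( __atomic_load_n( &l.writer, __ATOMIC_ACQUIRE ) ) Pause();  // avoid starving writers
	while ( __atomic_exchange_n( &l.reader[id], true, __ATOMIC_ACQUIRE ) ) Pause();  // local spinlock
}
void write_lock( rwlock & l ) {
	while ( __atomic_exchange_n( &l.writer, true, __ATOMIC_ACQUIRE ) ) Pause();  // among writers
	for ( i; l.nreader )                           // then take every reader's local lock
		while ( __atomic_exchange_n( &l.reader[i], true, __ATOMIC_ACQUIRE ) ) Pause();
}
\end{cfa}
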
    6868\begin{figure}
     
    9898\section{Idle-Sleep}\label{idlesleep}
    9999While manual resizing of \procs is expected to be rare, the number of \ats can vary significantly over an application's lifetime, which means there are times when there are too few or too many \procs.
    100 For this work, it is the programer's responsibility to manually create \procs, so if there are too few \procs, the application must address this issue.
     100For this work, it is the programmer's responsibility to manually create \procs, so if there are too few \procs, the application must address this issue.
    101101This leaves too many \procs when there are not enough \ats for all the \procs to be useful.
    102102These idle \procs cannot be removed because their lifetime is controlled by the application, and only the application knows when the number of \ats may increase or decrease.
     
    108108Because idle sleep is spurious, this data structure has strict performance requirements, in addition to strict correctness requirements.
    109109Next, some mechanism is needed to block \glspl{kthrd}, \eg @pthread_cond_wait@ on a pthread semaphore.
    110 The complexity here is to support \at parking and unparking, user-level locking, timers, \io operations, and all other \CFA features with minimal complexity.
     110The complexity here is to support \at \glslink{atblock}{parking} and \glslink{atsched}{unparking}, user-level locking, timers, \io operations, and all other \CFA features with minimal complexity.
    111111Finally, the scheduler needs a heuristic to determine when to block and unblock an appropriate number of \procs.
    112112However, this third challenge is outside the scope of this thesis because developing a general heuristic is complex enough to justify its own work.
    113113Therefore, the \CFA scheduler simply follows the ``Race-to-Idle''~\cite{Albers12} approach where a sleeping \proc is woken any time a \at becomes ready and \procs go to idle sleep anytime they run out of work.
    114114
    115 An interesting sub-part of this heuristic is what to do with bursts of \ats that become ready.
    116 Since waking up a sleeping \proc can have notable latency, it is possible multiple \ats become ready while a single \proc is waking up.
    117 This facts begs the question, if many \procs are available, how many should be woken?
    118 If the ready \ats will run longer than the wake-up latency, waking one \proc per \at will offer maximum parallelisation.
    119 If the ready \ats will run for a short very short time, waking many \procs may be wasteful.
     115An interesting subpart of this heuristic is what to do with bursts of \ats that become ready.
     116Since waking up a sleeping \proc can have notable latency, multiple \ats may become ready while a single \proc is waking up.
     117This fact begs the question, if many \procs are available, how many should be woken?
     118If the ready \ats will run longer than the wake-up latency, waking one \proc per \at will offer maximum parallelization.
     119If the ready \ats will run for a very short time, waking many \procs may be wasteful.
     120120As mentioned, a heuristic to handle these complex cases is outside the scope of this thesis, so the behaviour of the scheduler in this particular case is left unspecified.
    121121
    122122\section{Sleeping}
    123 As usual, the corner-stone of any feature related to the kernel is the choice of system call.
     123As usual, the cornerstone of any feature related to the kernel is the choice of system call.
    124124In terms of blocking a \gls{kthrd} until some event occurs, the Linux kernel has many available options.
    125125
    126126\subsection{\lstinline{pthread_mutex}/\lstinline{pthread_cond}}
    127 The classic option is to use some combination of the pthread mutual exclusion and synchronization locks, allowing a safe park/unpark of a \gls{kthrd} to/from a @pthread_cond@.
     127The classic option is to use some combination of the pthread mutual exclusion and synchronization locks, allowing a safe \park/\unpark of a \gls{kthrd} to/from a @pthread_cond@.
    128128While this approach works for \glspl{kthrd} waiting among themselves, \io operations do not provide a mechanism to signal @pthread_cond@s.
    129 For \io results to wake a \proc waiting on a @pthread_cond@ means a different \glspl{kthrd} must be woken up first, which then signals the \proc.
      129For \io results to wake a \proc waiting on a @pthread_cond@, a different \gls{kthrd} must be woken up first, which then signals the \proc.
    130130
    131131\subsection{\lstinline{io_uring} and Epoll}
     
    137137
    138138\subsection{Event FDs}
    139 Another interesting approach is to use an event file descriptor\cite{eventfd}.
     139Another interesting approach is to use an event file descriptor\cite{MAN:eventfd}.
    140140This Linux feature is a file descriptor that behaves like \io, \ie, uses @read@ and @write@, but also behaves like a semaphore.
    141 Indeed, all reads and writes must use a word-sized values, \ie 64 or 32 bits.
    142 Writes \emph{add} their values to a buffer using arithmetic addition versus buffer append, and reads zero out the buffer and return the buffer values so far.\footnote{
     141Indeed, all reads and writes must use word-sized values, \ie 64 or 32 bits.
     142Writes \emph{add} their values to a buffer using arithmetic addition versus buffer append, and reads zero-out the buffer and return the buffer values so far.\footnote{
    143143This behaviour is without the \lstinline{EFD_SEMAPHORE} flag, which changes the behaviour of \lstinline{read} but is not needed for this work.}
    144144If a read is made while the buffer is already 0, the read blocks until a non-0 value is added.
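These semantics are easy to demonstrate in plain C (a minimal sketch, omitting error handling):
\begin{cfa}
#include <sys/eventfd.h>
#include <stdint.h>
#include <unistd.h>

int main() {
	int fd = eventfd( 0, 0 );         // internal counter starts at 0
	uint64_t one = 1, val;
	write( fd, &one, sizeof(one) );   // counter += 1
	write( fd, &one, sizeof(one) );   // counter += 1, now 2
	read( fd, &val, sizeof(val) );    // val == 2, counter reset to 0
	read( fd, &val, sizeof(val) );    // blocks until the next write
}
\end{cfa}
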
     
    148148
    149149\section{Tracking Sleepers}
    150 Tracking which \procs are in idle sleep requires a data structure holding all the sleeping \procs, but more importantly it requires a concurrent \emph{handshake} so that no \at is stranded on a ready-queue with no active \proc.
     150Tracking which \procs are in idle sleep requires a data structure holding all the sleeping \procs, but more importantly, it requires a concurrent \emph{handshake} so that no \at is stranded on a ready queue with no active \proc.
    151151The classic challenge occurs when a \at is made ready while a \proc is going to sleep: there is a race where the new \at may not see the sleeping \proc and the sleeping \proc may not see the ready \at.
    152152Since \ats can be made ready by timers, \io operations, or other events outside a cluster, this race can occur even if the \proc going to sleep is the only \proc awake.
     
    154154
    155155The handshake closing the race is done with both the notifier and the idle \proc executing two ordered steps.
    156 The notifier first make sure the newly ready \at is visible to \procs searching for \ats, and then attempt to notify an idle \proc.
     156The notifier first makes sure the newly ready \at is visible to \procs searching for \ats, and then attempts to notify an idle \proc.
    157157On the other side, \procs make themselves visible as idle \procs and then search for any \ats they may have missed.
     158158Unlike regular work-stealing, this search must be exhaustive to make sure that no pre-existing \at is missed.
     159159These ordered steps from both sides guarantee that if the search misses a newly ready \at, the notifier sees at least one idle \proc.
    160 Conversly, if the notifier does not see any idle \proc, then a \proc is guaranteed to find the new \at in its exhaustive search.
     160Conversely, if the notifier does not see any idle \proc, then a \proc is guaranteed to find the new \at in its exhaustive search.
    161161
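The two ordered steps on each side can be sketched as follows; every name here is illustrative.
\begin{cfa}
// Notifier: publish the ready task first, then look for a sleeper.
void notify_one( cluster & cl, at & a ) {
	push( cl.ready_queue, a );                     // step 1: make the task visible
	__atomic_thread_fence( __ATOMIC_SEQ_CST );
	if ( ! empty( cl.idle_list ) ) wake_one( cl ); // step 2: notify an idle proc
}
// Idle proc: advertise first, then search exhaustively before sleeping.
void go_idle( cluster & cl, processor & p ) {
	push( cl.idle_list, p );                       // step 1: become visible as idle
	__atomic_thread_fence( __ATOMIC_SEQ_CST );
	if ( search_all_queues( cl ) == 0p ) sleep( p ); // step 2: exhaustive search, then sleep
}
\end{cfa}
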
    162162Furthermore, the ``Race-to-Idle'' approach means that there may be contention on the data structure tracking sleepers.
    163 Contention can be tolerated for \procs attempting to sleep or wake-up because these \procs are not doing useful work, and therefore, not contributing to overall performance.
     163Contention can be tolerated for \procs attempting to sleep or wake up because these \procs are not doing useful work, and therefore, not contributing to overall performance.
     165165However, notifying, checking if a \proc must be woken up, and doing so if needed, can significantly affect overall performance and must be low cost.
    165165
    166166\subsection{Sleepers List}
    167167Each cluster maintains a list of idle \procs, organized as a stack.
    168 This ordering allows \procs at the tail to stay in idle sleep for extended period of times while those at the head of the list wake-up for bursts of activity.
     168This ordering allows \procs at the tail to stay in idle sleep for extended periods while those at the head of the list wake up for bursts of activity.
    169169Because of unbalanced performance requirements, the algorithm tracking sleepers is designed to have idle \procs handle as much of the work as possible.
     170170The idle \procs maintain the stack of sleepers among themselves, so notifying a sleeping \proc takes as little work as possible.
     
    173173
    174174This approach also simplifies notification.
    175 Indeed, \procs not only need to be notify when a new \at is readied, but also must be notified during manual resizing, so the \gls{kthrd} can be joined.
     175Indeed, \procs not only need to be notified when a new \at is readied, but must also be notified during manual resizing, so the \gls{kthrd} can be joined.
    176176These requirements mean whichever entity removes idle \procs from the sleeper list must be able to do so in any order.
    177177Using a simple lock over this data structure makes the removal much simpler than using a lock-free data structure.
    178 The single lock also means the notification process simply needs to wake-up the desired idle \proc, using @pthread_cond_signal@, @write@ on an @fd@, \etc, and the \proc handles the rest.
     178The single lock also means the notification process simply needs to wake up the desired idle \proc, using @pthread_cond_signal@, @write@ on an @fd@, \etc, and the \proc handles the rest.
    179179
    180180\subsection{Reducing Latency}
    181 As mentioned in this section, \procs going to sleep for extremely short periods of time is likely in certain scenarios.
    182 Therefore, the latency of doing a system call to read from and writing to an event @fd@ can negatively affect overall performance in a notable way.
     181As mentioned in this section, \procs going to sleep for extremely short periods is likely in certain scenarios.
      182Therefore, the latency of the system calls to read from and write to an event @fd@ can notably degrade overall performance.
    183183Hence, it is important to reduce latency and contention of the notification as much as possible.
    184184Figure~\ref{fig:idle1} shows the basic idle-sleep data structure.
     
     205205The woken \proc then updates the atomic pointer as it removes itself from the head of the list.
     206206Notifiers that obtained a @NULL@ in the exchange simply move on, knowing that another notifier is already waking a \proc.
    207 This behaviour is equivalent to having multiple notifier write to the @fd@ since reads consume all previous writes.
    208 Note that with and without this atomic pointer, bursts of notification can lead to an unspecified number of \procs being woken up, depending on how the arrival notification compares witht the latency of \procs waking up.
     207This behaviour is equivalent to having multiple notifiers write to the @fd@ since reads consume all previous writes.
     208Note that with and without this atomic pointer, bursts of notification can lead to an unspecified number of \procs being woken up, depending on how the arrival notification compares with the latency of \procs waking up.
    209209As mentioned in section~\ref{idlesleep}, there is no optimal approach to handle these bursts.
    210210It is therefore difficult to justify the cost of any extra synchronization here.
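The resulting fast path amounts to a single atomic exchange per notification; the @idle_head@ and @idle_fd@ names below are illustrative.
\begin{cfa}
// Sketch: one atomic exchange decides which notifier wakes the head sleeper.
void wake_one( cluster & cl ) {
	processor * p = __atomic_exchange_n( &cl.idle_head, 0p, __ATOMIC_SEQ_CST );
	if ( p == 0p ) return;                   // another notifier is already waking a proc
	uint64_t one = 1;
	write( p->idle_fd, &one, sizeof(one) );  // reads consume all previous writes
}
\end{cfa}
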
     
    218218
    219219The next optimization is to avoid the latency of the event @fd@, which can be done by adding what is effectively a binary benaphore\cite{schillings1996engineering} in front of the event @fd@.
    220 The benaphore over the event @fd@ logically provides a three state flag to avoid unnecessary system calls, where the states are expressed explicit in Figure~\ref{fig:idle:state}.
    221 A \proc begins its idle sleep by adding itself to the idle list before searching for an \at.
     220The benaphore over the event @fd@ logically provides a three-state flag to avoid unnecessary system calls, where the states are expressed explicitly in Figure~\ref{fig:idle:state}.
     221A \proc begins its idle sleep by adding itself to the idle list before searching for a \at.
    222222In the process of adding itself to the idle list, it sets the state flag to @SEARCH@.
    223223If no \ats can be found during the search, the \proc then confirms it is going to sleep by atomically swapping the state to @SLEEP@.
    224224If the previous state is still @SEARCH@, then the \proc does read the event @fd@.
    225 Meanwhile, notifiers atomically exchange the state to @AWAKE@ state.
     225Meanwhile, notifiers atomically exchange the state to the @AWAKE@ state.
    226226If the previous state is @SLEEP@, then the notifier must write to the event @fd@.
     227227However, if the notification arrives almost immediately after the \proc marks itself idle, then both reads and writes on the event @fd@ can be omitted, which reduces latency notably.
    228 These extensions leads to the final data structure shown in Figure~\ref{fig:idle}.
     228These extensions lead to the final data structure shown in Figure~\ref{fig:idle}.
    229229
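The state machine can be sketched as follows, with illustrative names; only transitions through @SLEEP@ touch the event @fd@, and cleanup of the flag after a skipped sleep is elided.
\begin{cfa}
enum { SEARCH, SLEEP, AWAKE };
// Idle proc: swap to SLEEP; read the fd only if no notification raced ahead.
void idle_sleep( processor & p ) {
	p.state = SEARCH;                          // set while joining the idle list
	if ( search_all_queues() != 0p ) { p.state = AWAKE; return; }
	if ( __atomic_exchange_n( &p.state, SLEEP, __ATOMIC_SEQ_CST ) == SEARCH ) {
		uint64_t val;
		read( p.idle_fd, &val, sizeof(val) );  // really sleep
	}
}
// Notifier: swap to AWAKE; write the fd only if the proc actually slept.
void notify( processor & p ) {
	if ( __atomic_exchange_n( &p.state, AWAKE, __ATOMIC_SEQ_CST ) == SLEEP ) {
		uint64_t one = 1;
		write( p.idle_fd, &one, sizeof(one) );
	}
}
\end{cfa}
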
    230230\begin{figure}
    231231        \centering
    232232        \input{idle_state.pstex_t}
    233         \caption[Improved Idle-Sleep Latency]{Improved Idle-Sleep Latency \smallskip\newline A three state flag is added to the event \lstinline{fd}.}
     233        \caption[Improved Idle-Sleep Latency]{Improved Idle-Sleep Latency \smallskip\newline A three-state flag is added to the event \lstinline{fd}.}
    234234        \label{fig:idle:state}
    235235\end{figure}
  • doc/theses/thierry_delisle_PhD/thesis/text/runtime.tex

    rebf8ca5 r23a08aa0  
    44\section{C Threading}
    55
    6 \Celeven introduced threading features, such the @_Thread_local@ storage class, and libraries @stdatomic.h@ and @threads.h@.
     6\Celeven introduced threading features, such as the @_Thread_local@ storage class, and libraries @stdatomic.h@ and @threads.h@.
    77Interestingly, almost a decade after the \Celeven standard, the most recent versions of gcc, clang, and msvc do not support the \Celeven include @threads.h@, indicating no interest in the C11 concurrency approach (possibly because of the recent effort to add concurrency to \CC).
    88While the \Celeven standard does not state a threading model, the historical association with pthreads suggests implementations would adopt kernel-level threading (1:1)~\cite{ThreadModel}, as for \CC.
     
    1313\section{M:N Threading}\label{prev:model}
    1414
    15 Threading in \CFA is based on \Gls{uthrding}, where \glspl{thrd} are the representation of a unit of work. As such, \CFA programmers should expect these units to be fairly inexpensive, \ie programmers should be able to create a large number of \glspl{thrd} and switch among \glspl{thrd} liberally without many concerns for performance.
     15Threading in \CFA is based on \Gls{uthrding}, where \ats are the representation of a unit of work. As such, \CFA programmers should expect these units to be fairly inexpensive, \ie programmers should be able to create a large number of \ats and switch among \ats liberally without many performance concerns.
    1616
    17 The \CFA M:N threading models is implemented using many user-level threads mapped onto fewer \glspl{kthrd}.
     17The \CFA M:N threading model is implemented using many user-level threads mapped onto fewer \glspl{kthrd}.
    1818The user-level threads have the same semantic meaning as a \glspl{kthrd} in the 1:1 model: they represent an independent thread of execution with its own stack.
    19 The difference is that user-level threads do not have a corresponding object in the kernel; they are handled by the runtime in user space and scheduled onto \glspl{kthrd}, referred to as \glspl{proc} in this document. \Glspl{proc} run a \gls{thrd} until it context switches out, it then chooses a different \gls{thrd} to run.
      19The difference is that user-level threads do not have a corresponding object in the kernel; they are handled by the runtime in user space and scheduled onto \glspl{kthrd}, referred to as \glspl{proc} in this document. \Glspl{proc} run a \at until it context switches out; the \gls{proc} then chooses a different \at to run.
    2020
    2121\section{Clusters}
     2121\CFA allows grouping user-level threading into clusters.
    23 Both \glspl{thrd} and \glspl{proc} belong to a specific cluster.
    24 \Glspl{thrd} are only scheduled onto \glspl{proc} in the same cluster and scheduling is done independently of other clusters.
     23Both \ats and \glspl{proc} belong to a specific cluster.
     24\Glspl{at} are only scheduled onto \glspl{proc} in the same cluster and scheduling is done independently of other clusters.
    2525Figure~\ref{fig:system} shows an overview of the \CFA runtime, which allows programmers to tightly control parallelism.
    2626It also opens the door to handling effects like NUMA, by pinning clusters to a specific NUMA node\footnote{This capability is not currently implemented in \CFA, but the only hurdle left is creating a generic interface for CPU masks.}.
     
    3030                \input{system.pstex_t}
    3131        \end{center}
    32         \caption[Overview of the \CFA runtime]{Overview of the \CFA runtime \newline \Glspl{thrd} are scheduled inside a particular cluster and run on the \glspl{proc} that belong to the cluster. The discrete-event manager, which handles preemption and timeout, is a \gls{proc} that lives outside any cluster and does not run \glspl{thrd}.}
     32        \caption[Overview of the \CFA runtime]{Overview of the \CFA runtime \newline \Glspl{at} are scheduled inside a particular cluster and run on the \glspl{proc} that belong to the cluster. The discrete-event manager, which handles preemption and timeout, is a \gls{proc} that lives outside any cluster and does not run \ats.}
    3333        \label{fig:system}
    3434\end{figure}
     
    3838
    3939\section{\glsxtrshort{io}}\label{prev:io}
    40 Prior to this work, the \CFA runtime did not add any particular support for \glsxtrshort{io} operations. While all \glsxtrshort{io} operations available in C are available in \CFA, \glsxtrshort{io} operations are designed for the POSIX threading model~\cite{pthreads}. Using these 1:1 threading operations in an M:N threading model means \glsxtrshort{io} operations block \glspl{proc} instead of \glspl{thrd}. While this can work in certain cases, it limits the number of concurrent operations to the number of \glspl{proc} rather than \glspl{thrd}. It also means deadlock can occur because all \glspl{proc} are blocked even if at least one \gls{thrd} is ready to run. A simple example of this type of deadlock would be as follows:
     40Prior to this work, the \CFA runtime did not add any particular support for \glsxtrshort{io} operations. While all \glsxtrshort{io} operations available in C are available in \CFA, \glsxtrshort{io} operations are designed for the POSIX threading model~\cite{pthreads}. Using these 1:1 threading operations in an M:N threading model means \glsxtrshort{io} operations block \glspl{proc} instead of \ats. While this can work in certain cases, it limits the number of concurrent operations to the number of \glspl{proc} rather than \ats. It also means deadlock can occur because all \glspl{proc} are blocked even if at least one \at is ready to run. A simple example of this type of deadlock would be as follows:
    4141
    4242\begin{quote}
    43 Given a simple network program with 2 \glspl{thrd} and a single \gls{proc}, one \gls{thrd} sends network requests to a server and the other \gls{thrd} waits for a response from the server.
    44 If the second \gls{thrd} races ahead, it may wait for responses to requests that have not been sent yet.
    45 In theory, this should not be a problem, even if the second \gls{thrd} waits, because the first \gls{thrd} is still ready to run and should be able to get CPU time to send the request.
    46 With M:N threading, while the first \gls{thrd} is ready, the lone \gls{proc} \emph{cannot} run the first \gls{thrd} if it is blocked in the \glsxtrshort{io} operation of the second \gls{thrd}.
    47 If this happen, the system is in a synchronization deadlock\footnote{In this example, the deadlock could be resolved if the server sends unprompted messages to the client.
     43Given a simple network program with 2 \ats and a single \gls{proc}, one \at sends network requests to a server and the other \at waits for a response from the server.
     44If the second \at races ahead, it may wait for responses to requests that have not been sent yet.
     45In theory, this should not be a problem, even if the second \at waits, because the first \at is still ready to run and should be able to get CPU time to send the request.
     46With M:N threading, while the first \at is ready, the lone \gls{proc} \emph{cannot} run the first \at if it is blocked in the \glsxtrshort{io} operation of the second \at.
     47If this happens, the system is in a synchronization deadlock\footnote{In this example, the deadlock could be resolved if the server sends unprompted messages to the client.
    4848However, this solution is neither general nor appropriate even in this simple case.}.
    4949\end{quote}
    5050
    51 Therefore, one of the objective of this work is to introduce \emph{User-Level \glsxtrshort{io}}, which like \glslink{uthrding}{User-Level \emph{Threading}}, blocks \glspl{thrd} rather than \glspl{proc} when doing \glsxtrshort{io} ope      rations.
    52 This feature entails multiplexing the \glsxtrshort{io} operations of many \glspl{thrd} onto fewer \glspl{proc}.
     51Therefore, one of the objectives of this work is to introduce \emph{User-Level \glsxtrshort{io}}, which like \glslink{uthrding}{User-Level \emph{Threading}}, blocks \ats rather than \glspl{proc} when doing \glsxtrshort{io} operations.
     52This feature entails multiplexing the \glsxtrshort{io} operations of many \ats onto fewer \glspl{proc}.
    5353The multiplexing requires a single \gls{proc} to execute multiple \glsxtrshort{io} operations in parallel.
     5454This requirement cannot be met with operations that block \glspl{proc}, \ie \glspl{kthrd}, since the first operation would prevent starting new operations for its blocking duration.
     
    6060All functions defined by this volume of POSIX.1-2017 shall be thread-safe, except that the following functions need not be thread-safe. ... (list of 70+ excluded functions)
    6161\end{quote}
    62 Only UNIX @man@ pages identify whether or not a library function is thread safe, and hence, may block on a pthreads lock or system call; hence interoperability with UNIX library functions is a challenge for an M:N threading model.
     62Only UNIX @man@ pages identify whether a library function is thread-safe, and hence, may block on a pthreads lock or system call; hence interoperability with UNIX library functions is a challenge for an M:N threading model.
    6363
     6464Languages like Go and Java, which have strict interoperability with C\cite{wiki:jni,go:cgo}, can control operations in C by ``sandboxing'' them, \eg a blocking function may be delegated to a \gls{kthrd}. Sandboxing may help guarantee that the kind of deadlock mentioned above does not occur.
     
     7272Therefore, it is possible that calls to an unknown library function can block a \gls{kthrd}, leading to deadlocks in \CFA's M:N threading model, which would not occur in a traditional 1:1 threading model.
    7373Currently, all M:N thread systems interacting with UNIX without sandboxing suffer from this problem but manage to work very well in the majority of applications.
    74 Therefore, a complete solution to this problem is outside the scope of this thesis.\footnote{\CFA does provide a pthreads emulation, so any library function using embedded pthreads locks are redirected to \CFA user-level locks. This capability further reduces the chances of blocking a \gls{kthrd}.}
     74Therefore, a complete solution to this problem is outside the scope of this thesis.\footnote{\CFA does provide a pthreads emulation, so any library function using embedded pthreads locks is redirected to \CFA user-level locks. This capability further reduces the chances of blocking a \gls{kthrd}.}
  • doc/theses/thierry_delisle_PhD/thesis/thesis.tex

    rebf8ca5 r23a08aa0  
    8484\usepackage{subcaption}
    8585\usepackage{comment} % Removes large sections of the document.
     86\usepackage{array}
    8687
    8788% Hyperlinks make it very easy to navigate an electronic document.
     
    210211\newcommand\proc{\gls{proc}\xspace}%
    211212\newcommand\procs{\glspl{proc}\xspace}%
     213\newcommand\park{\glslink{atblock}{park}\xspace}%
     214\newcommand\unpark{\glslink{atsched}{unpark}\xspace}%
    212215
    213216%======================================================================
  • driver/Makefile.am

    rebf8ca5 r23a08aa0  
    1919
    2020# applies to both programs
    21 AM_CXXFLAGS = @HOST_FLAGS@ -Wall -Wextra -Werror=return-type -O2 -g -std=c++14 -I${abs_top_srcdir}/src -I${abs_top_srcdir}/src/include
     21AM_CXXFLAGS = @HOST_FLAGS@ -Wall -Wextra -Werror=return-type -O2 -g -std=c++17 -I${abs_top_srcdir}/src -I${abs_top_srcdir}/src/include
    2222
    2323# don't install cfa directly
  • driver/cfa.cc

    rebf8ca5 r23a08aa0  
    5353        return arg.substr( 0, pre.size() ) == pre;
    5454} // prefix
    55 
    56 static inline bool ends_with(const string & str, const string & sfix) {
    57         if (sfix.size() > str.size()) return false;
    58         return std::equal(str.rbegin(), str.rbegin() + sfix.size(), sfix.rbegin(), sfix.rend());
    59 }
    6055
    6156// check if string has suffix
  • libcfa/prelude/Makefile.am

    rebf8ca5 r23a08aa0  
    5050
    5151prelude.cfa : prelude-gen.cc
    52         ${AM_V_GEN}${CXX} ${AM_CXXFLAGS} ${CXXFLAGS} ${AM_CFLAGS} ${<} -o prelude-gen -Wall -Wextra -O2 -g -std=c++14
     52        ${AM_V_GEN}${CXX} ${AM_CXXFLAGS} ${CXXFLAGS} ${AM_CFLAGS} ${<} -o prelude-gen -Wall -Wextra -O2 -g -std=c++17
    5353        @./prelude-gen > ${@}
    5454        @rm ./prelude-gen
     
    7676
    7777if ENABLE_DISTCC
    78 distribution: @LOCAL_CFACC@ @LOCAL_CC1@ @CFACPP@ gcc-builtins.cf builtins.cf extras.cf prelude.cfa bootloader.c $(srcdir)/../../tools/build/push2dist.sh
     78distribution: @LOCAL_CFACC@ @LOCAL_CC1@ @CFACPP@ defines.hfa gcc-builtins.cf builtins.cf extras.cf prelude.cfa bootloader.c $(srcdir)/../../tools/build/push2dist.sh
    7979        ${AM_V_GEN}$(srcdir)/../../tools/build/push2dist.sh @CFADIR_HASH@ @DIST_BWLIMIT@
    8080        @echo "Dummy file to track distribution to remote hosts" > ${@}
  • libcfa/prelude/defines.hfa.in

    rebf8ca5 r23a08aa0  
    141141
    142142/* Defined if io_uring support is present when compiling libcfathread and
     143   supports the flag IORING_REGISTER_IOWQ_MAX_WORKERS. */
     144#undef CFA_HAVE_IORING_REGISTER_IOWQ_MAX_WORKERS
     145
     146/* Defined if io_uring support is present when compiling libcfathread and
    143147   supports the flag IORING_SETUP_ATTACH_WQ. */
    144148#undef CFA_HAVE_IORING_SETUP_ATTACH_WQ
  • libcfa/src/Makefile.am

    rebf8ca5 r23a08aa0  
    186186if ENABLE_DISTCC
    187187
    188 ../prelude/distribution: @LOCAL_CFACC@ @LOCAL_CC1@ @CFACPP@ ../prelude/gcc-builtins.cf ../prelude/builtins.cf ../prelude/extras.cf ../prelude/prelude.cfa ../prelude/bootloader.c $(srcdir)/../../tools/build/push2dist.sh
     188../prelude/distribution: @LOCAL_CFACC@ @LOCAL_CC1@ @CFACPP@ ../prelude/defines.hfa ../prelude/gcc-builtins.cf ../prelude/builtins.cf ../prelude/extras.cf ../prelude/prelude.cfa ../prelude/bootloader.c $(srcdir)/../../tools/build/push2dist.sh
    189189        @+make -C ../prelude distribution
    190190
  • libcfa/src/bits/defs.hfa

    rebf8ca5 r23a08aa0  
    2424#define likely(x)   __builtin_expect(!!(x), 1)
    2525#define unlikely(x) __builtin_expect(!!(x), 0)
    26 #define thread_local _Thread_local
    2726
    2827typedef void (*fptr_t)();
     
    3736#endif
    3837
     38
     39#if defined(__has_attribute)
     40#if !__has_attribute(__noclone__)
     41#define ATTRIBUTE_NOCLONE
     42#endif
     43#endif
     44#ifndef ATTRIBUTE_NOCLONE
     45#define ATTRIBUTE_NOCLONE __attribute__((__noclone__))
     46#endif
     47
    3948#define libcfa_public __attribute__((visibility("default")))
     49#define libcfa_nopreempt __attribute__((section("cfatext_nopreempt"))) __attribute__((__noinline__)) ATTRIBUTE_NOCLONE
     50
     51struct __cfa_nopreempt_region {
     52        void * start;
     53        void * stop;
     54};
    4055
    4156#ifdef __cforall
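
The hunk above is the usual compile-time feature-detection idiom: ask the compiler whether an attribute exists when it can be asked, and otherwise assume it does (GCC predates `__has_attribute` but has always had `__noclone__`). The same pattern for a different attribute, with a hypothetical MY_COLD macro:

	#if defined(__has_attribute)
	#  if !__has_attribute(__cold__)
	#    define MY_COLD            /* compiler says no: expand to nothing */
	#  endif
	#endif
	#ifndef MY_COLD
	#  define MY_COLD __attribute__((__cold__))
	#endif

	MY_COLD static void rarely_taken_path() {}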
  • libcfa/src/bits/locks.hfa

    rebf8ca5 r23a08aa0  
    1313// Created On       : Tue Oct 31 15:14:38 2017
    1414// Last Modified By : Peter A. Buhr
    15 // Last Modified On : Sat Aug 27 15:06:39 2022
    16 // Update Count     : 15
     15// Last Modified On : Mon Sep 19 18:39:45 2022
     16// Update Count     : 16
    1717//
    1818
  • libcfa/src/concurrency/io/call.cfa.in

    rebf8ca5 r23a08aa0  
    202202                struct io_context$ * ctx = cfa_io_allocate( &sqe, &idx, 1 );
    203203
     204                memset(sqe, 0, sizeof(*sqe));
    204205                sqe->opcode = IORING_OP_{op};
     206                sqe->flags = sflags;
    205207                sqe->user_data = (uintptr_t)&future;
    206                 sqe->flags = sflags;
    207                 sqe->ioprio = 0;
    208                 sqe->fd = 0;
    209                 sqe->off = 0;
    210                 sqe->addr = 0;
    211                 sqe->len = 0;
    212                 sqe->fsync_flags = 0;
    213                 sqe->__pad2[0] = 0;
    214                 sqe->__pad2[1] = 0;
    215                 sqe->__pad2[2] = 0;{body}
     208                {body}
    216209
    217210                asm volatile("": : :"memory");
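
The motivation for the memset above, in isolation: zeroing the whole SQE once is robust against the kernel's uapi struct growing new fields, which per-field zeroing silently misses. A sketch against the raw uapi header:

	#include <linux/io_uring.h>
	#include <string.h>

	static void prep_sqe(struct io_uring_sqe * sqe, unsigned char op, unsigned long long user_data) {
		memset(sqe, 0, sizeof(*sqe));  // every field, pad, and union member starts at zero
		sqe->opcode = op;              // then set only what this operation needs
		sqe->user_data = user_data;
	}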
  • libcfa/src/concurrency/io/setup.cfa

    rebf8ca5 r23a08aa0  
    228228
    229229                #if !defined(CFA_WITH_IO_URING_IDLE)
     230                {
    230231                        // Step 4 : eventfd
    231232                        __cfadbg_print_safe(io_core, "Kernel I/O : registering %d for completion with ring %d\n", procfd, fd);
     
    237238
    238239                        __cfadbg_print_safe(io_core, "Kernel I/O : registered %d for completion with ring %d\n", procfd, fd);
    239                 #endif
    240 
     240                }
     241                #endif
     242
     243                // TODO: implement a proper version of this.
     244                // I have not found a better maximum that works in general but users should be able to configure it
     245                // the same way they configure other I/O options
    241246                // #if defined(CFA_HAVE_IORING_REGISTER_IOWQ_MAX_WORKERS)
     247                // {
    242248                //      // Step 5 : max worker count
    243249                //      __cfadbg_print_safe(io_core, "Kernel I/O : limiting max workers for ring %d\n", fd);
     
    252258
    253259                //      __cfadbg_print_safe(io_core, "Kernel I/O : limited max workers for ring %d\n", fd);
     260                // }
    254261                // #endif
    255262
  • libcfa/src/concurrency/kernel/cluster.hfa

    rebf8ca5 r23a08aa0  
    6363                }
    6464        }
    65         return (max + 2 * max) / 2;
     65        return 8 * max;
    6666}
    6767
  • libcfa/src/concurrency/kernel/fwd.hfa

    rebf8ca5 r23a08aa0  
    3535extern "C" {
    3636        extern "Cforall" {
    37                 extern __attribute__((aligned(64))) thread_local struct KernelThreadData {
     37                extern __attribute__((aligned(64))) __thread struct KernelThreadData {
    3838                        struct thread$          * volatile this_thread;
    3939                        struct processor        * volatile this_processor;
     
    179179                // Similar to a binary semaphore with a 'one shot' semantic
    180180                // is expected to be discarded after each party call their side
     181                enum(struct thread$ *) { oneshot_ARMED = 0p, oneshot_FULFILLED = 1p };
    181182                struct oneshot {
    182183                        // Internal state :
    183                         //     0p     : is initial state (wait will block)
    184                         //     1p     : fulfilled (wait won't block)
     184                        // armed      : initial state, wait will block
     185                        // fulfilled  : wait won't block
    185186                        // any thread : a thread is currently waiting
    186187                        struct thread$ * volatile ptr;
     
    189190                static inline {
    190191                        void  ?{}(oneshot & this) {
    191                                 this.ptr = 0p;
     192                                this.ptr = oneshot_ARMED;
    192193                        }
    193194
     
    199200                                for() {
    200201                                        struct thread$ * expected = this.ptr;
    201                                         if(expected == 1p) return false;
     202                                        if(expected == oneshot_FULFILLED) return false;
    202203                                        if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
    203204                                                park();
    204                                                 /* paranoid */ verify( this.ptr == 1p );
     205                                                /* paranoid */ verify( this.ptr == oneshot_FULFILLED );
    205206                                                return true;
    206207                                        }
     
    211212                        // return true if a thread was unparked
    212213                        thread$ * post(oneshot & this, bool do_unpark = true) {
    213                                 struct thread$ * got = __atomic_exchange_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
    214                                 if( got == 0p || got == 1p ) return 0p;
     214                                struct thread$ * got = __atomic_exchange_n( &this.ptr, oneshot_FULFILLED, __ATOMIC_SEQ_CST);
     215                                if( got == oneshot_ARMED || got == oneshot_FULFILLED ) return 0p;
    215216                                if(do_unpark) unpark( got );
    216217                                return got;
     
    223224                // thread on "any of" [a given set of] futures.
    224225                // does not support multiple threads waiting on the same future
     226                enum(struct oneshot *) { future_ARMED = 0p, future_FULFILLED = 1p, future_PROGRESS = 2p, future_ABANDONED = 3p };
    225227                struct future_t {
    226228                        // Internal state :
    227                         //     0p      : is initial state (wait will block)
    228                         //     1p      : fulfilled (wait won't block)
    229                         //     2p      : in progress ()
    230                         //     3p      : abandoned, server should delete
     229                        // armed       : initial state, wait will block
     230                        // fulfilled   : result is ready, wait won't block
     231                        // progress    : someone else is in the process of fulfilling this
     232                        // abandoned   : client no longer cares, server should delete
    231233                        // any oneshot : a context has been setup to wait, a thread could wait on it
    232234                        struct oneshot * volatile ptr;
     
    235237                static inline {
    236238                        void  ?{}(future_t & this) {
    237                                 this.ptr = 0p;
     239                                this.ptr = future_ARMED;
    238240                        }
    239241
     
    242244                        void reset(future_t & this) {
    243245                                // needs to be in the armed or fulfilled state
    244                                 __atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);
     246                                __atomic_exchange_n( &this.ptr, future_ARMED, __ATOMIC_SEQ_CST);
    245247                        }
    246248
    247249                        // check if the future is available
    248250                        bool available( future_t & this ) {
    249                                 while( this.ptr == 2p ) Pause();
    250                                 return this.ptr == 1p;
     251                                while( this.ptr == future_PROGRESS ) Pause();
     252                                return this.ptr == future_FULFILLED;
    251253                        }
    252254
     
    254256                        // intended to be used by wait, wait_any, waitfor, etc. rather than used directly
    255257                        bool setup( future_t & this, oneshot & wait_ctx ) {
    256                                 /* paranoid */ verify( wait_ctx.ptr == 0p || wait_ctx.ptr == 1p );
     258                                /* paranoid */ verify( wait_ctx.ptr == oneshot_ARMED || wait_ctx.ptr == oneshot_FULFILLED );
    257259                                // The future needs to set the wait context
    258260                                for() {
    259261                                        struct oneshot * expected = this.ptr;
    260262                                        // Is the future already fulfilled?
    261                                         if(expected == 1p) return false; // Yes, just return false (didn't block)
     263                                        if(expected == future_FULFILLED) return false; // Yes, just return false (didn't block)
    262264
    263265                                        // The future is not fulfilled, try to setup the wait context
     
    277279
    278280                                // attempt to remove the context so it doesn't get consumed.
    279                                 if(__atomic_compare_exchange_n( &this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
     281                                if(__atomic_compare_exchange_n( &this.ptr, &expected, future_ARMED, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
    280282                                        // we still have the original context, then no one else saw it
    281283                                        return false;
    282284                                }
    283285
    284                                 // expected == 0p: future was never actually setup, just return
    285                                 if( expected == 0p ) return false;
    286 
    287                                 // expected == 1p: the future is ready and the context was fully consumed
     286                                // expected == ARMED: future was never actually setup, just return
     287                                if( expected == future_ARMED ) return false;
     288
     289                                // expected == FULFILLED: the future is ready and the context was fully consumed
    288290                                // the server won't use the pointer again
    289291                                // It is safe to delete (which could happen after the return)
    290                                 if( expected == 1p ) return true;
    291 
    292                                 // expected == 2p: the future is ready but the context hasn't fully been consumed
     292                                if( expected == future_FULFILLED ) return true;
     293
     294                                // expected == PROGRESS: the future is ready but the context hasn't fully been consumed
    293295                                // spin until it is safe to move on
    294                                 if( expected == 2p ) {
    295                                         while( this.ptr != 1p ) Pause();
    296                                         /* paranoid */ verify( this.ptr == 1p );
     296                                if( expected == future_PROGRESS ) {
     297                                        while( this.ptr != future_FULFILLED ) Pause();
     298                                        /* paranoid */ verify( this.ptr == future_FULFILLED );
    297299                                        return true;
    298300                                }
     
    305307                        // Mark the future as abandoned, meaning it will be deleted by the server
    306308                        bool abandon( future_t & this ) {
    307                                 /* paranoid */ verify( this.ptr != 3p );
     309                                /* paranoid */ verify( this.ptr != future_ABANDONED );
    308310
    309311                                // Mark the future as abandoned
    310                                 struct oneshot * got = __atomic_exchange_n( &this.ptr, 3p, __ATOMIC_SEQ_CST);
     312                                struct oneshot * got = __atomic_exchange_n( &this.ptr, future_ABANDONED, __ATOMIC_SEQ_CST);
    311313
    312314                                // If the future isn't already fulfilled, let the server delete it
    313                                 if( got == 0p ) return false;
    314 
    315                                 // got == 2p: the future is ready but the context hasn't fully been consumed
     315                                if( got == future_ARMED ) return false;
     316
     317                                // got == PROGRESS: the future is ready but the context hasn't fully been consumed
    316318                                // spin until it is safe to move on
    317                                 if( got == 2p ) {
    318                                         while( this.ptr != 1p ) Pause();
    319                                         got = 1p;
     319                                if( got == future_PROGRESS ) {
     320                                        while( this.ptr != future_FULFILLED ) Pause();
     321                                        got = future_FULFILLED;
    320322                                }
    321323
    322324                                // The future is completed, delete it now
    323                                 /* paranoid */ verify( this.ptr != 1p );
     325                                /* paranoid */ verify( this.ptr != future_FULFILLED );
    324326                                free( &this );
    325327                                return true;
     
    336338                                                #pragma GCC diagnostic ignored "-Wfree-nonheap-object"
    337339                                        #endif
    338                                                 if( expected == 3p ) { free( &this ); return 0p; }
     340                                                if( expected == future_ABANDONED ) { free( &this ); return 0p; }
    339341                                        #if defined(__GNUC__) && __GNUC__ >= 7
    340342                                                #pragma GCC diagnostic pop
    341343                                        #endif
    342344
    343                                         /* paranoid */ verify( expected != 1p ); // Future is already fulfilled, should not happen
    344                                         /* paranoid */ verify( expected != 2p ); // Future is bein fulfilled by someone else, this is even less supported then the previous case.
     345                                        /* paranoid */ verify( expected != future_FULFILLED ); // Future is already fulfilled, should not happen
      346                                        /* paranoid */ verify( expected != future_PROGRESS ); // Future is being fulfilled by someone else, which is even less supported than the previous case.
    345347
    346348                                        // If there is a wait context, we need to consume it and mark it as consumed after
    347349                                        // If there is no context then we can skip the in progress phase
    348                                         struct oneshot * want = expected == 0p ? 1p : 2p;
     350                                        struct oneshot * want = expected == future_ARMED ? future_FULFILLED : future_PROGRESS;
    349351                                        if(__atomic_compare_exchange_n(&this.ptr, &expected, want, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
    350                                                 if( expected == 0p ) { return 0p; }
     352                                                if( expected == future_ARMED ) { return 0p; }
    351353                                                thread$ * ret = post( *expected, do_unpark );
    352                                                 __atomic_store_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
     354                                                __atomic_store_n( &this.ptr, future_FULFILLED, __ATOMIC_SEQ_CST);
    353355                                                return ret;
    354356                                        }
     
    366368
    367369                                // Wait for the future to be fulfilled
    368                                 while( this.ptr == 2p ) Pause();
     370                                while( this.ptr == future_PROGRESS ) Pause();
    369371                                // Make sure the state makes sense
    370372                                // Should be fulfilled, could be in progress but it's out of date if so
     
    372374                                // and the oneshot should not be needed any more
    373375                                __attribute__((unused)) struct oneshot * was = this.ptr;
    374                                 /* paranoid */ verifyf( was == 1p, "Expected this.ptr to be 1p, was %p\n", was );
      376                                /* paranoid */ verifyf( was == future_FULFILLED, "Expected this.ptr to be FULFILLED, was %p\n", was );
    375377
    376378                                // Mark the future as fulfilled, to be consistent
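
The protocol the renamed constants encode, reduced to a standalone sketch with the same GCC atomics. park()/unpark() are hypothetical stand-ins for CFA's scheduler hooks; ptr is ARMED, FULFILLED, or a value identifying the single parked waiter.

	#include <stdint.h>

	static const uintptr_t ONESHOT_ARMED     = 0;
	static const uintptr_t ONESHOT_FULFILLED = 1;

	struct oneshot { volatile uintptr_t ptr = ONESHOT_ARMED; };

	static void park() {}             // hypothetical: block the current thread
	static void unpark(uintptr_t) {}  // hypothetical: make a thread runnable again

	// returns true if the caller had to block
	static bool wait(oneshot & o, uintptr_t self) {
		for (;;) {
			uintptr_t expected = o.ptr;
			if (expected == ONESHOT_FULFILLED) return false;  // already posted
			if (__atomic_compare_exchange_n(&o.ptr, &expected, self, false,
			                                __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
				park();                                   // sleep until post()
				return true;
			}
		}
	}

	// returns the waiter that was woken, or 0 if none was parked
	static uintptr_t post(oneshot & o) {
		uintptr_t got = __atomic_exchange_n(&o.ptr, ONESHOT_FULFILLED, __ATOMIC_SEQ_CST);
		if (got == ONESHOT_ARMED || got == ONESHOT_FULFILLED) return 0;
		unpark(got);
		return got;
	}

	int main() {
		oneshot o;
		post(o);                 // fulfil first ...
		return wait(o, 0xbeef);  // ... so wait returns false without parking
	}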
  • libcfa/src/concurrency/kernel/private.hfa

    rebf8ca5 r23a08aa0  
    8888#elif defined(CFA_HAVE_LINUX_RSEQ_H)
    8989        extern "Cforall" {
    90                 extern __attribute__((aligned(64))) thread_local volatile struct rseq __cfaabi_rseq;
     90                extern __attribute__((aligned(64))) __thread volatile struct rseq __cfaabi_rseq;
    9191        }
    9292#else
     
    161161// Blocking acquire
    162162static inline void __atomic_acquire(volatile bool * ll) {
     163        /* paranoid */ verify( ! __preemption_enabled() );
     164        /* paranoid */ verify(ll);
     165
    163166        while( __builtin_expect(__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST), false) ) {
    164167                while(__atomic_load_n(ll, (int)__ATOMIC_RELAXED))
     
    166169        }
    167170        /* paranoid */ verify(*ll);
     171        /* paranoid */ verify( ! __preemption_enabled() );
    168172}
    169173
    170174// Non-Blocking acquire
    171175static inline bool __atomic_try_acquire(volatile bool * ll) {
     176        /* paranoid */ verify( ! __preemption_enabled() );
     177        /* paranoid */ verify(ll);
     178
    172179        return !__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST);
    173180}
     
    175182// Release
    176183static inline void __atomic_unlock(volatile bool * ll) {
     184        /* paranoid */ verify( ! __preemption_enabled() );
     185        /* paranoid */ verify(ll);
    177186        /* paranoid */ verify(*ll);
    178187        __atomic_store_n(ll, (bool)false, __ATOMIC_RELEASE);
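
For context, the lock these new asserts guard is a test-and-test-and-set spinlock: the inner relaxed load spins locally in cache, and only the exchange performs the expensive read-modify-write. Its shape, minus the runtime checks, in plain GCC builtins:

	static inline void spin_acquire(volatile bool * ll) {
		while (__builtin_expect(__atomic_exchange_n(ll, true, __ATOMIC_SEQ_CST), false)) {
			while (__atomic_load_n(ll, __ATOMIC_RELAXED)) {}  // spin on the cached value
		}
	}

	static inline bool spin_try_acquire(volatile bool * ll) {
		return !__atomic_exchange_n(ll, true, __ATOMIC_SEQ_CST);
	}

	static inline void spin_release(volatile bool * ll) {
		__atomic_store_n(ll, false, __ATOMIC_RELEASE);  // release pairs with the acquiring exchange
	}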
  • libcfa/src/concurrency/kernel/startup.cfa

    rebf8ca5 r23a08aa0  
    133133//-----------------------------------------------------------------------------
    134134// Global state
    135 thread_local struct KernelThreadData __cfaabi_tls __attribute__ ((tls_model ( "initial-exec" ))) @= {
     135__thread struct KernelThreadData __cfaabi_tls __attribute__ ((tls_model ( "initial-exec" ))) @= {
    136136        NULL,                                                                                           // cannot use 0p
    137137        NULL,
     
    153153#elif defined(CFA_HAVE_LINUX_RSEQ_H)
    154154        extern "Cforall" {
    155                 __attribute__((aligned(64))) thread_local volatile struct rseq __cfaabi_rseq @= {
     155                __attribute__((aligned(64))) __thread volatile struct rseq __cfaabi_rseq @= {
    156156                        .cpu_id : RSEQ_CPU_ID_UNINITIALIZED,
    157157                };
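
The storage-class change above swaps C11's `_Thread_local` for GCC's `__thread`; both give one instance per kernel thread. The `initial-exec` TLS model, kept by this hunk, fixes the variable's offset from the thread pointer at load time, so access avoids a `__tls_get_addr` call. In brief:

	__thread int counter_gnu;                                             // GCC/Clang extension
	thread_local int counter_std;                                         // standard spelling (C++11; C11 uses _Thread_local)
	__thread int fast_counter __attribute__((tls_model("initial-exec"))); // offset fixed at load time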
  • libcfa/src/concurrency/preemption.cfa

    rebf8ca5 r23a08aa0  
    238238//----------
    239239// special case for preemption since used often
    240 __attribute__((optimize("no-reorder-blocks"))) bool __preemption_enabled() libcfa_public {
     240__attribute__((optimize("no-reorder-blocks"))) bool __preemption_enabled() libcfa_nopreempt libcfa_public {
    241241        // create a assembler label before
    242242        // marked as clobber all to avoid movement
     
    272272}
    273273
     274extern "C" {
     275        __attribute__((visibility("hidden"))) extern void * const __start_cfatext_nopreempt;
     276        __attribute__((visibility("hidden"))) extern void * const __stop_cfatext_nopreempt;
     277
     278        extern const __cfa_nopreempt_region __libcfa_nopreempt;
     279        __attribute__((visibility("protected"))) const __cfa_nopreempt_region __libcfathrd_nopreempt @= {
     280                (void * const)&__start_cfatext_nopreempt,
     281                (void * const)&__stop_cfatext_nopreempt
     282        };
     283}
     284
     285static inline bool __cfaabi_in( void * const ip, const struct __cfa_nopreempt_region & const region ) {
     286        return ip >= region.start && ip <= region.stop;
     287}
     288
    274289
    275290//----------
    276291// Get data from the TLS block
    277292// struct asm_region __cfaasm_get;
    278 uintptr_t __cfatls_get( unsigned long int offset ) __attribute__((__noinline__, visibility("default"))); //no inline to avoid problems
     293uintptr_t __cfatls_get( unsigned long int offset ) libcfa_nopreempt libcfa_public; //no inline to avoid problems
    279294uintptr_t __cfatls_get( unsigned long int offset ) {
    280295        // create a assembler label before
     
    295310extern "C" {
    296311        // Disable interrupts by incrementing the counter
    297         __attribute__((__noinline__, visibility("default"))) void disable_interrupts() libcfa_public {
     312        void disable_interrupts() libcfa_nopreempt libcfa_public {
    298313                // create a assembler label before
    299314                // marked as clobber all to avoid movement
     
    326341        // Enable interrupts by decrementing the counter
    327342        // If counter reaches 0, execute any pending __cfactx_switch
    328         void enable_interrupts( bool poll ) libcfa_public {
     343        void enable_interrupts( bool poll ) libcfa_nopreempt libcfa_public {
    329344                // Cache the processor now since interrupts can start happening after the atomic store
    330345                processor   * proc = __cfaabi_tls.this_processor;
     
    358373                }
    359374        }
     375
      376        // Check whether there is pending preemption
      377        // force_yield( __POLL_PREEMPTION ) if appropriate
      378        // return true if the thread was in an interruptible state
     379        // i.e. on a real processor and not in the kernel
     380        // (can return true even if no preemption was pending)
     381        bool poll_interrupts() libcfa_public {
     382                // Cache the processor now since interrupts can start happening after the atomic store
     383                processor   * proc = publicTLS_get( this_processor );
     384                if ( ! proc ) return false;
     385                if ( ! __preemption_enabled() ) return false;
     386
     387                with( __cfaabi_tls.preemption_state ){
     388                        // Signal the compiler that a fence is needed but only for signal handlers
     389                        __atomic_signal_fence(__ATOMIC_RELEASE);
     390                        if( proc->pending_preemption ) {
     391                                proc->pending_preemption = false;
     392                                force_yield( __POLL_PREEMPTION );
     393                        }
     394                }
     395
     396                return true;
     397        }
    360398}
    361399
     
    463501
    464502//-----------------------------------------------------------------------------
    465 // Some assembly required
    466 #if defined( __i386 )
    467         #ifdef __PIC__
    468                 #define RELOC_PRELUDE( label ) \
    469                         "calll   .Lcfaasm_prelude_" #label "$pb\n\t" \
    470                         ".Lcfaasm_prelude_" #label "$pb:\n\t" \
    471                         "popl    %%eax\n\t" \
    472                         ".Lcfaasm_prelude_" #label "_end:\n\t" \
    473                         "addl    $_GLOBAL_OFFSET_TABLE_+(.Lcfaasm_prelude_" #label "_end-.Lcfaasm_prelude_" #label "$pb), %%eax\n\t"
    474                 #define RELOC_PREFIX ""
    475                 #define RELOC_SUFFIX "@GOT(%%eax)"
    476         #else
    477                 #define RELOC_PREFIX "$"
    478                 #define RELOC_SUFFIX ""
    479         #endif
    480         #define __cfaasm_label( label ) struct asm_region label = \
    481                 ({ \
    482                         struct asm_region region; \
    483                         asm( \
    484                                 RELOC_PRELUDE( label ) \
    485                                 "movl " RELOC_PREFIX "__cfaasm_" #label "_before" RELOC_SUFFIX ", %[vb]\n\t" \
    486                                 "movl " RELOC_PREFIX "__cfaasm_" #label "_after"  RELOC_SUFFIX ", %[va]\n\t" \
    487                                  : [vb]"=r"(region.before), [va]"=r"(region.after) \
    488                         ); \
    489                         region; \
    490                 });
    491 #elif defined( __x86_64 )
    492         #ifdef __PIC__
    493                 #define RELOC_PREFIX ""
    494                 #define RELOC_SUFFIX "@GOTPCREL(%%rip)"
    495         #else
    496                 #define RELOC_PREFIX "$"
    497                 #define RELOC_SUFFIX ""
    498         #endif
    499         #define __cfaasm_label( label ) struct asm_region label = \
    500                 ({ \
    501                         struct asm_region region; \
    502                         asm( \
    503                                 "movq " RELOC_PREFIX "__cfaasm_" #label "_before" RELOC_SUFFIX ", %[vb]\n\t" \
    504                                 "movq " RELOC_PREFIX "__cfaasm_" #label "_after"  RELOC_SUFFIX ", %[va]\n\t" \
    505                                  : [vb]"=r"(region.before), [va]"=r"(region.after) \
    506                         ); \
    507                         region; \
    508                 });
    509 #elif defined( __aarch64__ )
    510         #ifdef __PIC__
    511                 // Note that this works only for gcc
    512                 #define __cfaasm_label( label ) struct asm_region label = \
    513                 ({ \
    514                         struct asm_region region; \
    515                         asm( \
    516                                 "adrp %[vb], _GLOBAL_OFFSET_TABLE_"                              "\n\t" \
    517                                 "ldr  %[vb], [%[vb], #:gotpage_lo15:__cfaasm_" #label "_before]" "\n\t" \
    518                                 "adrp %[va], _GLOBAL_OFFSET_TABLE_"                              "\n\t" \
    519                                 "ldr  %[va], [%[va], #:gotpage_lo15:__cfaasm_" #label "_after]"  "\n\t" \
    520                                  : [vb]"=r"(region.before), [va]"=r"(region.after) \
    521                         ); \
    522                         region; \
    523                 });
    524         #else
    525                 #error this is not the right thing to do
    526                 /*
    527                 #define __cfaasm_label( label ) struct asm_region label = \
    528                 ({ \
    529                         struct asm_region region; \
    530                         asm( \
    531                                 "adrp %[vb], __cfaasm_" #label "_before"              "\n\t" \
    532                                 "add  %[vb], %[vb], :lo12:__cfaasm_" #label "_before" "\n\t" \
    533                                 "adrp %[va], :got:__cfaasm_" #label "_after"          "\n\t" \
    534                                 "add  %[va], %[va], :lo12:__cfaasm_" #label "_after"  "\n\t" \
    535                                  : [vb]"=r"(region.before), [va]"=r"(region.after) \
    536                         ); \
    537                         region; \
    538                 });
    539                 */
    540         #endif
    541 #else
    542         #error unknown hardware architecture
    543 #endif
    544 
    545503// KERNEL ONLY
    546504// Check if a __cfactx_switch signal handler should defer
     
    548506// If false : preemption is unsafe and marked as pending
    549507static inline bool preemption_ready( void * ip ) {
    550         // Get all the region for which it is not safe to preempt
    551         __cfaasm_label( get    );
    552         __cfaasm_label( check  );
    553         __cfaasm_label( dsable );
    554         // __cfaasm_label( debug  );
    555 
    556508        // Check if preemption is safe
    557509        bool ready = true;
    558         if( __cfaasm_in( ip, get    ) ) { ready = false; goto EXIT; };
    559         if( __cfaasm_in( ip, check  ) ) { ready = false; goto EXIT; };
    560         if( __cfaasm_in( ip, dsable ) ) { ready = false; goto EXIT; };
    561         // if( __cfaasm_in( ip, debug  ) ) { ready = false; goto EXIT; };
     510        if( __cfaabi_in( ip, __libcfa_nopreempt ) ) { ready = false; goto EXIT; };
     511        if( __cfaabi_in( ip, __libcfathrd_nopreempt ) ) { ready = false; goto EXIT; };
     512
    562513        if( !__cfaabi_tls.preemption_state.enabled) { ready = false; goto EXIT; };
    563514        if( __cfaabi_tls.preemption_state.in_progress ) { ready = false; goto EXIT; };
     
    643594// Kernel Signal Handlers
    644595//=============================================================================================
    645 __cfaabi_dbg_debug_do( static thread_local void * last_interrupt = 0; )
     596__cfaabi_dbg_debug_do( static __thread void * last_interrupt = 0; )
    646597
    647598// Context switch signal handler
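
The `__start_cfatext_nopreempt`/`__stop_cfatext_nopreempt` symbols this file now relies on come for free: for any section whose name is a valid C identifier, GNU ld synthesizes `__start_<name>`/`__stop_<name>` symbols bracketing it. A standalone sketch of the same instruction-pointer test, with a hypothetical section name:

	#include <stdio.h>

	__attribute__((section("my_nopreempt"), noinline))
	static int critical(int x) { return x + 1; }

	__attribute__((noinline))
	static int ordinary(int x) { return x - 1; }

	extern "C" const char __start_my_nopreempt[];  // synthesized by GNU ld
	extern "C" const char __stop_my_nopreempt[];

	static bool in_region(const void * ip) {
		return ip >= (const void *)__start_my_nopreempt
		    && ip <  (const void *)__stop_my_nopreempt;
	}

	int main() {
		printf("critical in region: %d\n", in_region((const void *)&critical));  // 1
		printf("ordinary in region: %d\n", in_region((const void *)&ordinary));  // 0
		return critical(0) + ordinary(2) - 2;  // keep both functions live
	}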
  • libcfa/src/startup.cfa

    rebf8ca5 r23a08aa0  
    4141        } // __cfaabi_appready_shutdown
    4242
    43         void disable_interrupts() __attribute__(( weak )) libcfa_public {}
    44         void enable_interrupts() __attribute__(( weak )) libcfa_public {}
     43        void disable_interrupts() __attribute__(( weak )) libcfa_nopreempt libcfa_public {}
     44        void enable_interrupts() __attribute__(( weak )) libcfa_nopreempt libcfa_public {}
     45        bool poll_interrupts() __attribute__(( weak )) libcfa_nopreempt libcfa_public { return false; }
     46
     47        __attribute__((visibility("hidden"))) extern void * const __start_cfatext_nopreempt;
     48        __attribute__((visibility("hidden"))) extern void * const __stop_cfatext_nopreempt;
     49
     50        __attribute__((visibility("protected"))) const __cfa_nopreempt_region __libcfa_nopreempt @= {
     51                (void * const)&__start_cfatext_nopreempt,
     52                (void * const)&__stop_cfatext_nopreempt
     53        };
    4554
    4655
  • src/AST/Decl.hpp

    rebf8ca5 r23a08aa0  
    217217
    218218        /// convenience accessor to match Type::isComplete()
    219         bool isComplete() { return sized; }
     219        bool isComplete() const { return sized; }
    220220
    221221        const Decl * accept( Visitor & v ) const override { return v.visit( this ); }
  • src/AST/DeclReplacer.cpp

    rebf8ca5 r23a08aa0  
    99// Author           : Aaron B. Moss
    1010// Created On       : Wed May 8 13:00:00 2019
    11 // Last Modified By : Aaron B. Moss
    12 // Last Modified On : Wed May 8 13:00:00 2019
    13 // Update Count     : 1
     11// Last Modified By : Andrew Beach
     12// Last Modified On : Thr Sep 15 11:55:00 2022
     13// Update Count     : 2
    1414//
    1515
    1616#include "DeclReplacer.hpp"
     17
    1718#include "Expr.hpp"
     19#include "Pass.hpp"
    1820#include "Type.hpp"
    19 
    20 #include "Pass.hpp"
    2121
    2222namespace ast {
    2323
    2424namespace DeclReplacer {
    25         namespace {
    26                 struct DeclReplacer {
    27                 private:
    28                         const DeclMap & declMap;
    29                         const TypeMap & typeMap;
    30                         bool debug;
    3125
    32                 public:
    33                         DeclReplacer(const DeclMap & declMap, const TypeMap & typeMap, bool debug)
    34                                 : declMap( declMap ), typeMap( typeMap ), debug( debug )
    35                         {}
     26namespace {
     27        struct DeclReplacer {
     28        private:
     29                const DeclMap & declMap;
     30                const TypeMap & typeMap;
     31                bool debug;
    3632
    37                         const ast::VariableExpr * previsit( const ast::VariableExpr * );
    38                         const ast::TypeInstType * previsit( const ast::TypeInstType * );
    39                 };
     33        public:
     34                DeclReplacer( const DeclMap & declMap, const TypeMap & typeMap, bool debug )
     35                        : declMap( declMap ), typeMap( typeMap ), debug( debug )
     36                {}
    4037
    41                 struct VarExprReplacer {
    42                 private:
    43                         const ExprMap & exprMap;
    44                        
    45                 public:
    46                         VarExprReplacer(const ExprMap & exprMap): exprMap (exprMap) {}
     38                const ast::VariableExpr * previsit( const ast::VariableExpr * );
     39                const ast::TypeInstType * previsit( const ast::TypeInstType * );
     40        };
    4741
    48                         const Expr * postvisit (const VariableExpr *);
    49                 };
     42        struct VarExprReplacer {
     43        private:
     44                const ExprMap & exprMap;
     45
     46        public:
     47                VarExprReplacer( const ExprMap & exprMap ) : exprMap( exprMap ) {}
     48
     49                const Expr * postvisit( const VariableExpr * );
     50        };
     51} // namespace
     52
     53const ast::Node * replace( const ast::Node * node, const DeclMap & declMap, const TypeMap & typeMap, bool debug ) {
     54        if(!node) return nullptr;
     55        Pass<DeclReplacer> replacer = { declMap, typeMap, debug };
     56        return node->accept( replacer );
     57}
     58
     59const ast::Node * replace( const ast::Node * node, const DeclMap & declMap, bool debug ) {
     60        TypeMap typeMap;
     61        return replace( node, declMap, typeMap, debug );
     62}
     63
     64const ast::Node * replace( const ast::Node * node, const TypeMap & typeMap, bool debug ) {
     65        DeclMap declMap;
     66        return replace( node, declMap, typeMap, debug );
     67}
     68
     69const ast::Node * replace( const ast::Node * node, const ExprMap & exprMap ) {
     70        Pass<VarExprReplacer> replacer = {exprMap};
     71        return node->accept( replacer );
     72}
     73
     74namespace {
     75        // replace variable with new node from decl map
     76        const ast::VariableExpr * DeclReplacer::previsit( const VariableExpr * varExpr ) {
     77                // xxx - assertions and parameters aren't accounted for in this... (i.e. they aren't inserted into the map when it's made, only DeclStmts are)
     78                if ( !declMap.count( varExpr->var ) ) return varExpr;
     79
     80                auto replacement = declMap.at( varExpr->var );
     81                if ( debug ) {
     82                        std::cerr << "replacing variable reference: "
     83                                << (void*)varExpr->var.get() << " " << varExpr->var
     84                                << " with " << (void*)replacement << " " << replacement
     85                                << std::endl;
     86                }
     87                auto nexpr = mutate(varExpr);
     88                nexpr->var = replacement;
     89                return nexpr;
    5090        }
    5191
    52         const ast::Node * replace( const ast::Node * node, const DeclMap & declMap, const TypeMap & typeMap, bool debug ) {
    53                 if(!node) return nullptr;
    54                 Pass<DeclReplacer> replacer = { declMap, typeMap, debug };
    55                 return node->accept( replacer );
     92        const TypeInstType * DeclReplacer::previsit( const TypeInstType * inst ) {
     93                if ( !typeMap.count( inst->base ) ) return inst;
     94
     95                auto replacement = typeMap.at( inst->base );
     96                if ( debug ) {
     97                        std::cerr << "replacing type reference: "
     98                                << (void*)inst->base.get() << " " << inst->base
     99                                << " with " << (void*)replacement << " " << replacement
     100                                << std::endl;
     101                }
     102                auto ninst = mutate(inst);
     103                ninst->base = replacement;
     104                return ninst;
    56105        }
    57106
    58         const ast::Node * replace( const ast::Node * node, const DeclMap & declMap, bool debug ) {
    59                 TypeMap typeMap;
    60                 return replace( node, declMap, typeMap, debug );
     107        const Expr * VarExprReplacer::postvisit( const VariableExpr * expr ) {
     108                if ( !exprMap.count( expr->var ) ) return expr;
     109                return exprMap.at( expr->var );
    61110        }
     111} // namespace
    62112
    63         const ast::Node * replace( const ast::Node * node, const TypeMap & typeMap, bool debug ) {
    64                 DeclMap declMap;
    65                 return replace( node, declMap, typeMap, debug );
    66         }
     113} // namespace DeclReplacer
    67114
    68         const ast::Node * replace( const ast::Node * node, const ExprMap & exprMap) {
    69                 Pass<VarExprReplacer> replacer = {exprMap};
    70                 return node->accept( replacer );
    71         }
    72 
    73         namespace {
    74                 // replace variable with new node from decl map
    75                 const ast::VariableExpr * DeclReplacer::previsit( const VariableExpr * varExpr ) {
    76                         // xxx - assertions and parameters aren't accounted for in this... (i.e. they aren't inserted into the map when it's made, only DeclStmts are)
    77                         if ( !declMap.count( varExpr->var ) ) return varExpr;
    78 
    79                         auto replacement = declMap.at( varExpr->var );
    80                         if ( debug ) {
    81                                 std::cerr << "replacing variable reference: "
    82                                         << (void*)varExpr->var.get() << " " << varExpr->var
    83                                         << " with " << (void*)replacement << " " << replacement
    84                                         << std::endl;
    85                         }
    86                         auto nexpr = mutate(varExpr);
    87                         nexpr->var = replacement;
    88                         return nexpr;
    89                 }
    90 
    91                 const TypeInstType * DeclReplacer::previsit( const TypeInstType * inst ) {
    92                         if ( !typeMap.count( inst->base ) ) return inst;
    93 
    94                         auto replacement = typeMap.at( inst->base );
    95                         if ( debug ) {
    96                                 std::cerr << "replacing type reference: "
    97                                         << (void*)inst->base.get() << " " << inst->base
    98                                         << " with " << (void*)replacement << " " << replacement
    99                                         << std::endl;
    100                         }
    101                         auto ninst = mutate(inst);
    102                         ninst->base = replacement;
    103                         return ninst;
    104                 }
    105 
    106                 const Expr * VarExprReplacer::postvisit( const VariableExpr * expr ) {
    107                         if (!exprMap.count(expr->var)) return expr;
    108 
    109                         return exprMap.at(expr->var);
    110                 }
    111 
    112         }
    113 }
    114 
    115 }
     115} // namespace ast
    116116
    117117// Local Variables: //
  • src/AST/Pass.hpp

    rebf8ca5 r23a08aa0  
    327327struct PureVisitor {};
    328328
     329struct WithCodeLocation {
     330        const CodeLocation * location = nullptr;
     331};
     332
    329333/// Keep track of the polymorphic const TypeSubstitution * typeSubs for the current expression.
    330334struct WithConstTypeSubstitution {
  • src/AST/Pass.impl.hpp

    rebf8ca5 r23a08aa0  
    2525#define VISIT_START( node ) \
    2626        using namespace ast; \
     27        /* back-up the last known code location */ \
     28        __attribute__((unused)) auto loc_guard = ast::__pass::make_location_guard( core, node, 0 ); \
    2729        /* back-up the visit children */ \
    2830        __attribute__((unused)) ast::__pass::visit_children_guard guard1( ast::__pass::visit_children(core, 0) ); \
  • src/AST/Pass.proto.hpp

    rebf8ca5 r23a08aa0  
    326326        }
    327327
     328        template< typename core_t, typename node_t >
     329        static auto make_location_guard( core_t & core, node_t * node, int )
     330                        -> decltype( node->location, ValueGuardPtr<const CodeLocation *>( &core.location ) ) {
     331                ValueGuardPtr<const CodeLocation *> guard( &core.location );
     332                core.location = &node->location;
     333                return guard;
     334        }
     335
     336        template< typename core_t, typename node_t >
     337        static auto make_location_guard( core_t &, node_t *, long ) -> int {
     338                return 0;
     339        }
     340
    328341        // Another feature of the templated visitor is that it calls beginScope()/endScope() for compound statement.
    329342        // All passes which have such functions are assumed to desire this behaviour
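
The `int`/`long` pair added above is standard expression-SFINAE tag dispatch: both overloads are called with the literal `0`; the `int` version is the better match but is only viable when `node->location` exists, otherwise overload resolution falls back to the `long` version. The trick in isolation:

	#include <iostream>

	struct WithLoc { int location = 42; };
	struct NoLoc   {};

	template< typename node_t >
	static auto get_location( node_t * node, int )
			-> decltype( node->location ) {  // viable only if the member exists
		return node->location;
	}

	template< typename node_t >
	static int get_location( node_t *, long ) {      // fallback, worse match for 0
		return -1;
	}

	int main() {
		WithLoc w; NoLoc n;
		std::cout << get_location( &w, 0 ) << '\n';  // 42: int overload wins
		std::cout << get_location( &n, 0 ) << '\n';  // -1: SFINAE removes the int overload
	}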
  • src/AST/Print.cpp

    rebf8ca5 r23a08aa0  
    3333{
    3434        return array<C,sizeof...(T)>{
    35                 forward<T>(values)...
     35                std::forward<T>(values)...
    3636        };
    3737}
     
    8686
    8787                static constexpr auto StorageClasses = make_array<const char*>(
    88                         "extern", "static", "auto", "register", "_Thread_local"
     88                        "extern", "static", "auto", "register", "__thread", "_Thread_local"
    8989                );
    9090
     
    215215                        ++indent;
    216216                        ptrToEnum->base->accept( *this );
    217                         --indent; 
     217                        --indent;
    218218                }
    219219
     
    16231623// if the wrong size is specified
    16241624constexpr array<const char*, 3> Printer::Names::FuncSpecifiers;
    1625 constexpr array<const char*, 5> Printer::Names::StorageClasses;
     1625constexpr array<const char*, 6> Printer::Names::StorageClasses;
    16261626constexpr array<const char*, 6> Printer::Names::Qualifiers;
    16271627}
  • src/AST/StorageClasses.hpp

    rebf8ca5 r23a08aa0  
    2424        /// Bitflags for storage classes
    2525        enum {
    26                 Extern      = 1 << 0,
    27                 Static      = 1 << 1,
    28                 Auto        = 1 << 2,
    29                 Register    = 1 << 3,
    30                 ThreadLocal = 1 << 4,
    31                 NumClasses       = 5
     26                Extern         = 1 << 0,
     27                Static         = 1 << 1,
     28                Auto           = 1 << 2,
     29                Register       = 1 << 3,
     30                ThreadLocalGcc = 1 << 4,
     31                ThreadLocalC11 = 1 << 5,
     32                NumClasses          = 6
    3233        };
    3334
     
    3738                        unsigned int val;
    3839                        struct {
    39                                 bool is_extern      : 1;
    40                                 bool is_static      : 1;
    41                                 bool is_auto        : 1;
    42                                 bool is_register    : 1;
    43                                 bool is_threadlocal : 1;
     40                                bool is_extern         : 1;
     41                                bool is_static         : 1;
     42                                bool is_auto           : 1;
     43                                bool is_register       : 1;
     44                                bool is_threadlocalGcc : 1;
     45                                bool is_threadlocalC11 : 1;
    4446                        };
    4547
     
    4850
    4951                constexpr class_flags( unsigned int val = 0 ) : val(val) {}
     52
     53                bool is_threadlocal_any() { return this->is_threadlocalC11 || this->is_threadlocalGcc; }
    5054        };
    5155
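
The union above relies on the widely supported anonymous-struct extension: the same word is readable as `val` for bulk operations and as named bits for queries. The new helper in use, as a compilable sketch of just the relevant members:

	#include <cassert>

	enum { ThreadLocalGcc = 1 << 4, ThreadLocalC11 = 1 << 5 };

	union class_flags {
		unsigned int val;
		struct {
			bool is_extern         : 1;
			bool is_static         : 1;
			bool is_auto           : 1;
			bool is_register       : 1;
			bool is_threadlocalGcc : 1;
			bool is_threadlocalC11 : 1;
		};
		constexpr class_flags( unsigned int val = 0 ) : val(val) {}
		bool is_threadlocal_any() { return is_threadlocalC11 || is_threadlocalGcc; }
	};

	int main() {
		class_flags f( ThreadLocalGcc );
		assert( f.is_threadlocal_any() && !f.is_extern );  // either TLS bit satisfies the query
	}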
  • src/AST/Type.cpp

    rebf8ca5 r23a08aa0  
    143143TraitInstType::TraitInstType(
    144144        const TraitDecl * b, CV::Qualifiers q, std::vector<ptr<Attribute>>&& as )
    145 : BaseInstType( b->name, q, move(as) ), base( b ) {}
     145: BaseInstType( b->name, q, std::move(as) ), base( b ) {}
    146146
    147147// --- TypeInstType
     
    149149TypeInstType::TypeInstType( const TypeDecl * b,
    150150        CV::Qualifiers q, std::vector<ptr<Attribute>> && as )
    151 : BaseInstType( b->name, q, move(as) ), base( b ), kind( b->kind ) {}
     151: BaseInstType( b->name, q, std::move(as) ), base( b ), kind( b->kind ) {}
    152152
    153153void TypeInstType::set_base( const TypeDecl * b ) {
     
    161161
    162162TupleType::TupleType( std::vector<ptr<Type>> && ts, CV::Qualifiers q )
    163 : Type( q ), types( move(ts) ), members() {
     163: Type( q ), types( std::move(ts) ), members() {
    164164        // This constructor is awkward. `TupleType` needs to contain objects so that members can be
    165165        // named, but members without initializer nodes end up getting constructors, which breaks
  • src/AST/Type.hpp

    rebf8ca5 r23a08aa0  
    8383template< enum Node::ref_type ref_t >
    8484void reset_qualifiers( ptr_base< Type, ref_t > & p, CV::Qualifiers q = {} ) {
    85         if ( p->qualifiers.val != q.val ) p.get_and_mutate()->qualifiers = q;
     85        if ( p->qualifiers != q ) p.get_and_mutate()->qualifiers = q;
    8686}
    8787
     
    8989template< enum Node::ref_type ref_t >
    9090void add_qualifiers( ptr_base< Type, ref_t > & p, CV::Qualifiers q ) {
    91         if ( ( p->qualifiers.val & q.val ) != q.val ) p.get_and_mutate()->qualifiers |= q;
     91        if ( ( p->qualifiers & q ) != q ) p.get_and_mutate()->qualifiers |= q;
    9292}
    9393
     
    9595template< enum Node::ref_type ref_t >
    9696void remove_qualifiers( ptr_base< Type, ref_t > & p, CV::Qualifiers q ) {
    97         if ( ( p->qualifiers.val & q.val ) != 0 ) p.get_and_mutate()->qualifiers -= q;
     97        if ( ( p->qualifiers & q ) != 0 ) p.get_and_mutate()->qualifiers -= q;
    9898}
    9999
     
    412412                std::string typeString() const { return std::string("_") + std::to_string(formal_usage) + "_" + std::to_string(expr_id) + "_" + base->name; }
    413413                bool operator==(const TypeEnvKey & other) const { return base == other.base && formal_usage == other.formal_usage && expr_id == other.expr_id; }
    414 
    415414        };
    416415
  • src/CodeGen/CodeGenerator.cc

    rebf8ca5 r23a08aa0  
    493493                                        assert( false );
    494494                                } // switch
     495                        } else if( varExpr->get_var()->get_linkage() == LinkageSpec::BuiltinCFA && varExpr->get_var()->get_name() == "intptr" ) {
     496                                // THIS is a hack to make it a constant until a proper constexpr solution is created
     497                                output << "((void*)";
     498                                std::list< Expression* >::iterator arg = applicationExpr->get_args().begin();
     499                                (*arg++)->accept( *visitor );
     500                                output << ")";
    495501                        } else {
    496502                                varExpr->accept( *visitor );
  • src/Common/utility.h

    rebf8ca5 r23a08aa0  
    322322
    323323        ValueGuardPtr(T * inRef) : old( inRef ? *inRef : T() ), ref(inRef) {}
     324        ValueGuardPtr(const ValueGuardPtr& other) = delete;
     325        ValueGuardPtr(ValueGuardPtr&& other) : old(other.old), ref(other.ref) { other.ref = nullptr; }
    324326        ~ValueGuardPtr() { if( ref ) *ref = old; }
    325327};
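
What the new move constructor enables: `ValueGuardPtr` saves a value on construction and restores it on destruction, and nulling `ref` in the moved-from object keeps a guard returned from a factory (as `make_location_guard` does elsewhere in this changeset) from restoring twice. A minimal usage sketch:

	#include <iostream>

	template< typename T >
	struct ValueGuardPtr {
		T old;
		T * ref;

		ValueGuardPtr( T * inRef ) : old( inRef ? *inRef : T() ), ref( inRef ) {}
		ValueGuardPtr( const ValueGuardPtr & ) = delete;
		ValueGuardPtr( ValueGuardPtr && other ) : old( other.old ), ref( other.ref ) { other.ref = nullptr; }
		~ValueGuardPtr() { if ( ref ) *ref = old; }
	};

	int main() {
		int depth = 0;
		{
			ValueGuardPtr<int> guard( &depth );
			depth = 5;              // scoped change
		}                               // guard restores depth here
		std::cout << depth << '\n';     // prints 0
	}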
  • src/CompilationState.cc

    rebf8ca5 r23a08aa0  
    3131        genproto = false,
    3232        deterministic_output = false,
    33         useNewAST = CFA_USE_NEW_AST,
     33        useNewAST = true,
    3434        nomainp = false,
    3535        parsep = false,
  • src/Concurrency/Keywords.cc

    rebf8ca5 r23a08aa0  
    508508                ObjectDecl * vtable_object = Virtual::makeVtableForward(
    509509                        "_default_vtable_object_declaration",
    510                         vtable_decl->makeInst( move( poly_args ) ) );
     510                        vtable_decl->makeInst( std::move( poly_args ) ) );
    511511                declsToAddBefore.push_back( vtable_object );
    512512                declsToAddBefore.push_back(
     
    681681                        void lock (monitor_t & this) {
    682682                                lock(get_monitor(this));
    683                         }       
     683                        }
    684684                */
    685685                FunctionDecl * lock_decl = new FunctionDecl(
     
    700700                CompoundStmt * lock_statement = new CompoundStmt();
    701701                lock_statement->push_back(
    702                         new ExprStmt( 
     702                        new ExprStmt(
    703703                                new UntypedExpr (
    704704                                        new NameExpr( "lock" ),
     
    716716                        void unlock (monitor_t & this) {
    717717                                unlock(get_monitor(this));
    718                         }       
     718                        }
    719719                */
    720720                FunctionDecl * unlock_decl = new FunctionDecl(
     
    736736
    737737                unlock_statement->push_back(
    738                         new ExprStmt( 
     738                        new ExprStmt(
    739739                                new UntypedExpr(
    740740                                        new NameExpr( "unlock" ),
     
    746746                );
    747747                unlock_decl->set_statements( unlock_statement );
    748                
     748
    749749                // pushes routines to declsToAddAfter to add at a later time
    750750                declsToAddAfter.push_back( lock_decl );
     
    10541054                        assert( !thread_guard_decl );
    10551055                        thread_guard_decl = decl;
    1056                 } 
     1056                }
    10571057                else if ( decl->name == "__mutex_stmt_lock_guard" && decl->body ) {
    10581058                        assert( !lock_guard_decl );
     
    12061206                                                        new NameExpr( "__get_mutexstmt_lock_type" ),
    12071207                                                        { args.front()->clone() }
    1208                                                 ) 
     1208                                                )
    12091209                                        )
    12101210                                ),
     
    12251225
    12261226                StructInstType * lock_guard_struct = new StructInstType( noQualifiers, lock_guard_decl );
    1227                 TypeExpr * lock_type_expr = new TypeExpr( 
     1227                TypeExpr * lock_type_expr = new TypeExpr(
    12281228                        new TypeofType( noQualifiers, new UntypedExpr(
    12291229                                new NameExpr( "__get_mutexstmt_lock_type" ),
    12301230                                { args.front()->clone() }
    1231                                 ) 
    1232                         ) 
     1231                                )
     1232                        )
    12331233                );
    12341234
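
The move( poly_args ) -> std::move( poly_args ) change here (repeated throughout this changeset, e.g. in CandidateFinder.cpp below) qualifies the cast explicitly; presumably the unqualified spelling had been reaching std::move through a using-declaration or ADL, which recent compilers flag (clang's -Wunqualified-std-cast-call) under the C++17 build this changeset switches to. A small compilable sketch of the distinction:

    #include <string>
    #include <utility>
    #include <vector>

    std::vector<std::string> sink( std::vector<std::string> && v ) {
        // An unqualified move( v ) compiles only where lookup happens to find
        // std::move, and recent clang warns on it; the qualified form is
        // unambiguous everywhere.
        return std::move( v );
    }
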
  • src/Concurrency/Waitfor.cc

    rebf8ca5 r23a08aa0  
    402402
    403403                clause.target.function = nullptr;
    404                 clause.target.arguments.empty();
     404                clause.target.arguments.clear();
    405405                clause.condition = nullptr;
    406406        }
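
Despite looking cosmetic, this is a real bug fix: empty() is a const query that reports whether the container holds elements, so the old call discarded its result and left clause.target.arguments untouched; clear() actually erases them. For example:

    #include <cassert>
    #include <list>

    int main() {
        std::list<int> arguments{ 1, 2, 3 };
        arguments.empty();                  // old code: returns false, changes nothing
        assert( arguments.size() == 3 );
        arguments.clear();                  // fixed code: removes all elements
        assert( arguments.empty() );
    }
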
  • src/Concurrency/WaitforNew.cpp

    rebf8ca5 r23a08aa0  
    101101namespace {
    102102
    103 class GenerateWaitForCore :
     103class GenerateWaitForCore final :
    104104                public ast::WithSymbolTable, public ast::WithConstTranslationUnit {
    105105        const ast::FunctionDecl * decl_waitfor    = nullptr;
  • src/ControlStruct/ExceptTranslateNew.cpp

    rebf8ca5 r23a08aa0  
    3232        }
    3333
    34 class TranslateThrowsCore : public ast::WithGuards {
     34class TranslateThrowsCore final : public ast::WithGuards {
    3535        const ast::ObjectDecl * terminateHandlerExcept;
    3636        enum Context { NoHandler, TerHandler, ResHandler } currentContext;
     
    136136
    137137
    138 class TryMutatorCore {
     138class TryMutatorCore final {
    139139        // The built in types used in translation.
    140140        const ast::StructDecl * except_decl;
  • src/ControlStruct/LabelFixer.cc

    rebf8ca5 r23a08aa0  
    119119
    120120// Builds a table that maps a label to its defining statement.
    121 std::map<Label, Statement * > * LabelFixer::resolveJumps() throw ( SemanticErrorException ) {
     121std::map<Label, Statement * > * LabelFixer::resolveJumps() {
    122122        std::map< Label, Statement * > *ret = new std::map< Label, Statement * >();
    123123        for ( std::map< Label, Entry * >::iterator i = labelTable.begin(); i != labelTable.end(); ++i ) {
  • src/ControlStruct/LabelFixer.h

    rebf8ca5 r23a08aa0  
    3333        LabelFixer( LabelGenerator *gen = 0 );
    3434
    35         std::map < Label, Statement * > *resolveJumps() throw ( SemanticErrorException );
     35        std::map < Label, Statement * > *resolveJumps();
    3636
    3737        // Declarations
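
These deletions, with the matching ones in MLEMutator and SynTree/Statement below, are forced by the -std=c++14 -> -std=c++17 switch in src/Makefile.am: dynamic exception specifications were deprecated in C++11 and removed outright in C++17. Dropping the specifier changes nothing at run time; the function may still throw and callers catch exactly as before:

    #include <stdexcept>

    // Accepted (though deprecated) up to C++14, ill-formed in C++17:
    //     void resolve() throw ( std::runtime_error );
    // C++17 spelling; resolve() may still throw std::runtime_error:
    void resolve();
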
  • src/ControlStruct/MLEMutator.cc

    rebf8ca5 r23a08aa0  
    141141
    142142
    143         Statement *MultiLevelExitMutator::postmutate( BranchStmt *branchStmt )
    144                         throw ( SemanticErrorException ) {
     143        Statement *MultiLevelExitMutator::postmutate( BranchStmt *branchStmt ) {
    145144                std::string originalTarget = branchStmt->originalTarget;
    146145
  • src/ControlStruct/MLEMutator.h

    rebf8ca5 r23a08aa0  
    4141
    4242                void premutate( CompoundStmt *cmpndStmt );
    43                 Statement * postmutate( BranchStmt *branchStmt ) throw ( SemanticErrorException );
     43                Statement * postmutate( BranchStmt *branchStmt );
    4444                void premutate( WhileDoStmt *whileDoStmt );
    4545                Statement * postmutate( WhileDoStmt *whileDoStmt );
  • src/GenPoly/GenPoly.cc

    rebf8ca5 r23a08aa0  
    99// Author           : Richard C. Bilson
    1010// Created On       : Mon May 18 07:44:20 2015
    11 // Last Modified By : Peter A. Buhr
    12 // Last Modified On : Wed Jun 29 21:45:53 2016
    13 // Update Count     : 14
     11// Last Modified By : Andrew Beach
     12// Last Modified On : Wed Sep 14  9:24:00 2022
     13// Update Count     : 15
    1414//
    1515
     
    8383                }
    8484
     85                bool hasDynParams( const std::vector<ast::ptr<ast::Expr>> & params, const TyVarMap &tyVars, const ast::TypeSubstitution *typeSubs ) {
     86                        for ( ast::ptr<ast::Expr> const & param : params ) {
     87                                auto paramType = param.as<ast::TypeExpr>();
     88                                assertf( paramType, "Aggregate parameters should be type expressions." );
     89                                if ( isDynType( paramType->type, tyVars, typeSubs ) ) {
     90                                        return true;
     91                                }
     92                        }
     93                        return false;
     94                }
     95
    8596                /// Checks a parameter list for inclusion of polymorphic parameters; will substitute according to env if present
    8697                bool includesPolyParams( std::list< Expression* >& params, const TypeSubstitution *env ) {
     
    198209                }
    199210                return 0;
     211        }
     212
     213        const ast::BaseInstType *isDynType( const ast::Type *type, const TyVarMap &tyVars, const ast::TypeSubstitution *typeSubs ) {
     214                type = replaceTypeInst( type, typeSubs );
     215
     216                if ( auto inst = dynamic_cast<ast::TypeInstType const *>( type ) ) {
     217                        auto var = tyVars.find( inst->name );
     218                        if ( var != tyVars.end() && var->second.isComplete ) {
     219                                return inst;
     220                        }
     221                } else if ( auto inst = dynamic_cast<ast::StructInstType const *>( type ) ) {
     222                        if ( hasDynParams( inst->params, tyVars, typeSubs ) ) {
     223                                return inst;
     224                        }
     225                } else if ( auto inst = dynamic_cast<ast::UnionInstType const *>( type ) ) {
     226                        if ( hasDynParams( inst->params, tyVars, typeSubs ) ) {
     227                                return inst;
     228                        }
     229                }
     230                return nullptr;
    200231        }
    201232
     
    378409                inline D* as( B* p ) { return reinterpret_cast<D*>(p); }
    379410
     411                template<typename D, typename B>
     412                inline D const * as( B const * p ) {
     413                        return reinterpret_cast<D const *>( p );
     414                }
     415
    380416                /// Flattens a declaration list
    381417                template<typename Output>
     
    391427                        for ( Type* ty : src ) {
    392428                                ResolvExpr::flatten( ty, out );
     429                        }
     430                }
     431
     432                void flattenList( vector<ast::ptr<ast::Type>> const & src,
     433                                vector<ast::ptr<ast::Type>> & out ) {
     434                        for ( auto const & type : src ) {
     435                                ResolvExpr::flatten( type, out );
    393436                        }
    394437                }
     
    409452                                // if ( is<VoidType>( aparam->get_type() ) || is<VoidType>( bparam->get_type() ) ) continue;
    410453                                if ( ! typesPolyCompatible( aparam->get_type(), bparam->get_type() ) ) return false;
     454                        }
     455
     456                        return true;
     457                }
     458
     459                bool paramListsPolyCompatible(
     460                                std::vector<ast::ptr<ast::Expr>> const & lparams,
     461                                std::vector<ast::ptr<ast::Expr>> const & rparams ) {
     462                        if ( lparams.size() != rparams.size() ) {
     463                                return false;
     464                        }
     465
     466                        for ( auto lparam = lparams.begin(), rparam = rparams.begin() ;
     467                                        lparam != lparams.end() ; ++lparam, ++rparam ) {
     468                                ast::TypeExpr const * lexpr = lparam->as<ast::TypeExpr>();
     469                                assertf( lexpr, "Aggregate parameters should be type expressions" );
     470                                ast::TypeExpr const * rexpr = rparam->as<ast::TypeExpr>();
     471                                assertf( rexpr, "Aggregate parameters should be type expressions" );
     472
     473                                // xxx - might need to let VoidType be a wildcard here too; could have some voids
     474                                // stuffed in for dtype-statics.
     475                                // if ( is<VoidType>( lexpr->type() ) || is<VoidType>( bparam->get_type() ) ) continue;
     476                                if ( !typesPolyCompatible( lexpr->type, rexpr->type ) ) {
     477                                        return false;
     478                                }
    411479                        }
    412480
     
    505573        }
    506574
     575bool typesPolyCompatible( ast::Type const * lhs, ast::Type const * rhs ) {
     576        type_index const lid = typeid(*lhs);
     577
     578        // Polymorphic types always match:
     579        if ( type_index(typeid(ast::TypeInstType)) == lid ) return true;
     580
     581        type_index const rid = typeid(*rhs);
     582        if ( type_index(typeid(ast::TypeInstType)) == rid ) return true;
     583
     584        // All other types only match if they are the same type:
     585        if ( lid != rid ) return false;
     586
     587        // So remaining types can be examined case by case.
     588        // Recurse through type structure (conditions borrowed from Unify.cc).
     589
     590        if ( type_index(typeid(ast::BasicType)) == lid ) {
     591                return as<ast::BasicType>(lhs)->kind == as<ast::BasicType>(rhs)->kind;
     592        } else if ( type_index(typeid(ast::PointerType)) == lid ) {
     593                ast::PointerType const * l = as<ast::PointerType>(lhs);
     594                ast::PointerType const * r = as<ast::PointerType>(rhs);
     595
     596                // void pointers should match any other pointer type.
     597                return is<ast::VoidType>( l->base.get() )
     598                        || is<ast::VoidType>( r->base.get() )
     599                        || typesPolyCompatible( l->base.get(), r->base.get() );
     600        } else if ( type_index(typeid(ast::ReferenceType)) == lid ) {
     601                ast::ReferenceType const * l = as<ast::ReferenceType>(lhs);
     602                ast::ReferenceType const * r = as<ast::ReferenceType>(rhs);
     603
     604                // void references should match any other reference type.
     605                return is<ast::VoidType>( l->base.get() )
     606                        || is<ast::VoidType>( r->base.get() )
     607                        || typesPolyCompatible( l->base.get(), r->base.get() );
     608        } else if ( type_index(typeid(ast::ArrayType)) == lid ) {
     609                ast::ArrayType const * l = as<ast::ArrayType>(lhs);
     610                ast::ArrayType const * r = as<ast::ArrayType>(rhs);
     611
     612                if ( l->isVarLen ) {
     613                        if ( !r->isVarLen ) return false;
     614                } else {
     615                        if ( r->isVarLen ) return false;
     616
     617                        auto lc = l->dimension.as<ast::ConstantExpr>();
     618                        auto rc = r->dimension.as<ast::ConstantExpr>();
     619                        if ( lc && rc && lc->intValue() != rc->intValue() ) {
     620                                return false;
     621                        }
     622                }
     623
     624                return typesPolyCompatible( l->base.get(), r->base.get() );
     625        } else if ( type_index(typeid(ast::FunctionType)) == lid ) {
     626                ast::FunctionType const * l = as<ast::FunctionType>(lhs);
     627                ast::FunctionType const * r = as<ast::FunctionType>(rhs);
     628
     629                std::vector<ast::ptr<ast::Type>> lparams, rparams;
     630                flattenList( l->params, lparams );
     631                flattenList( r->params, rparams );
     632                if ( lparams.size() != rparams.size() ) return false;
     633                for ( unsigned i = 0; i < lparams.size(); ++i ) {
     634                        if ( !typesPolyCompatible( lparams[i], rparams[i] ) ) return false;
     635                }
     636
     637                std::vector<ast::ptr<ast::Type>> lrets, rrets;
     638                flattenList( l->returns, lrets );
     639                flattenList( r->returns, rrets );
     640                if ( lrets.size() != rrets.size() ) return false;
     641                for ( unsigned i = 0; i < lrets.size(); ++i ) {
     642                        if ( !typesPolyCompatible( lrets[i], rrets[i] ) ) return false;
     643                }
     644                return true;
     645        } else if ( type_index(typeid(ast::StructInstType)) == lid ) {
     646                ast::StructInstType const * l = as<ast::StructInstType>(lhs);
     647                ast::StructInstType const * r = as<ast::StructInstType>(rhs);
     648
     649                if ( l->name != r->name ) return false;
     650                return paramListsPolyCompatible( l->params, r->params );
     651        } else if ( type_index(typeid(ast::UnionInstType)) == lid ) {
     652                ast::UnionInstType const * l = as<ast::UnionInstType>(lhs);
     653                ast::UnionInstType const * r = as<ast::UnionInstType>(rhs);
     654
     655                if ( l->name != r->name ) return false;
     656                return paramListsPolyCompatible( l->params, r->params );
     657        } else if ( type_index(typeid(ast::EnumInstType)) == lid ) {
     658                ast::EnumInstType const * l = as<ast::EnumInstType>(lhs);
     659                ast::EnumInstType const * r = as<ast::EnumInstType>(rhs);
     660
     661                return l->name == r->name;
     662        } else if ( type_index(typeid(ast::TraitInstType)) == lid ) {
     663                ast::TraitInstType const * l = as<ast::TraitInstType>(lhs);
     664                ast::TraitInstType const * r = as<ast::TraitInstType>(rhs);
     665
     666                return l->name == r->name;
     667        } else if ( type_index(typeid(ast::TupleType)) == lid ) {
     668                ast::TupleType const * l = as<ast::TupleType>(lhs);
     669                ast::TupleType const * r = as<ast::TupleType>(rhs);
     670
     671                std::vector<ast::ptr<ast::Type>> ltypes, rtypes;
      672                flattenList( l->types, ltypes );
      673                flattenList( r->types, rtypes );
     674                if ( ltypes.size() != rtypes.size() ) return false;
     675
     676                for ( unsigned i = 0 ; i < ltypes.size() ; ++i ) {
     677                        if ( !typesPolyCompatible( ltypes[i], rtypes[i] ) ) return false;
     678                }
     679                return true;
     680        // The remaining types (VoidType, VarArgsType, ZeroType & OneType)
     681        // have no variation so will always be equal.
     682        } else {
     683                return true;
     684        }
     685}
     686
    507687        namespace {
    508688                // temporary hack to avoid re-implementing anything related to TyVarMap
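
The new typesPolyCompatible overload above compares the dynamic types of the two nodes through std::type_index before downcasting either side, with a TypeInstType on either side acting as a wildcard. A minimal runnable analogue of that dispatch pattern, using hypothetical stand-in node types:

    #include <cassert>
    #include <typeindex>
    #include <typeinfo>

    struct Type     { virtual ~Type() = default; };
    struct Wildcard : Type {};                          // plays the role of ast::TypeInstType
    struct Pointer  : Type { const Type * base = nullptr; };

    bool compatible( const Type * lhs, const Type * rhs ) {
        const std::type_index wild( typeid( Wildcard ) );
        if ( std::type_index( typeid( *lhs ) ) == wild ) return true;  // wildcard matches all
        if ( std::type_index( typeid( *rhs ) ) == wild ) return true;
        // Remaining kinds only match their own kind; the real code then
        // recurses structurally (bases, dimensions, parameter lists).
        return std::type_index( typeid( *lhs ) ) == std::type_index( typeid( *rhs ) );
    }

    int main() {
        Wildcard w;
        Pointer p, q;
        assert( compatible( &w, &p ) && compatible( &p, &w ) );
        assert( compatible( &p, &q ) );
    }
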
  • src/GenPoly/GenPoly.h

    rebf8ca5 r23a08aa0  
    99// Author           : Richard C. Bilson
    1010// Created On       : Mon May 18 07:44:20 2015
    11 // Last Modified By : Peter A. Buhr
    12 // Last Modified On : Sat Jul 22 09:22:57 2017
    13 // Update Count     : 7
     11// Last Modified By : Andrew Beach
     12// Last Modified On : Fri Aug 19 16:03:00 2022
     13// Update Count     : 8
    1414//
    1515
     
    2727namespace GenPoly {
    2828
     29        // TODO Via some tricks this works for ast::TypeDecl::Data as well.
    2930        typedef ErasableScopedMap< std::string, TypeDecl::Data > TyVarMap;
     31
    3032        /// Replaces a TypeInstType by its referrent in the environment, if applicable
    3133        Type* replaceTypeInst( Type* type, const TypeSubstitution* env );
     
     4143        /// returns dynamic-layout type if it is a dynamic-layout type in tyVars, NULL otherwise; will look up substitution in env if provided
    4244        ReferenceToType *isDynType( Type *type, const TyVarMap &tyVars, const TypeSubstitution *env = 0 );
     45        const ast::BaseInstType *isDynType( const ast::Type *type, const TyVarMap &tyVars, const ast::TypeSubstitution *typeSubs = 0 );
    4346
    4447        /// true iff function has dynamic-layout return type under the given type variable map
     
    8386        /// true iff types are structurally identical, where TypeInstType's match any type.
    8487        bool typesPolyCompatible( Type *aty, Type *bty );
     88        bool typesPolyCompatible( ast::Type const * lhs, ast::Type const * rhs );
    8589
    8690        /// true if arg requires boxing given exprTyVars
  • src/GenPoly/InstantiateGeneric.h

    rebf8ca5 r23a08aa0  
    1919
    2020class Declaration;
     21namespace ast {
     22        class TranslationUnit;
     23}
    2124
    2225namespace GenPoly {
    23         /// Replaces all generic types that have static layout with concrete instantiations.
    24         /// Types with concrete values for otype parameters will be template-expanded, while
    25         /// dtype and ftype parameters will be replaced by the appropriate void type.
    26         void instantiateGeneric( std::list< Declaration* > &translationUnit );
     26/// Replaces all generic types that have static layout with concrete
     27/// instantiations. Types with concrete values for otype parameters will be
     28/// template-expanded, while dtype and ftype parameters will be replaced by
     29/// the appropriate void type.
     30void instantiateGeneric( std::list< Declaration* > &translationUnit );
     31void instantiateGeneric( ast::TranslationUnit & translationUnit );
    2732} // namespace GenPoly
    2833
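
The doc comment describes two instantiation strategies. A rough C++ analogue (all names hypothetical) of the otype-versus-dtype split: a concrete otype argument expands a fresh struct per instantiation, much like a template, while opaque dtype data can share one void*-based layout:

    #include <cassert>

    template<typename T>
    struct box { T item; };                 // "otype": expanded per concrete argument

    struct opaque_box { void * item; };     // "dtype": one erased layout fits all pointers

    int main() {
        box<int> b{ 42 };
        opaque_box o{ &b.item };
        assert( *static_cast<int *>( o.item ) == 42 );
    }
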
  • src/GenPoly/Lvalue2.cc

    rebf8ca5 r23a08aa0  
    2323}
    2424
    25 
    2625}
  • src/GenPoly/ScrubTyVars.cc

    rebf8ca5 r23a08aa0  
    99// Author           : Richard C. Bilson
    1010// Created On       : Mon May 18 07:44:20 2015
    11 // Last Modified By : Peter A. Buhr
    12 // Last Modified On : Thu Mar 16 15:44:27 2017
    13 // Update Count     : 3
     11// Last Modified By : Andrew Beach
     12// Last Modified On : Fri Aug 19 16:10:00 2022
     13// Update Count     : 4
    1414//
    1515
    1616#include <utility>                      // for pair
    1717
     18#include "AST/Pass.hpp"
    1819#include "GenPoly.h"                    // for mangleType, TyVarMap, alignof...
    1920#include "GenPoly/ErasableScopedMap.h"  // for ErasableScopedMap<>::const_it...
    2021#include "ScrubTyVars.h"
     22#include "SymTab/Mangler.h"             // for mangle, typeMode
    2123#include "SynTree/Declaration.h"        // for TypeDecl, TypeDecl::Data, Typ...
    2224#include "SynTree/Expression.h"         // for Expression (ptr only), NameExpr
     
    112114                return pointer;
    113115        }
     116
     117namespace {
     118
     119enum class ScrubMode {
     120        FromMap,
     121        DynamicFromMap,
     122        All,
     123};
     124
     125struct ScrubTypeVars :
     126        public ast::WithGuards,
     127        public ast::WithShortCircuiting,
     128        public ast::WithVisitorRef<ScrubTypeVars> {
     129
     130        ScrubTypeVars( ScrubMode m, TyVarMap const * tv ) :
     131                        mode ( m ), typeVars( tv ) {}
     132
     133        void previsit( ast::TypeInstType const * ) { visit_children = false; }
     134        void previsit( ast::StructInstType const * ) { visit_children = false; }
     135        void previsit( ast::UnionInstType const * ) { visit_children = false; }
     136        void previsit( ast::SizeofExpr const * expr ) { primeBaseScrub( expr->type ); }
     137        void previsit( ast::AlignofExpr const * expr ) { primeBaseScrub( expr->type ); }
     138        void previsit( ast::PointerType const * type ) { primeBaseScrub( type->base ); }
     139
     140        ast::Type const * postvisit( ast::TypeInstType const * type );
     141        ast::Type const * postvisit( ast::StructInstType const * type );
     142        ast::Type const * postvisit( ast::UnionInstType const * type );
     143        ast::Expr const * postvisit( ast::SizeofExpr const * expr );
     144        ast::Expr const * postvisit( ast::AlignofExpr const * expr );
     145        ast::Type const * postvisit( ast::PointerType const * type );
     146
     147private:
     148        ScrubMode const mode;
      149        /// Type variables to scrub.
     150        TyVarMap const * const typeVars;
     151        /// Value cached by primeBaseScrub.
     152        ast::Type const * dynType = nullptr;
     153
     154        /// Returns the type if it should be scrubbed, nullptr otherwise.
     155        ast::Type const * shouldScrub( ast::Type const * type ) {
     156                switch ( mode ) {
     157                case ScrubMode::FromMap:
     158                        return isPolyType( type, *typeVars );
     159                case ScrubMode::DynamicFromMap:
     160                        return isDynType( type, *typeVars );
     161                case ScrubMode::All:
     162                        return isPolyType( type );
     163                default:
     164                        assertf( false, "Invalid ScrubMode in shouldScrub." );
     165                        throw;
     166                }
     167        }
     168
     169        void primeBaseScrub( ast::Type const * type ) {
     170                // Need to determine whether type needs to be scrubbed to
     171                // determine whether automatic recursion is necessary.
     172                if ( ast::Type const * t = shouldScrub( type ) ) {
     173                        visit_children = false;
     174                        GuardValue( dynType ) = t;
     175                }
     176        }
     177
     178        ast::Type const * postvisitAggregateType(
     179                        ast::BaseInstType const * type ) {
     180                if ( !shouldScrub( type ) ) return type;
     181                return new ast::PointerType( new ast::VoidType( type->qualifiers ) );
     182        }
     183};
     184
     185ast::Type const * ScrubTypeVars::postvisit( ast::TypeInstType const * type ) {
     186        // This implies that mode == ScrubMode::All.
     187        if ( !typeVars ) {
     188                if ( ast::TypeDecl::Ftype == type->kind ) {
     189                        return new ast::PointerType(
     190                                new ast::FunctionType( ast::FixedArgs ) );
     191                } else {
     192                        return new ast::PointerType(
     193                                new ast::VoidType( type->qualifiers ) );
     194                }
     195        }
     196
     197        auto typeVar = typeVars->find( type->name );
     198        if ( typeVar == typeVars->end() ) {
     199                return type;
     200        }
     201
     202        switch ( typeVar->second.kind ) {
     203        case ast::TypeDecl::Dtype:
     204        case ast::TypeDecl::Ttype:
     205                return new ast::PointerType(
     206                        new ast::VoidType( type->qualifiers ) );
     207        case ast::TypeDecl::Ftype:
     208                return new ast::PointerType(
     209                        new ast::FunctionType( ast::VariableArgs ) );
     210        default:
     211                assertf( false,
     212                        "Unhandled type variable kind: %d", typeVar->second.kind );
     213                throw; // Just in case the assert is removed, stop here.
     214        }
     215}
     216
     217ast::Type const * ScrubTypeVars::postvisit( ast::StructInstType const * type ) {
     218        return postvisitAggregateType( type );
     219}
     220
     221ast::Type const * ScrubTypeVars::postvisit( ast::UnionInstType const * type ) {
     222        return postvisitAggregateType( type );
     223}
     224
     225ast::Expr const * ScrubTypeVars::postvisit( ast::SizeofExpr const * expr ) {
     226        // sizeof( T ) becomes the _sizeof_T parameter.
     227        if ( dynType ) {
     228                return new ast::NameExpr( expr->location,
     229                        sizeofName( Mangle::mangle( dynType, Mangle::typeMode() ) ) );
     230        } else {
     231                return expr;
     232        }
     233}
     234
     235ast::Expr const * ScrubTypeVars::postvisit( ast::AlignofExpr const * expr ) {
     236        // alignof( T ) becomes the _alignof_T parameter.
     237        if ( dynType ) {
     238                return new ast::NameExpr( expr->location,
     239                        alignofName( Mangle::mangle( dynType, Mangle::typeMode() ) ) );
     240        } else {
     241                return expr;
     242        }
     243}
     244
     245ast::Type const * ScrubTypeVars::postvisit( ast::PointerType const * type ) {
     246        if ( dynType ) {
     247                ast::Type * ret = ast::mutate( dynType->accept( *visitor ) );
     248                ret->qualifiers |= type->qualifiers;
     249                return ret;
     250        } else {
     251                return type;
     252        }
     253}
     254
     255const ast::Node * scrubTypeVarsBase(
     256                const ast::Node * target,
     257                ScrubMode mode, const TyVarMap * typeVars ) {
     258        if ( ScrubMode::All == mode ) {
     259                assert( nullptr == typeVars );
     260        } else {
     261                assert( nullptr != typeVars );
     262        }
     263        ast::Pass<ScrubTypeVars> visitor( mode, typeVars );
     264        return target->accept( visitor );
     265}
     266
     267} // namespace
     268
     269template<>
     270ast::Node const * scrubAllTypeVars<ast::Node>( const ast::Node * target ) {
     271        return scrubTypeVarsBase( target, ScrubMode::All, nullptr );
     272}
     273
    114274} // namespace GenPoly
    115275
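
The SizeofExpr and AlignofExpr cases above encode the boxing convention: once a type variable's layout is known only at run time, sizeof(T) cannot be computed by the C compiler, so the expression is rewritten to read a layout parameter instead, and polymorphic pointers decay to void *. A hedged before/after sketch (the _sizeof_T spelling is illustrative of the sizeofName(Mangle::mangle(...)) convention, not an exact mangled name):

    /* CFA source:
           forall( otype T ) size_t f( T * p ) { return sizeof( T ); }

       after scrubbing, roughly this C:
           size_t f( size_t _sizeof_T, size_t _alignof_T, void * p ) {
               return _sizeof_T;            // sizeof(T) -> layout parameter
           }                                // T * scrubbed to void *
    */
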
  • src/GenPoly/ScrubTyVars.h

    rebf8ca5 r23a08aa0  
    99// Author           : Richard C. Bilson
    1010// Created On       : Mon May 18 07:44:20 2015
    11 // Last Modified By : Peter A. Buhr
    12 // Last Modified On : Sat Jul 22 09:21:47 2017
    13 // Update Count     : 2
     11// Last Modified By : Andrew Beach
     12// Last Modified On : Fri Aug 19 14:14:00 2022
     13// Update Count     : 3
    1414//
    1515
     
    1818#include <cassert>            // for assert
    1919
     20#include "AST/Fwd.hpp"        // for Node
    2021#include "Common/PassVisitor.h"
    2122#include "GenPoly.h"          // for TyVarMap, isPolyType, isDynType
     
    108109        }
    109110
      111/// For all polymorphic types, replaces generic types with the appropriate
      112/// void type, and sizeof/alignof expressions with the proper variable.
     113template<typename node_t>
     114node_t const * scrubAllTypeVars( node_t const * target ) {
     115        return strict_dynamic_cast<node_t const *>( scrubAllTypeVars<ast::Node>( target ) );
     116}
     117
     118template<>
     119ast::Node const * scrubAllTypeVars<ast::Node>( const ast::Node * target );
     120
    110121} // namespace GenPoly
    111122
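
The wrapper pattern here is worth noting: the explicit ast::Node specialization (defined in the .cc, where the pass lives) does the actual work, and the inline primary template restores the caller's static type with strict_dynamic_cast. A minimal runnable analogue using a plain dynamic_cast:

    #include <cassert>

    struct Node { virtual ~Node() = default; };
    struct Leaf : Node {};

    // Primary template: forward to the Node specialization, then cast the
    // result back to the caller's static type.
    template<typename node_t>
    node_t const * process( node_t const * target ) {
        return dynamic_cast<node_t const *>( process<Node>( target ) );
    }

    // Out-of-line specialization: the real pass would run here, over Node.
    template<>
    Node const * process<Node>( Node const * target ) { return target; }

    int main() {
        Leaf leaf;
        Leaf const * out = process( &leaf );    // call site needs no casts
        assert( out == &leaf );
    }
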
  • src/GenPoly/SpecializeNew.cpp

    rebf8ca5 r23a08aa0  
    240240}
    241241
    242 namespace {
    243         struct TypeInstFixer : public ast::WithShortCircuiting {
    244                 std::map<const ast::TypeDecl *, std::pair<int, int>> typeMap;
    245 
    246                 void previsit(const ast::TypeDecl *) { visit_children = false; }
    247                 const ast::TypeInstType * postvisit(const ast::TypeInstType * typeInst) {
    248                         if (typeMap.count(typeInst->base)) {
    249                                 ast::TypeInstType * newInst = mutate(typeInst);
    250                                 auto const & pair = typeMap[typeInst->base];
    251                                 newInst->expr_id = pair.first;
    252                                 newInst->formal_usage = pair.second;
    253                                 return newInst;
    254                         }
    255                         return typeInst;
    256                 }
    257         };
    258 }
     242struct TypeInstFixer final : public ast::WithShortCircuiting {
     243        std::map<const ast::TypeDecl *, std::pair<int, int>> typeMap;
     244
     245        void previsit(const ast::TypeDecl *) { visit_children = false; }
     246        const ast::TypeInstType * postvisit(const ast::TypeInstType * typeInst) {
     247                if (typeMap.count(typeInst->base)) {
     248                        ast::TypeInstType * newInst = mutate(typeInst);
     249                        auto const & pair = typeMap[typeInst->base];
     250                        newInst->expr_id = pair.first;
     251                        newInst->formal_usage = pair.second;
     252                        return newInst;
     253                }
     254                return typeInst;
     255        }
     256};
    259257
    260258const ast::Expr * SpecializeCore::createThunkFunction(
  • src/GenPoly/module.mk

    rebf8ca5 r23a08aa0  
    2727        GenPoly/FindFunction.cc \
    2828        GenPoly/FindFunction.h \
     29        GenPoly/InstantiateGenericNew.cpp \
    2930        GenPoly/InstantiateGeneric.cc \
    3031        GenPoly/InstantiateGeneric.h \
  • src/InitTweak/InitTweak.cc

    rebf8ca5 r23a08aa0  
    12411241        static const char * const tlsd_section = ".tdata" ASM_COMMENT;
    12421242        void addDataSectionAttribute( ObjectDecl * objDecl ) {
    1243                 const bool is_tls = objDecl->get_storageClasses().is_threadlocal;
     1243                const bool is_tls = objDecl->get_storageClasses().is_threadlocal_any();
    12441244                const char * section = is_tls ? tlsd_section : data_section;
    12451245                objDecl->attributes.push_back(new Attribute("section", {
     
    12491249
    12501250        void addDataSectionAttribute( ast::ObjectDecl * objDecl ) {
    1251                 const bool is_tls = objDecl->storage.is_threadlocal;
     1251                const bool is_tls = objDecl->storage.is_threadlocal_any();
    12521252                const char * section = is_tls ? tlsd_section : data_section;
    12531253                objDecl->attributes.push_back(new ast::Attribute("section", {
  • src/Makefile.am

    rebf8ca5 r23a08aa0  
    7171EXTRA_DIST = include/cassert include/optional BasicTypes-gen.cc
    7272
    73 AM_CXXFLAGS = @HOST_FLAGS@ -Wno-deprecated -Wall -Wextra -Werror=return-type -DDEBUG_ALL -I./Parser -I$(srcdir)/Parser -I$(srcdir)/include -DYY_NO_INPUT -O3 -g -std=c++14 $(TCMALLOCFLAG)
     73AM_CXXFLAGS = @HOST_FLAGS@ -Wno-deprecated -Wall -Wextra -Werror=return-type -DDEBUG_ALL -I./Parser -I$(srcdir)/Parser -I$(srcdir)/include -DYY_NO_INPUT -O3 -g -std=c++17 $(TCMALLOCFLAG)
    7474AM_LDFLAGS  = @HOST_FLAGS@ -Xlinker -export-dynamic
    7575ARFLAGS     = cr
  • src/Parser/DeclarationNode.cc

    rebf8ca5 r23a08aa0  
    262262        newnode->type->enumeration.anon = name == nullptr;
    263263        if ( base && base->type)  {
    264                 newnode->type->base = base->type;       
     264                newnode->type->base = base->type;
    265265        } // if
    266266
     
    505505                        } // for
    506506                        // src is the new item being added and has a single bit
    507                 } else if ( ! src->storageClasses.is_threadlocal ) { // conflict ?
     507                } else if ( ! src->storageClasses.is_threadlocal_any() ) { // conflict ?
    508508                        appendError( error, string( "conflicting " ) + Type::StorageClassesNames[storageClasses.ffs()] +
    509509                                                 " & " + Type::StorageClassesNames[src->storageClasses.ffs()] );
  • src/Parser/lex.ll

    rebf8ca5 r23a08aa0  
    1010 * Created On       : Sat Sep 22 08:58:10 2001
    1111 * Last Modified By : Peter A. Buhr
    12  * Last Modified On : Sun Jun 20 18:41:09 2021
    13  * Update Count     : 759
     12 * Last Modified On : Tue Aug 30 18:39:54 2022
     13 * Update Count     : 760
    1414 */
    1515
     
    314314switch                  { KEYWORD_RETURN(SWITCH); }
    315315thread                  { KEYWORD_RETURN(THREAD); }                             // C11
    316 _Thread_local   { KEYWORD_RETURN(THREADLOCAL); }                // C11
     316__thread                { KEYWORD_RETURN(THREADLOCALGCC); }             // GCC
     317_Thread_local   { KEYWORD_RETURN(THREADLOCALC11); }             // C11
    317318throw                   { KEYWORD_RETURN(THROW); }                              // CFA
    318319throwResume             { KEYWORD_RETURN(THROWRESUME); }                // CFA
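
Where _Thread_local was previously the only thread-local spelling the lexer knew, each spelling now lexes to its own token so the parser (next hunk) can record which keyword was written, while later passes query them uniformly through is_threadlocal_any() (see InitTweak.cc above and SynTree/Type.h below). In the C being scanned:

    __thread      int tls_gcc = 0;    /* GCC spelling  -> THREADLOCALGCC */
    _Thread_local int tls_c11 = 0;    /* C11 spelling  -> THREADLOCALC11 */
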
  • src/Parser/parser.yy

    rebf8ca5 r23a08aa0  
    5858
    5959// lex uses __null in a boolean context, it's fine.
    60 //#pragma GCC diagnostic ignored "-Wparentheses-equality"
     60#pragma GCC diagnostic ignored "-Wpragmas"
     61#pragma GCC diagnostic ignored "-Wparentheses-equality"
     62#pragma GCC diagnostic warning "-Wpragmas"
    6163
    6264extern DeclarationNode * parseTree;
     
    293295%token TYPEDEF
    294296%token EXTERN STATIC AUTO REGISTER
    295 %token THREADLOCAL                                                                              // C11
     297%token THREADLOCALGCC THREADLOCALC11                                            // GCC, C11
    296298%token INLINE FORTRAN                                                                   // C99, extension ISO/IEC 9899:1999 Section J.5.9(1)
    297299%token NORETURN                                                                                 // C11
     
    13451347                {
    13461348                        if ( $2 == OperKinds::LThan || $2 == OperKinds::LEThan ) { SemanticError( yylloc, MISSING_ANON_FIELD ); $$ = nullptr; }
    1347                         else { SemanticError( yylloc, MISSING_HIGH ); $$ = nullptr; } 
     1349                        else { SemanticError( yylloc, MISSING_HIGH ); $$ = nullptr; }
    13481350                }
    13491351        | comma_expression updowneq comma_expression '~' comma_expression // CFA, anonymous loop-index
     
    13571359                {
    13581360                        if ( $2 == OperKinds::LThan || $2 == OperKinds::LEThan ) { SemanticError( yylloc, MISSING_ANON_FIELD ); $$ = nullptr; }
    1359                         else { SemanticError( yylloc, MISSING_HIGH ); $$ = nullptr; } 
     1361                        else { SemanticError( yylloc, MISSING_HIGH ); $$ = nullptr; }
    13601362                }
    13611363        | comma_expression updowneq comma_expression '~' '@' // CFA, error
     
    20822084        | REGISTER
    20832085                { $$ = DeclarationNode::newStorageClass( Type::Register ); }
    2084         | THREADLOCAL                                                                           // C11
    2085                 { $$ = DeclarationNode::newStorageClass( Type::Threadlocal ); }
     2086        | THREADLOCALGCC                                                                                // GCC
     2087                { $$ = DeclarationNode::newStorageClass( Type::ThreadlocalGcc ); }
     2088        | THREADLOCALC11                                                                                // C11
     2089                { $$ = DeclarationNode::newStorageClass( Type::ThreadlocalC11 ); }
    20862090                // Put function specifiers here to simplify parsing rules, but separate them semantically.
    20872091        | INLINE                                                                                        // C99
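
The three-pragma sandwich near the top of this hunk is a portability trick: -Wparentheses-equality exists only in clang, and GCC would otherwise complain about the unknown option via -Wpragmas; disabling -Wpragmas first and restoring it afterwards keeps both compilers quiet. The same shape works in general:

    // Suppress a clang-only warning without tripping GCC's -Wpragmas:
    #pragma GCC diagnostic ignored "-Wpragmas"              // GCC: tolerate the unknown option below
    #pragma GCC diagnostic ignored "-Wparentheses-equality" // clang-only option
    #pragma GCC diagnostic warning "-Wpragmas"              // restore unknown-option checking

    int main() {}
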
  • src/ResolvExpr/CandidateFinder.cpp

    rebf8ca5 r23a08aa0  
    269269                        unsigned nextArg, unsigned tupleStart = 0, Cost cost = Cost::zero,
    270270                        unsigned nextExpl = 0, unsigned explAlt = 0 )
    271                 : parent(parent), expr( expr ), cost( cost ), env( move( env ) ), need( move( need ) ),
    272                   have( move( have ) ), open( move( open ) ), nextArg( nextArg ), tupleStart( tupleStart ),
     271                : parent(parent), expr( expr ), cost( cost ), env( std::move( env ) ), need( std::move( need ) ),
     272                  have( std::move( have ) ), open( std::move( open ) ), nextArg( nextArg ), tupleStart( tupleStart ),
    273273                  nextExpl( nextExpl ), explAlt( explAlt ) {}
    274274
     
    276276                        const ArgPack & o, ast::TypeEnvironment && env, ast::AssertionSet && need,
    277277                        ast::AssertionSet && have, ast::OpenVarSet && open, unsigned nextArg, Cost added )
    278                 : parent( o.parent ), expr( o.expr ), cost( o.cost + added ), env( move( env ) ),
    279                   need( move( need ) ), have( move( have ) ), open( move( open ) ), nextArg( nextArg ),
     278                : parent( o.parent ), expr( o.expr ), cost( o.cost + added ), env( std::move( env ) ),
     279                  need( std::move( need ) ), have( std::move( have ) ), open( std::move( open ) ), nextArg( nextArg ),
    280280                  tupleStart( o.tupleStart ), nextExpl( 0 ), explAlt( 0 ) {}
    281281
     
    301301                        // reset pack to appropriate tuple
    302302                        std::vector< ast::ptr< ast::Expr > > exprv( exprs.begin(), exprs.end() );
    303                         expr = new ast::TupleExpr{ expr->location, move( exprv ) };
     303                        expr = new ast::TupleExpr{ expr->location, std::move( exprv ) };
    304304                        tupleStart = pack->tupleStart - 1;
    305305                        parent = pack->parent;
     
    404404                                                                newResult.open, symtab )
    405405                                                ) {
    406                                                         finalResults.emplace_back( move( newResult ) );
     406                                                        finalResults.emplace_back( std::move( newResult ) );
    407407                                                }
    408408
     
    423423                                                if ( expl.exprs.empty() ) {
    424424                                                        results.emplace_back(
    425                                                                 results[i], move( env ), copy( results[i].need ),
    426                                                                 copy( results[i].have ), move( open ), nextArg + 1, expl.cost );
     425                                                                results[i], std::move( env ), copy( results[i].need ),
     426                                                                copy( results[i].have ), std::move( open ), nextArg + 1, expl.cost );
    427427
    428428                                                        continue;
     
    431431                                                // add new result
    432432                                                results.emplace_back(
    433                                                         i, expl.exprs.front(), move( env ), copy( results[i].need ),
    434                                                         copy( results[i].have ), move( open ), nextArg + 1, nTuples,
     433                                                        i, expl.exprs.front(), std::move( env ), copy( results[i].need ),
     434                                                        copy( results[i].have ), std::move( open ), nextArg + 1, nTuples,
    435435                                                        expl.cost, expl.exprs.size() == 1 ? 0 : 1, j );
    436436                                        }
     
    444444                        // splice final results onto results
    445445                        for ( std::size_t i = 0; i < finalResults.size(); ++i ) {
    446                                 results.emplace_back( move( finalResults[i] ) );
     446                                results.emplace_back( std::move( finalResults[i] ) );
    447447                        }
    448448                        return ! finalResults.empty();
     
    478478
    479479                                        results.emplace_back(
    480                                                 i, expr, move( env ), move( need ), move( have ), move( open ), nextArg,
     480                                                i, expr, std::move( env ), std::move( need ), std::move( have ), std::move( open ), nextArg,
    481481                                                nTuples, Cost::zero, nextExpl, results[i].explAlt );
    482482                                }
     
    494494                                        if ( unify( paramType, cnst->result, env, need, have, open, symtab ) ) {
    495495                                                results.emplace_back(
    496                                                         i, new ast::DefaultArgExpr{ cnst->location, cnst }, move( env ),
    497                                                         move( need ), move( have ), move( open ), nextArg, nTuples );
     496                                                        i, new ast::DefaultArgExpr{ cnst->location, cnst }, std::move( env ),
     497                                                        std::move( need ), std::move( have ), std::move( open ), nextArg, nTuples );
    498498                                        }
    499499                                }
     
    516516                                if ( expl.exprs.empty() ) {
    517517                                        results.emplace_back(
    518                                                 results[i], move( env ), move( need ), move( have ), move( open ),
     518                                                results[i], std::move( env ), std::move( need ), std::move( have ), std::move( open ),
    519519                                                nextArg + 1, expl.cost );
    520520
     
    538538                                        // add new result
    539539                                        results.emplace_back(
    540                                                 i, expr, move( env ), move( need ), move( have ), move( open ),
     540                                                i, expr, std::move( env ), std::move( need ), std::move( have ), std::move( open ),
    541541                                                nextArg + 1, nTuples, expl.cost, expl.exprs.size() == 1 ? 0 : 1, j );
    542542                                }
     
    576576                                        restructureCast( idx, toType->getComponent( i ), isGenerated ) );
    577577                        }
    578                         return new ast::TupleExpr{ arg->location, move( components ) };
     578                        return new ast::TupleExpr{ arg->location, std::move( components ) };
    579579                } else {
    580580                        // handle normally
     
    672672                        }
    673673                        std::vector< ast::ptr< ast::Expr > > vargs( args.begin(), args.end() );
    674                         appExpr->args = move( vargs );
     674                        appExpr->args = std::move( vargs );
    675675                        // build and validate new candidate
    676676                        auto newCand =
     
    783783                                                        if ( expl.exprs.empty() ) {
    784784                                                                results.emplace_back(
    785                                                                         results[i], move( env ), copy( results[i].need ),
    786                                                                         copy( results[i].have ), move( open ), nextArg + 1,
     785                                                                        results[i], std::move( env ), copy( results[i].need ),
     786                                                                        copy( results[i].have ), std::move( open ), nextArg + 1,
    787787                                                                        expl.cost );
    788788
     
    792792                                                        // add new result
    793793                                                        results.emplace_back(
    794                                                                 i, expl.exprs.front(), move( env ), copy( results[i].need ),
    795                                                                 copy( results[i].have ), move( open ), nextArg + 1, 0, expl.cost,
     794                                                                i, expl.exprs.front(), std::move( env ), copy( results[i].need ),
     795                                                                copy( results[i].have ), std::move( open ), nextArg + 1, 0, expl.cost,
    796796                                                                expl.exprs.size() == 1 ? 0 : 1, j );
    797797                                                }
     
    843843                                // as a member expression
    844844                                addAnonConversions( newCand );
    845                                 candidates.emplace_back( move( newCand ) );
     845                                candidates.emplace_back( std::move( newCand ) );
    846846                        }
    847847                }
     
    901901                                                        const ast::EnumDecl * enumDecl = enumInst->base;
    902902                                                        if ( const ast::Type* enumType = enumDecl->base ) {
    903                                                                 // instance of enum (T) is a instance of type (T) 
      903                                                                // instance of enum (T) is an instance of type (T)
    904904                                                                funcFinder.otypeKeys.insert(Mangle::mangle(enumType, Mangle::NoGenericParams | Mangle::Type));
    905905                                                        } else {
     
    907907                                                                funcFinder.otypeKeys.insert(Mangle::mangle(enumDecl, Mangle::NoGenericParams | Mangle::Type));
    908908                                                        }
    909                                                 } 
     909                                                }
    910910                                                else funcFinder.otypeKeys.insert(Mangle::mangle(argType, Mangle::NoGenericParams | Mangle::Type));
    911911                                        }
     
    986986                                        funcE.emplace_back( *func, symtab );
    987987                                }
    988                                 argExpansions.emplace_front( move( funcE ) );
     988                                argExpansions.emplace_front( std::move( funcE ) );
    989989
    990990                                for ( const CandidateRef & op : opFinder ) {
     
    10301030                                if ( cvtCost != Cost::infinity ) {
    10311031                                        withFunc->cvtCost = cvtCost;
    1032                                         candidates.emplace_back( move( withFunc ) );
    1033                                 }
    1034                         }
    1035                         found = move( candidates );
     1032                                        candidates.emplace_back( std::move( withFunc ) );
     1033                                }
     1034                        }
     1035                        found = std::move( candidates );
    10361036
    10371037                        // use a new list so that candidates are not examined by addAnonConversions twice
     
    11311131                                        CandidateRef newCand = std::make_shared<Candidate>(
    11321132                                                restructureCast( cand->expr, toType, castExpr->isGenerated ),
    1133                                                 copy( cand->env ), move( open ), move( need ), cand->cost,
     1133                                                copy( cand->env ), std::move( open ), std::move( need ), cand->cost,
    11341134                                                cand->cost + thisCost );
    11351135                                        inferParameters( newCand, matches );
     
    12851285                                // as a name expression
    12861286                                addAnonConversions( newCand );
    1287                                 candidates.emplace_back( move( newCand ) );
     1287                                candidates.emplace_back( std::move( newCand ) );
    12881288                        }
    12891289                }
     
    13941394                                                new ast::LogicalExpr{
    13951395                                                        logicalExpr->location, r1->expr, r2->expr, logicalExpr->isAnd },
    1396                                                 move( env ), move( open ), move( need ), r1->cost + r2->cost );
     1396                                                std::move( env ), std::move( open ), std::move( need ), r1->cost + r2->cost );
    13971397                                }
    13981398                        }
     
    14521452                                                        // output candidate
    14531453                                                        CandidateRef newCand = std::make_shared<Candidate>(
    1454                                                                 newExpr, move( env ), move( open ), move( need ), cost );
     1454                                                                newExpr, std::move( env ), std::move( open ), std::move( need ), cost );
    14551455                                                        inferParameters( newCand, candidates );
    14561456                                                }
     
    15191519                                                // add candidate
    15201520                                                CandidateRef newCand = std::make_shared<Candidate>(
    1521                                                         newExpr, move( env ), move( open ), move( need ),
     1521                                                        newExpr, std::move( env ), std::move( open ), std::move( need ),
    15221522                                                        r1->cost + r2->cost );
    15231523                                                inferParameters( newCand, candidates );
     
    15481548
    15491549                                addCandidate(
    1550                                         new ast::TupleExpr{ tupleExpr->location, move( exprs ) },
    1551                                         move( env ), move( open ), move( need ), sumCost( subs ) );
     1550                                        new ast::TupleExpr{ tupleExpr->location, std::move( exprs ) },
     1551                                        std::move( env ), std::move( open ), std::move( need ), sumCost( subs ) );
    15521552                        }
    15531553                }
     
    16351635                                                                initExpr->location, restructureCast( cand->expr, toType ),
    16361636                                                                initAlt.designation },
    1637                                                         move(env), move( open ), move( need ), cand->cost, thisCost );
     1637                                                        std::move(env), std::move( open ), std::move( need ), cand->cost, thisCost );
    16381638                                                inferParameters( newCand, matches );
    16391639                                        }
     
    17681768                cand->env.applyFree( newResult );
    17691769                cand->expr = ast::mutate_field(
    1770                         cand->expr.get(), &ast::Expr::result, move( newResult ) );
     1770                        cand->expr.get(), &ast::Expr::result, std::move( newResult ) );
    17711771
    17721772                out.emplace_back( cand );
     
    18541854
    18551855                auto oldsize = candidates.size();
    1856                 candidates = move( pruned );
     1856                candidates = std::move( pruned );
    18571857
    18581858                PRINT(
  • src/SynTree/Statement.cc

    rebf8ca5 r23a08aa0  
    105105};
    106106
    107 BranchStmt::BranchStmt( Label target, Type type ) throw ( SemanticErrorException ) :
     107BranchStmt::BranchStmt( Label target, Type type ) :
    108108        Statement(), originalTarget( target ), target( target ), computedTarget( nullptr ), type( type ) {
    109109        //actually this is a syntactic error signaled by the parser
     
    113113}
    114114
    115 BranchStmt::BranchStmt( Expression * computedTarget, Type type ) throw ( SemanticErrorException ) :
     115BranchStmt::BranchStmt( Expression * computedTarget, Type type ) :
    116116        Statement(), computedTarget( computedTarget ), type( type ) {
    117117        if ( type != BranchStmt::Goto || computedTarget == nullptr ) {
     
    211211}
    212212
    213 CaseStmt::CaseStmt( Expression * condition, const list<Statement *> & statements, bool deflt ) throw ( SemanticErrorException ) :
     213CaseStmt::CaseStmt( Expression * condition, const list<Statement *> & statements, bool deflt ) :
    214214                Statement(), condition( condition ), stmts( statements ), _isDefault( deflt ) {
    215215        if ( isDefault() && condition != nullptr ) SemanticError( condition, "default case with condition: " );
     
    575575}
    576576
    577 MutexStmt::MutexStmt( Statement * stmt, const list<Expression *> mutexObjs ) 
     577MutexStmt::MutexStmt( Statement * stmt, const list<Expression *> mutexObjs )
    578578        : Statement(), stmt( stmt ), mutexObjs( mutexObjs ) { }
    579579
  • src/SynTree/Statement.h

    rebf8ca5 r23a08aa0  
    200200        std::list<Statement *> stmts;
    201201
    202         CaseStmt( Expression * conditions, const std::list<Statement *> & stmts, bool isdef = false ) throw (SemanticErrorException);
     202        CaseStmt( Expression * conditions, const std::list<Statement *> & stmts, bool isdef = false );
    203203        CaseStmt( const CaseStmt & other );
    204204        virtual ~CaseStmt();
     
    289289        Type type;
    290290
    291         BranchStmt( Label target, Type ) throw (SemanticErrorException);
    292         BranchStmt( Expression * computedTarget, Type ) throw (SemanticErrorException);
     291        BranchStmt( Label target, Type );
     292        BranchStmt( Expression * computedTarget, Type );
    293293
    294294        Label get_originalTarget() { return originalTarget; }
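
The Statement.cc and Statement.h hunks drop dynamic exception specifications, which C++11 deprecated and C++17 removed outright. A minimal sketch of the before and after shape (SemanticError below is a stand-in, not the project's SemanticErrorException):

    struct SemanticError {};

    struct CaseStmtSketch {
        // pre-C++17 style, now ill-formed under -std=c++17:
        //   CaseStmtSketch( bool isDefault ) throw ( SemanticError );
        CaseStmtSketch( bool isDefault ) {  // may still throw SemanticError
            if ( isDefault ) throw SemanticError{};
        }
    };

    int main() {
        try { CaseStmtSketch bad( true ); } catch ( const SemanticError & ) { return 0; }
        return 1;
    }

Removing the specification changes nothing about what the constructors may throw; it only drops the deprecated annotation.
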
  • src/SynTree/Type.cc

    rebf8ca5 r23a08aa0  
    8080// These must remain in the same order as the corresponding bit fields.
    8181const char * Type::FuncSpecifiersNames[] = { "inline", "_Noreturn", "fortran" };
    82 const char * Type::StorageClassesNames[] = { "extern", "static", "auto", "register", "_Thread_local" };
     82const char * Type::StorageClassesNames[] = { "extern", "static", "auto", "register", "__thread", "_Thread_local" };
    8383const char * Type::QualifiersNames[] = { "const", "restrict", "volatile", "mutex", "_Atomic" };
    8484
  • src/SynTree/Type.h

    rebf8ca5 r23a08aa0  
    8484        }; // FuncSpecifiers
    8585
    86         enum { Extern = 1 << 0, Static = 1 << 1, Auto = 1 << 2, Register = 1 << 3, Threadlocal = 1 << 4, NumStorageClass = 5 };
     86        enum { Extern = 1 << 0, Static = 1 << 1, Auto = 1 << 2, Register = 1 << 3, ThreadlocalGcc = 1 << 4, ThreadlocalC11 = 1 << 5, NumStorageClass = 6 };
    8787        static const char * StorageClassesNames[];
    8888        union StorageClasses {
     
    9393                        bool is_auto : 1;
    9494                        bool is_register : 1;
    95                         bool is_threadlocal : 1;
     95                        bool is_threadlocalGcc : 1;
     96                        bool is_threadlocalC11 : 1;
    9697                };
    9798
     
    100101                // equality (==, !=) works implicitly on first field "val", relational operations are undefined.
    101102                BFCommon( StorageClasses, NumStorageClass )
     103
     104                bool is_threadlocal_any() { return this->is_threadlocalC11 || this->is_threadlocalGcc; }
    102105        }; // StorageClasses
    103106
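
A rough, self-contained approximation of the widened StorageClasses union above, assuming nothing beyond the diff (the BFCommon machinery is elided, and the anonymous struct member is the same GCC/Clang extension the real header already relies on):

    #include <cassert>

    union StorageClassesSketch {
        unsigned int val;                    // all flag bits viewed at once
        struct {                             // anonymous struct: compiler extension
            bool is_extern : 1;
            bool is_static : 1;
            bool is_auto : 1;
            bool is_register : 1;
            bool is_threadlocalGcc : 1;      // set for __thread
            bool is_threadlocalC11 : 1;      // set for _Thread_local
        };
        bool is_threadlocal_any() const { return is_threadlocalGcc || is_threadlocalC11; }
    };

    int main() {
        StorageClassesSketch sc = { 0 };
        sc.is_threadlocalC11 = true;
        assert( sc.is_threadlocal_any() );
    }

Splitting the bit keeps __thread and _Thread_local distinguishable through the pipeline (note the matching StorageClassesNames entry added in Type.cc), while is_threadlocal_any() preserves the old single-flag queries.
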
  • src/Tuples/TupleExpansionNew.cpp

    rebf8ca5 r23a08aa0  
    101101
    102102/// Replaces Tuple Assign & Index Expressions, and Tuple Types.
    103 struct TupleMainExpander :
     103struct TupleMainExpander final :
    104104                public ast::WithGuards,
    105105                public ast::WithVisitorRef<TupleMainExpander>,
     
    254254}
    255255
    256 struct TupleExprExpander {
     256struct TupleExprExpander final {
    257257        ast::Expr const * postvisit( ast::TupleExpr const * expr ) {
    258258                return replaceTupleExpr( expr->location,
  • src/Virtual/ExpandCasts.cc

    rebf8ca5 r23a08aa0  
    317317};
    318318
    319 struct ExpandCastsCore {
     319struct ExpandCastsCore final {
    320320        void previsit( ast::FunctionDecl const * decl );
    321321        void previsit( ast::StructDecl const * decl );
     
    362362}
    363363
     364/// Copy newType, but give the copy the params of the oldType.
    364365ast::StructInstType * polyCopy(
    365366                ast::StructInstType const * oldType,
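
TupleMainExpander, TupleExprExpander, and ExpandCastsCore above all gain final. A small sketch of what that buys, independent of the real pass framework: further derivation is rejected at compile time, and the compiler may devirtualize calls made through the final type.

    struct VisitorCore {
        virtual void previsit() {}
        virtual ~VisitorCore() = default;
    };

    struct ExpandCastsCoreSketch final : VisitorCore {
        void previsit() override { /* pass-specific work */ }
    };

    // struct Derived : ExpandCastsCoreSketch {};  // error: base marked final

    int main() {
        ExpandCastsCoreSketch core;
        core.previsit();  // target statically known; no virtual dispatch required
    }
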
  • src/config.h.in

    rebf8ca5 r23a08aa0  
    2727/* Location of cfa install. */
    2828#undef CFA_PREFIX
    29 
    30 /* Sets whether or not to use the new-ast, this is a default value and can be
    31    overridden by --old-ast and --new-ast */
    32 #undef CFA_USE_NEW_AST
    3329
    3430/* Major.Minor */
  • src/main.cc

    rebf8ca5 r23a08aa0  
    1010// Created On       : Fri May 15 23:12:02 2015
    1111// Last Modified By : Andrew Beach
    12 // Last Modified On : Thu 11 12:18:00 2022
    13 // Update Count     : 677
     12// Last Modified On : Thu Sep 15 13:58:00 2022
     13// Update Count     : 678
    1414//
    1515
     
    3838#include "CodeGen/Generate.h"               // for generate
    3939#include "CodeGen/LinkOnce.h"               // for translateLinkOnce
    40 #include "CodeTools/DeclStats.h"            // for printDeclStats
    41 #include "CodeTools/ResolvProtoDump.h"      // for dumpAsResolvProto
    4240#include "CodeTools/TrackLoc.h"             // for fillLocations
    4341#include "Common/CodeLocationTools.hpp"     // for forceFillCodeLocations
     
    4543#include "Common/DeclStats.hpp"             // for printDeclStats
    4644#include "Common/ResolvProtoDump.hpp"       // for dumpAsResolverProto
    47 #include "Common/Stats.h"
    48 #include "Common/PassVisitor.h"
    49 #include "Common/SemanticError.h"           // for SemanticError
     45#include "Common/Stats.h"                   // for Stats
    5046#include "Common/UnimplementedError.h"      // for UnimplementedError
    5147#include "Common/utility.h"                 // for deleteAll, filter, printAll
     
    5349#include "Concurrency/Waitfor.h"            // for generateWaitfor
    5450#include "ControlStruct/ExceptDecl.h"       // for translateExcept
    55 #include "ControlStruct/ExceptTranslate.h"  // for translateEHM
     51#include "ControlStruct/ExceptTranslate.h"  // for translateThrows, translat...
    5652#include "ControlStruct/FixLabels.hpp"      // for fixLabels
    5753#include "ControlStruct/HoistControlDecls.hpp" //  hoistControlDecls
    58 #include "ControlStruct/Mutate.h"           // for mutate
    5954#include "GenPoly/Box.h"                    // for box
    6055#include "GenPoly/InstantiateGeneric.h"     // for instantiateGeneric
     
    6661#include "Parser/ParseNode.h"               // for DeclarationNode, buildList
    6762#include "Parser/TypedefTable.h"            // for TypedefTable
    68 #include "ResolvExpr/AlternativePrinter.h"  // for AlternativePrinter
    6963#include "ResolvExpr/CandidatePrinter.hpp"  // for printCandidates
    7064#include "ResolvExpr/Resolver.h"            // for resolve
    71 #include "SymTab/Validate.h"                // for validate
    72 #include "SymTab/ValidateType.h"            // for linkReferenceToTypes
    7365#include "SynTree/LinkageSpec.h"            // for Spec, Cforall, Intrinsic
    7466#include "SynTree/Declaration.h"            // for Declaration
    75 #include "SynTree/Visitor.h"                // for acceptAll
    7667#include "Tuples/Tuples.h"                  // for expandMemberTuples, expan...
    7768#include "Validate/Autogen.hpp"             // for autogenerateRoutines
     
    330321                Stats::Time::StopBlock();
    331322
    332                 if( useNewAST ) {
    333                         if (Stats::Counters::enabled) {
    334                                 ast::pass_visitor_stats.avg = Stats::Counters::build<Stats::Counters::AverageCounter<double>>("Average Depth - New");
    335                                 ast::pass_visitor_stats.max = Stats::Counters::build<Stats::Counters::MaxCounter<double>>("Max depth - New");
    336                         }
    337                         auto transUnit = convert( move( translationUnit ) );
    338 
    339                         forceFillCodeLocations( transUnit );
    340 
    341                         PASS( "Translate Exception Declarations", ControlStruct::translateExcept( transUnit ) );
    342                         if ( exdeclp ) {
    343                                 dump( move( transUnit ) );
    344                                 return EXIT_SUCCESS;
    345                         }
    346 
    347                         PASS( "Verify Ctor, Dtor & Assign", Validate::verifyCtorDtorAssign( transUnit ) );
    348                         PASS( "Hoist Type Decls", Validate::hoistTypeDecls( transUnit ) );
    349                         // Hoist Type Decls pulls some declarations out of contexts where
    350                         // locations are not tracked. Perhaps they should be, but for now
    351                         // the full fill solves it.
    352                         forceFillCodeLocations( transUnit );
    353 
    354                         PASS( "Replace Typedefs", Validate::replaceTypedef( transUnit ) );
    355                         PASS( "Fix Return Types", Validate::fixReturnTypes( transUnit ) );
    356                         PASS( "Enum and Pointer Decay", Validate::decayEnumsAndPointers( transUnit ) );
    357 
    358                         PASS( "Link Reference To Types", Validate::linkReferenceToTypes( transUnit ) );
    359 
    360                         PASS( "Fix Qualified Types", Validate::fixQualifiedTypes( transUnit ) );
    361                         PASS( "Hoist Struct", Validate::hoistStruct( transUnit ) );
    362                         PASS( "Eliminate Typedef", Validate::eliminateTypedef( transUnit ) );
    363                         PASS( "Validate Generic Parameters", Validate::fillGenericParameters( transUnit ) );
    364                         PASS( "Translate Dimensions", Validate::translateDimensionParameters( transUnit ) );
    365                         PASS( "Check Function Returns", Validate::checkReturnStatements( transUnit ) );
    366                         PASS( "Fix Return Statements", InitTweak::fixReturnStatements( transUnit ) );
    367                         PASS( "Implement Concurrent Keywords", Concurrency::implementKeywords( transUnit ) );
    368                         PASS( "Forall Pointer Decay", Validate::decayForallPointers( transUnit ) );
    369                         PASS( "Hoist Control Declarations", ControlStruct::hoistControlDecls( transUnit ) );
    370 
    371                         PASS( "Generate Autogen Routines", Validate::autogenerateRoutines( transUnit ) );
    372 
    373                         PASS( "Implement Mutex", Concurrency::implementMutex( transUnit ) );
    374                         PASS( "Implement Thread Start", Concurrency::implementThreadStarter( transUnit ) );
    375                         PASS( "Compound Literal", Validate::handleCompoundLiterals( transUnit ) );
    376                         PASS( "Set Length From Initializer", Validate::setLengthFromInitializer( transUnit ) );
    377                         PASS( "Find Global Decls", Validate::findGlobalDecls( transUnit ) );
    378                         PASS( "Fix Label Address", Validate::fixLabelAddresses( transUnit ) );
    379 
    380                         if ( symtabp ) {
    381                                 return EXIT_SUCCESS;
    382                         } // if
    383 
    384                         if ( expraltp ) {
    385                                 ResolvExpr::printCandidates( transUnit );
    386                                 return EXIT_SUCCESS;
    387                         } // if
    388 
    389                         if ( validp ) {
    390                                 dump( move( transUnit ) );
    391                                 return EXIT_SUCCESS;
    392                         } // if
    393 
    394                         PASS( "Translate Throws", ControlStruct::translateThrows( transUnit ) );
    395                         PASS( "Fix Labels", ControlStruct::fixLabels( transUnit ) );
    396                         PASS( "Fix Names", CodeGen::fixNames( transUnit ) );
    397                         PASS( "Gen Init", InitTweak::genInit( transUnit ) );
    398                         PASS( "Expand Member Tuples" , Tuples::expandMemberTuples( transUnit ) );
    399 
    400                         if ( libcfap ) {
    401                                 // Generate the bodies of cfa library functions.
    402                                 LibCfa::makeLibCfa( transUnit );
    403                         } // if
    404 
    405                         if ( declstatsp ) {
    406                                 printDeclStats( transUnit );
    407                                 return EXIT_SUCCESS;
    408                         } // if
    409 
    410                         if ( bresolvep ) {
    411                                 dump( move( transUnit ) );
    412                                 return EXIT_SUCCESS;
    413                         } // if
    414 
    415                         if ( resolvprotop ) {
    416                                 dumpAsResolverProto( transUnit );
    417                                 return EXIT_SUCCESS;
    418                         } // if
    419 
    420                         PASS( "Resolve", ResolvExpr::resolve( transUnit ) );
    421                         if ( exprp ) {
    422                                 dump( move( transUnit ) );
    423                                 return EXIT_SUCCESS;
    424                         } // if
    425 
    426                         forceFillCodeLocations( transUnit );
    427 
    428                         PASS( "Fix Init", InitTweak::fix(transUnit, buildingLibrary()));
    429 
    430                         // fix ObjectDecl - replaces ConstructorInit nodes
    431                         if ( ctorinitp ) {
    432                                 dump( move( transUnit ) );
    433                                 return EXIT_SUCCESS;
    434                         } // if
    435 
    436                         // Currently not working due to unresolved issues with UniqueExpr
    437                         PASS( "Expand Unique Expr", Tuples::expandUniqueExpr( transUnit ) ); // xxx - is this the right place for this? want to expand ASAP so that subsequent passes don't need to worry about double-visiting a unique expr - needs to go after InitTweak::fix so that copy constructed return declarations are reused
    438 
    439                         PASS( "Translate Tries", ControlStruct::translateTries( transUnit ) );
    440                         PASS( "Gen Waitfor", Concurrency::generateWaitFor( transUnit ) );
    441 
    442                         // Needs to happen before tuple types are expanded.
    443                         PASS( "Convert Specializations",  GenPoly::convertSpecializations( transUnit ) );
    444 
    445                         PASS( "Expand Tuples", Tuples::expandTuples( transUnit ) );
    446 
    447                         if ( tuplep ) {
    448                                 dump( move( transUnit ) );
    449                                 return EXIT_SUCCESS;
    450                         } // if
    451 
    452                         // Must come after Translate Tries.
    453                         PASS( "Virtual Expand Casts", Virtual::expandCasts( transUnit ) );
    454 
    455                         translationUnit = convert( move( transUnit ) );
    456                 } else {
    457                         PASS( "Translate Exception Declarations", ControlStruct::translateExcept( translationUnit ) );
    458                         if ( exdeclp ) {
    459                                 dump( translationUnit );
    460                                 return EXIT_SUCCESS;
    461                         } // if
    462 
    463                         // add the assignment statement after the initialization of a type parameter
    464                         PASS( "Validate", SymTab::validate( translationUnit ) );
    465 
    466                         if ( symtabp ) {
    467                                 deleteAll( translationUnit );
    468                                 return EXIT_SUCCESS;
    469                         } // if
    470 
    471                         if ( expraltp ) {
    472                                 PassVisitor<ResolvExpr::AlternativePrinter> printer( cout );
    473                                 acceptAll( translationUnit, printer );
    474                                 return EXIT_SUCCESS;
    475                         } // if
    476 
    477                         if ( validp ) {
    478                                 dump( translationUnit );
    479                                 return EXIT_SUCCESS;
    480                         } // if
    481 
    482                         PASS( "Translate Throws", ControlStruct::translateThrows( translationUnit ) );
    483                         PASS( "Fix Labels", ControlStruct::fixLabels( translationUnit ) );
    484                         PASS( "Fix Names", CodeGen::fixNames( translationUnit ) );
    485                         PASS( "Gen Init", InitTweak::genInit( translationUnit ) );
    486                         PASS( "Expand Member Tuples" , Tuples::expandMemberTuples( translationUnit ) );
    487 
    488                         if ( libcfap ) {
    489                                 // Generate the bodies of cfa library functions.
    490                                 LibCfa::makeLibCfa( translationUnit );
    491                         } // if
    492 
    493                         if ( declstatsp ) {
    494                                 CodeTools::printDeclStats( translationUnit );
    495                                 deleteAll( translationUnit );
    496                                 return EXIT_SUCCESS;
    497                         } // if
    498 
    499                         if ( bresolvep ) {
    500                                 dump( translationUnit );
    501                                 return EXIT_SUCCESS;
    502                         } // if
    503 
    504                         CodeTools::fillLocations( translationUnit );
    505 
    506                         if ( resolvprotop ) {
    507                                 CodeTools::dumpAsResolvProto( translationUnit );
    508                                 return EXIT_SUCCESS;
    509                         } // if
    510 
    511                         PASS( "Resolve", ResolvExpr::resolve( translationUnit ) );
    512                         if ( exprp ) {
    513                                 dump( translationUnit );
    514                                 return EXIT_SUCCESS;
    515                         }
    516 
    517                         PASS( "Fix Init", InitTweak::fix( translationUnit, buildingLibrary() ) );
    518 
    519                         // fix ObjectDecl - replaces ConstructorInit nodes
    520                         if ( ctorinitp ) {
    521                                 dump ( translationUnit );
    522                                 return EXIT_SUCCESS;
    523                         } // if
    524 
    525                         PASS( "Expand Unique Expr", Tuples::expandUniqueExpr( translationUnit ) ); // xxx - is this the right place for this? want to expand ASAP so that subsequent passes don't need to worry about double-visiting a unique expr - needs to go after InitTweak::fix so that copy constructed return declarations are reused
    526                         PASS( "Translate Tries", ControlStruct::translateTries( translationUnit ) );
    527                         PASS( "Gen Waitfor", Concurrency::generateWaitFor( translationUnit ) );
    528                         PASS( "Convert Specializations",  GenPoly::convertSpecializations( translationUnit ) ); // needs to happen before tuple types are expanded
    529                         PASS( "Expand Tuples", Tuples::expandTuples( translationUnit ) ); // xxx - is this the right place for this?
    530 
    531                         if ( tuplep ) {
    532                                 dump( translationUnit );
    533                                 return EXIT_SUCCESS;
    534                         } // if
    535 
    536                         PASS( "Virtual Expand Casts", Virtual::expandCasts( translationUnit ) ); // Must come after translateEHM
     323                if (Stats::Counters::enabled) {
     324                        ast::pass_visitor_stats.avg = Stats::Counters::build<Stats::Counters::AverageCounter<double>>("Average Depth - New");
     325                        ast::pass_visitor_stats.max = Stats::Counters::build<Stats::Counters::MaxCounter<double>>("Max depth - New");
    537326                }
    538 
    539                 PASS( "Instantiate Generics", GenPoly::instantiateGeneric( translationUnit ) );
     327                auto transUnit = convert( std::move( translationUnit ) );
     328
     329                forceFillCodeLocations( transUnit );
     330
     331                PASS( "Translate Exception Declarations", ControlStruct::translateExcept( transUnit ) );
     332                if ( exdeclp ) {
     333                        dump( std::move( transUnit ) );
     334                        return EXIT_SUCCESS;
     335                }
     336
     337                PASS( "Verify Ctor, Dtor & Assign", Validate::verifyCtorDtorAssign( transUnit ) );
     338                PASS( "Hoist Type Decls", Validate::hoistTypeDecls( transUnit ) );
     339                // Hoist Type Decls pulls some declarations out of contexts where
     340                // locations are not tracked. Perhaps they should be, but for now
     341                // the full fill solves it.
     342                forceFillCodeLocations( transUnit );
     343
     344                PASS( "Replace Typedefs", Validate::replaceTypedef( transUnit ) );
     345                PASS( "Fix Return Types", Validate::fixReturnTypes( transUnit ) );
     346                PASS( "Enum and Pointer Decay", Validate::decayEnumsAndPointers( transUnit ) );
     347
     348                PASS( "Link Reference To Types", Validate::linkReferenceToTypes( transUnit ) );
     349
     350                PASS( "Fix Qualified Types", Validate::fixQualifiedTypes( transUnit ) );
     351                PASS( "Hoist Struct", Validate::hoistStruct( transUnit ) );
     352                PASS( "Eliminate Typedef", Validate::eliminateTypedef( transUnit ) );
     353                PASS( "Validate Generic Parameters", Validate::fillGenericParameters( transUnit ) );
     354                PASS( "Translate Dimensions", Validate::translateDimensionParameters( transUnit ) );
     355                PASS( "Check Function Returns", Validate::checkReturnStatements( transUnit ) );
     356                PASS( "Fix Return Statements", InitTweak::fixReturnStatements( transUnit ) );
     357                PASS( "Implement Concurrent Keywords", Concurrency::implementKeywords( transUnit ) );
     358                PASS( "Forall Pointer Decay", Validate::decayForallPointers( transUnit ) );
     359                PASS( "Hoist Control Declarations", ControlStruct::hoistControlDecls( transUnit ) );
     360
     361                PASS( "Generate Autogen Routines", Validate::autogenerateRoutines( transUnit ) );
     362
     363                PASS( "Implement Mutex", Concurrency::implementMutex( transUnit ) );
     364                PASS( "Implement Thread Start", Concurrency::implementThreadStarter( transUnit ) );
     365                PASS( "Compound Literal", Validate::handleCompoundLiterals( transUnit ) );
     366                PASS( "Set Length From Initializer", Validate::setLengthFromInitializer( transUnit ) );
     367                PASS( "Find Global Decls", Validate::findGlobalDecls( transUnit ) );
     368                PASS( "Fix Label Address", Validate::fixLabelAddresses( transUnit ) );
     369
     370                if ( symtabp ) {
     371                        return EXIT_SUCCESS;
     372                } // if
     373
     374                if ( expraltp ) {
     375                        ResolvExpr::printCandidates( transUnit );
     376                        return EXIT_SUCCESS;
     377                } // if
     378
     379                if ( validp ) {
     380                        dump( std::move( transUnit ) );
     381                        return EXIT_SUCCESS;
     382                } // if
     383
     384                PASS( "Translate Throws", ControlStruct::translateThrows( transUnit ) );
     385                PASS( "Fix Labels", ControlStruct::fixLabels( transUnit ) );
     386                PASS( "Fix Names", CodeGen::fixNames( transUnit ) );
     387                PASS( "Gen Init", InitTweak::genInit( transUnit ) );
     388                PASS( "Expand Member Tuples" , Tuples::expandMemberTuples( transUnit ) );
     389
     390                if ( libcfap ) {
     391                        // Generate the bodies of cfa library functions.
     392                        LibCfa::makeLibCfa( transUnit );
     393                } // if
     394
     395                if ( declstatsp ) {
     396                        printDeclStats( transUnit );
     397                        return EXIT_SUCCESS;
     398                } // if
     399
     400                if ( bresolvep ) {
     401                        dump( std::move( transUnit ) );
     402                        return EXIT_SUCCESS;
     403                } // if
     404
     405                if ( resolvprotop ) {
     406                        dumpAsResolverProto( transUnit );
     407                        return EXIT_SUCCESS;
     408                } // if
     409
     410                PASS( "Resolve", ResolvExpr::resolve( transUnit ) );
     411                if ( exprp ) {
     412                        dump( std::move( transUnit ) );
     413                        return EXIT_SUCCESS;
     414                } // if
     415
     416                forceFillCodeLocations( transUnit );
     417
     418                PASS( "Fix Init", InitTweak::fix(transUnit, buildingLibrary()));
     419
     420                // fix ObjectDecl - replaces ConstructorInit nodes
     421                if ( ctorinitp ) {
     422                        dump( std::move( transUnit ) );
     423                        return EXIT_SUCCESS;
     424                } // if
     425
     426                // Currently not working due to unresolved issues with UniqueExpr
 427                PASS( "Expand Unique Expr", Tuples::expandUniqueExpr( transUnit ) ); // xxx - is this the right place for this? want to expand ASAP so that subsequent passes don't need to worry about double-visiting a unique expr - needs to go after InitTweak::fix so that copy constructed return declarations are reused
     428
     429                PASS( "Translate Tries", ControlStruct::translateTries( transUnit ) );
     430                PASS( "Gen Waitfor", Concurrency::generateWaitFor( transUnit ) );
     431
     432                // Needs to happen before tuple types are expanded.
     433                PASS( "Convert Specializations",  GenPoly::convertSpecializations( transUnit ) );
     434
     435                PASS( "Expand Tuples", Tuples::expandTuples( transUnit ) );
     436
     437                if ( tuplep ) {
     438                        dump( std::move( transUnit ) );
     439                        return EXIT_SUCCESS;
     440                } // if
     441
     442                // Must come after Translate Tries.
     443                PASS( "Virtual Expand Casts", Virtual::expandCasts( transUnit ) );
     444
     445                PASS( "Instantiate Generics", GenPoly::instantiateGeneric( transUnit ) );
     446
     447                translationUnit = convert( std::move( transUnit ) );
     448
    540449                if ( genericsp ) {
    541450                        dump( translationUnit );
     
    620529
    621530
    622 static const char optstring[] = ":c:ghlLmNnpdOAP:S:twW:D:";
     531static const char optstring[] = ":c:ghlLmNnpdP:S:twW:D:";
    623532
    624533enum { PreludeDir = 128 };
     
    634543        { "prototypes", no_argument, nullptr, 'p' },
    635544        { "deterministic-out", no_argument, nullptr, 'd' },
    636         { "old-ast", no_argument, nullptr, 'O'},
    637         { "new-ast", no_argument, nullptr, 'A'},
    638545        { "print", required_argument, nullptr, 'P' },
    639546        { "prelude-dir", required_argument, nullptr, PreludeDir },
     
    657564        "do not generate prelude prototypes => prelude not printed", // -p
    658565        "only print deterministic output",                  // -d
    659         "Use the old-ast",                                                                      // -O
    660         "Use the new-ast",                                                                      // -A
    661566        "print",                                                                                        // -P
    662567        "<directory> prelude directory for debug/nodebug",      // no flag
     
    767672                        deterministic_output = true;
    768673                        break;
    769                   case 'O':                                     // select the old-ast
    770                         useNewAST = false;
    771                         break;
    772                   case 'A':                                     // select the new-ast
    773                         useNewAST = true;
    774                         break;
    775674                  case 'P':                                                                             // print options
    776675                        for ( int i = 0;; i += 1 ) {
     
    889788
    890789static void dump( ast::TranslationUnit && transUnit, ostream & out ) {
    891         std::list< Declaration * > translationUnit = convert( move( transUnit ) );
     790        std::list< Declaration * > translationUnit = convert( std::move( transUnit ) );
    892791        dump( translationUnit, out );
    893792}
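
With the old-AST branch deleted, main.cc now runs one unconditional PASS( name, call ) sequence. A minimal stand-in for that driver pattern, where TranslationUnit and runPass are hypothetical names and the real PASS macro presumably also hooks into the Stats machinery visible above:

    #include <functional>
    #include <iostream>
    #include <string>

    struct TranslationUnit {};  // hypothetical stand-in for the real AST root

    static void runPass( const std::string & name,
                         const std::function<void( TranslationUnit & )> & pass,
                         TranslationUnit & unit ) {
        std::cout << "pass: " << name << '\n';  // the real macro also times the call
        pass( unit );
    }

    int main() {
        TranslationUnit unit;
        runPass( "Fix Labels", []( TranslationUnit & ) { /* ... */ }, unit );
        runPass( "Resolve",    []( TranslationUnit & ) { /* ... */ }, unit );
    }
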
  • tests/.expect/declarationSpecifier.arm64.txt

    rebf8ca5 r23a08aa0  
    735735}
    736736static volatile const struct __anonymous15 _X3x36KVS13__anonymous15_1;
     737_Thread_local signed int _X3x37i_1;
     738__thread signed int _X3x38i_1;
    737739static inline volatile const signed int _X3f11Fi___1();
    738740static inline volatile const signed int _X3f12Fi___1();
  • tests/.expect/declarationSpecifier.x64.txt

    rebf8ca5 r23a08aa0  
    735735}
    736736static volatile const struct __anonymous15 _X3x36KVS13__anonymous15_1;
     737_Thread_local signed int _X3x37i_1;
     738__thread signed int _X3x38i_1;
    737739static inline volatile const signed int _X3f11Fi___1();
    738740static inline volatile const signed int _X3f12Fi___1();
  • tests/.expect/declarationSpecifier.x86.txt

    rebf8ca5 r23a08aa0  
    735735}
    736736static volatile const struct __anonymous15 _X3x36KVS13__anonymous15_1;
     737_Thread_local signed int _X3x37i_1;
     738__thread signed int _X3x38i_1;
    737739static inline volatile const signed int _X3f11Fi___1();
    738740static inline volatile const signed int _X3f12Fi___1();
  • tests/Makefile.am

    rebf8ca5 r23a08aa0  
    5454
    5555# adjust CC to current flags
    56 CC = LC_ALL=C $(if $(DISTCC_CFA_PATH),distcc $(DISTCC_CFA_PATH) ${ARCH_FLAGS} ${AST_FLAGS},$(TARGET_CFA) ${DEBUG_FLAGS} ${ARCH_FLAGS} ${AST_FLAGS})
     56CC = LC_ALL=C $(if $(DISTCC_CFA_PATH),distcc $(DISTCC_CFA_PATH) ${ARCH_FLAGS} ,$(TARGET_CFA) ${DEBUG_FLAGS} ${ARCH_FLAGS})
    5757CFACC = $(CC)
    5858
     
    6161
    6262# adjusted CC but without the actual distcc call
    63 CFACCLOCAL = $(if $(DISTCC_CFA_PATH),$(DISTCC_CFA_PATH) ${ARCH_FLAGS} ${AST_FLAGS},$(TARGET_CFA) ${DEBUG_FLAGS} ${ARCH_FLAGS} ${AST_FLAGS})
     63CFACCLOCAL = $(if $(DISTCC_CFA_PATH),$(DISTCC_CFA_PATH) ${ARCH_FLAGS} ,$(TARGET_CFA) ${DEBUG_FLAGS} ${ARCH_FLAGS})
    6464CFACCLINK = $(CFACCLOCAL) -quiet $(if $(test), 2> $(test), ) $($(shell echo "${@}_FLAGSLD" | sed 's/-\|\//_/g'))
    6565
  • tests/concurrent/clib.c

    rebf8ca5 r23a08aa0  
    88}
    99
    10 thread_local struct drand48_data buffer = { 0 };
     10_Thread_local struct drand48_data buffer = { 0 };
    1111int myrand() {
    1212        long int result;
  • tests/concurrent/clib_tls.c

    rebf8ca5 r23a08aa0  
    1414
    1515
    16 thread_local int checkval = 0xBAADF00D;
     16__thread int checkval = 0xBAADF00D;
    1717
    1818void init(void * ) {
  • tests/concurrent/park/contention.cfa

    rebf8ca5 r23a08aa0  
    22#include <thread.hfa>
    33
    4 thread_local drand48_data buffer = { 0 };
     4__thread drand48_data buffer = { 0 };
    55int myrand() {
    66        long int result;
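
The three test hunks above replace thread_local with __thread or _Thread_local. For reference: __thread is the GNU extension accepted in both C and C++, _Thread_local is the C11 keyword, and thread_local is a keyword only in C++11 and later (C11 provides it merely as a macro in <threads.h>), so spelling the storage class out keeps the C tests independent of that header. A tiny sketch of the two surviving spellings (GCC or Clang assumed):

    __thread int gnu_counter = 0;      // GNU spelling, C and C++
    thread_local int cxx_counter = 0;  // C++11 keyword; _Thread_local in C11

    int main() {
        ++gnu_counter;
        ++cxx_counter;
        return ( gnu_counter == 1 && cxx_counter == 1 ) ? 0 : 1;
    }
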
  • tests/config.py.in

    rebf8ca5 r23a08aa0  
    99HOSTARCH = "@host_cpu@"
    1010DISTRIBUTE = @HAS_DISTCC@
    11 NEWAST = @DEFAULT_NEW_AST@
  • tests/declarationSpecifier.cfa

    rebf8ca5 r23a08aa0  
    1 // 
     1//
    22// Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
    33//
    44// The contents of this file are covered under the licence agreement in the
    55// file "LICENCE" distributed with Cforall.
    6 // 
    7 // declarationSpecifier.cfa -- 
    8 // 
     6//
     7// declarationSpecifier.cfa --
     8//
    99// Author           : Peter A. Buhr
    1010// Created On       : Wed Aug 17 08:21:04 2016
     
    1212// Last Modified On : Tue Apr 30 18:20:36 2019
    1313// Update Count     : 4
    14 // 
     14//
    1515
    1616typedef short int Int;
     
    5151struct { Int i; } const static volatile x35;
    5252struct { Int i; } const volatile static x36;
     53
     54_Thread_local int x37;
     55__thread int x38;
    5356
    5457static inline const volatile int f11();
  • tests/io/comp_fair.cfa

    rebf8ca5 r23a08aa0  
    2727
    2828struct {
    29       barrier & bar;
    30       int pipe[2];
     29        barrier & bar;
     30        int pipe[2];
    3131
    3232} globals;
     
    6565thread Reader {};
    6666void main(Reader & this) {
    67       bool do_read = has_user_level_blocking( (fptr_t)async_read );
     67        char thrash[1];
     68        bool do_read = has_user_level_blocking( (fptr_t)async_read );
    6869
    69       for(TIMES) {
    70             io_future_t f;
    71             if ( do_read ) {
    72                   char thrash[1];
    73                   async_read(f, globals.pipe[0], thrash, 1, 0);
    74             } else {
    75                   fulfil(f, 0); // If we don't have user-level blocking just play along
    76             }
     70        for(TIMES) {
     71                io_future_t f;
     72                if ( do_read ) {
     73                        async_read(f, globals.pipe[0], thrash, 1, 0);
     74                } else {
     75                        fulfil(f, 0); // If we don't have user-level blocking just play along
     76                }
    7777
    78             block( globals.bar );
     78                block( globals.bar );
    7979
    8080                yield( prng( this, 15 ) );
    8181
    82             unsigned i = __atomic_add_fetch( &counter, 1, __ATOMIC_SEQ_CST );
     82                unsigned i = __atomic_add_fetch( &counter, 1, __ATOMIC_SEQ_CST );
    8383                if(0 == (i % 100)) sout | i;
    8484
    85             wait( f );
     85                wait( f );
    8686
    87             if(f.result < 0)
    88                   abort | "Read error" | -f.result | ":" | strerror(-f.result);
     87                if(f.result < 0)
     88                        abort | "Read error" | -f.result | ":" | strerror(-f.result);
    8989
    90             block( globals.bar );
    91       }
     90                block( globals.bar );
     91        }
    9292}
    9393
     
    9797thread Writer {};
    9898void main(Writer & this) {
    99       for(TIMES) {
    100             block( globals.bar );
     99        for(TIMES) {
     100                block( globals.bar );
    101101
    102             sleep( 1`us );
     102                sleep( 1`us );
    103103
    104             char buf[1] = { '+' };
    105             int ret = write( globals.pipe[1], buf, 1 );
    106             if(ret < 0)
    107                   abort | "Write error" | errno | ":" | strerror(errno);
     104                char buf[1] = { '+' };
     105                int ret = write( globals.pipe[1], buf, 1 );
     106                if(ret < 0)
     107                        abort | "Write error" | errno | ":" | strerror(errno);
    108108
    109             block( globals.bar );
    110       }
     109                block( globals.bar );
     110        }
    111111}
    112112
     
    122122
    123123int main() {
    124       barrier bar = { 2 };
    125       &globals.bar = &bar;
    126       int ret = pipe(globals.pipe);
    127       if(ret != 0)
    128             abort | "Pipe error" | errno | ":" | strerror(errno);
     124        barrier bar = { 2 };
     125        &globals.bar = &bar;
     126        int ret = pipe(globals.pipe);
     127        if(ret != 0)
     128                abort | "Pipe error" | errno | ":" | strerror(errno);
    129129
    130130        processor p;
     
    134134                Spinner s;
    135135                Reader ior;
    136             Writer iow;
     136                Writer iow;
    137137        }
    138138        sout | "done";
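
Besides converting the indentation to tabs, the comp_fair.cfa hunk hoists char thrash[1] out of the if block. That matters because the asynchronous read is only observed to complete at wait( f ), after the inner block would have ended, so the buffer must outlive the block. The same lifetime rule in plain C++, with std::async standing in for the io_future_t API:

    #include <future>

    int main() {
        char thrash[1];  // correct: outlives the wait below
        auto f = std::async( std::launch::async, [&] { thrash[0] = '+'; } );
        // Declaring thrash inside a nested block instead would hand the writer
        // a dangling buffer once that block ended, before f.wait() ran.
        f.wait();
        return thrash[0] == '+' ? 0 : 1;
    }
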
  • tests/meta/.expect/arch.arm64.txt

    rebf8ca5 r23a08aa0  
    1 meta/archVast.cfa:28:1 error: Cannot choose between 3 alternatives for expression
     1meta/arch.cfa:28:1 error: Cannot choose between 3 alternatives for expression
    22Explicit Cast of:
    33  Name: FA64
  • tests/meta/.expect/arch.x64.txt

    rebf8ca5 r23a08aa0  
    1 meta/archVast.cfa:28:1 error: Cannot choose between 3 alternatives for expression
     1meta/arch.cfa:28:1 error: Cannot choose between 3 alternatives for expression
    22Explicit Cast of:
    33  Name: FX64
  • tests/meta/.expect/arch.x86.txt

    rebf8ca5 r23a08aa0  
    1 meta/archVast.cfa:28:1 error: Cannot choose between 3 alternatives for expression
     1meta/arch.cfa:28:1 error: Cannot choose between 3 alternatives for expression
    22Explicit Cast of:
    33  Name: FX86
  • tests/meta/arch.cfa

    rebf8ca5 r23a08aa0  
    55// file "LICENCE" distributed with Cforall.
    66//
    7 // archVast.cfa -- Check if all combinations are of ast/arch are properly distinguished
     7// arch.cfa -- Check if all architectures are properly distinguished by the test suite
    88//
    99// Author           : Thierry Delisle
  • tests/pybin/settings.py

    rebf8ca5 r23a08aa0  
    9797                self.path   = "debug" if value else "nodebug"
    9898
    99 class AST:
    100         def __init__(self, ast):
    101                 if ast == "new":
    102                         self.target = ast
    103                         self.string = "New AST"
    104                         self.flags  = """AST_FLAGS=-XCFA,--new-ast"""
    105                 elif ast == "old":
    106                         self.target = ast
    107                         self.string = "Old AST"
    108                         self.flags  = """AST_FLAGS=-XCFA,--old-ast"""
    109                 elif ast == None:
    110                         self.target = "new" if config.NEWAST else "old"
    111                         self.string = "Default AST (%s)" % self.target
    112                         self.flags  = """AST_FLAGS="""
    113                 else:
    114                         print("""ERROR: Invalid ast configuration, must be "old", "new" or left unspecified, was %s""" % (value), file=sys.stderr)
    115                         sys.exit(1)
    116 
    117         def filter(self, tests):
    118 
    119                 return [test for test in tests if not test.astv or self.target == test.astv]
    120 
    12199class Install:
    122100        def __init__(self, value):
     
    141119
    142120def init( options ):
    143         global all_ast
    144121        global all_arch
    145122        global all_debug
    146123        global all_install
    147         global ast
    148124        global arch
    149125        global debug
     
    160136        global timeout2gdb
    161137
    162         all_ast      = [AST(o)          for o in list(dict.fromkeys(options.ast    ))] if options.ast  else [AST(None)]
    163138        all_arch     = [Architecture(o) for o in list(dict.fromkeys(options.arch   ))] if options.arch else [Architecture(None)]
    164139        all_debug    = [Debug(o)        for o in list(dict.fromkeys(options.debug  ))]
  • tests/pybin/test_run.py

    rebf8ca5 r23a08aa0  
    1111                self.path = ''
    1212                self.arch = ''
    13                 self.astv = ''
    1413
    1514        def toString(self):
    16                 return "{:25s} ({:5s} arch, {:s} ast: {:s})".format( self.name, self.arch if self.arch else "Any", self.astv if self.astv else "Any", self.target() )
     15                return "{:25s} ({:5s} arch: {:s})".format( self.name, self.arch if self.arch else "Any", self.target() )
    1716
    1817        def prepare(self):
     
    2221        def expect(self):
    2322                arch = '' if not self.arch else ".%s" % self.arch
    24                 astv = '' if not self.astv else ".nast" if self.astv == "new" else ".oast"
    25                 return os.path.normpath( os.path.join(settings.SRCDIR  , self.path, ".expect", "%s%s%s.txt" % (self.name,astv,arch)) )
     23                return os.path.normpath( os.path.join(settings.SRCDIR  , self.path, ".expect", "%s%s.txt" % (self.name,arch)) )
    2624
    2725        def error_log(self):
     
    5856
    5957        @staticmethod
    60         def new_target(target, arch, astv):
     58        def new_target(target, arch):
    6159                test = Test()
    6260                test.name = os.path.basename(target)
    6361                test.path = os.path.relpath (os.path.dirname(target), settings.SRCDIR)
    6462                test.arch = arch.target if arch else ''
    65                 test.astv = astv.target if astv else ''
    6663                return test
    6764
  • tests/pybin/tools.py

    rebf8ca5 r23a08aa0  
    182182                '-s' if silent else None,
    183183                test_param,
    184                 settings.ast.flags,
    185184                settings.arch.flags,
    186185                settings.debug.flags,
  • tests/quotedKeyword.cfa

    rebf8ca5 r23a08aa0  
    3131        ``__int128, ``__label__, ``long, ``lvalue, ``_Noreturn, ``__builtin_offsetof, ``otype, ``register, ``restrict,
    3232        ``__restrict, ``__restrict__, ``return, ``short, ``signed, ``__signed, ``__signed__, ``sizeof, ``static,
    33         ``_Static_assert, ``struct, ``switch, ``_Thread_local, ``throw, ``throwResume, ``trait, ``try, ``typedef,
     33        ``_Static_assert, ``struct, ``switch, ``_thread, ``_Thread_local, ``throw, ``throwResume, ``trait, ``try, ``typedef,
    3434        ``typeof, ``__typeof, ``__typeof__, ``union, ``unsigned, ``__builtin_va_list, ``void, ``volatile, ``__volatile,
    3535        ``__volatile__, ``while;
  • tests/test.py

    rebf8ca5 r23a08aa0  
    2323
    2424        def match_test(path):
    25                 match = re.search("^%s\/([\w\/\-_]*).expect\/([\w\-_]+)(\.nast|\.oast)?(\.[\w\-_]+)?\.txt$" % settings.SRCDIR, path)
     25                match = re.search("^%s\/([\w\/\-_]*).expect\/([\w\-_]+)(\.[\w\-_]+)?\.txt$" % settings.SRCDIR, path)
    2626                if match :
    2727                        test = Test()
    2828                        test.name = match.group(2)
    2929                        test.path = match.group(1)
    30                         test.arch = match.group(4)[1:] if match.group(4) else None
    31 
    32                         astv = match.group(3)[1:] if match.group(3) else None
    33                         if astv == 'oast':
    34                                 test.astv = 'old'
    35                         elif astv == 'nast':
    36                                 test.astv = 'new'
    37                         elif astv:
    38                                 print('ERROR: "%s", expect file has astv but it is not "nast" or "oast"' % testname, file=sys.stderr)
    39                                 sys.exit(1)
     30                        test.arch = match.group(3)[1:] if match.group(3) else None
    4031
    4132                        expected.append(test)
     
    8172                                # this is a valid name, let's check if it already exists
    8273                                found = [test for test in all_tests if canonical_path( test.target() ) == testname]
    83                                 setup = itertools.product(settings.all_arch if options.arch else [None], settings.all_ast if options.ast else [None])
     74                                setup = itertools.product(settings.all_arch if options.arch else [None])
    8475                                if not found:
    85                                         # it's a new name, create it according to the name and specified architecture/ast version
    86                                         tests.extend( [Test.new_target(testname, arch, ast) for arch, ast in setup] )
     76                                        # it's a new name, create it according to the name and specified architecture
     77                                        tests.extend( [Test.new_target(testname, arch) for arch in setup] )
    8778                                elif len(found) == 1 and not found[0].arch:
    8879                                        # we found a single test, the user better be wanting to create a cross platform test
    8980                                        if options.arch:
    9081                                                print('ERROR: "%s", test has no specified architecture but --arch was specified, ignoring it' % testname, file=sys.stderr)
    91                                         elif options.ast:
    92                                                 print('ERROR: "%s", test has no specified ast version but --ast was specified, ignoring it' % testname, file=sys.stderr)
    9382                                        else:
    9483                                                tests.append( found[0] )
    9584                                else:
    9685                                        # this test is already cross platform, just add a test for each platform the user asked
    97                                         tests.extend( [Test.new_target(testname, arch, ast) for arch, ast in setup] )
     86                                        tests.extend( [Test.new_target(testname, arch) for arch in setup] )
    9887
    9988                                        # print a warning if it users didn't ask for a specific architecture
     
    10291                                                print('WARNING: "%s", test has architecture specific expected files but --arch was not specified, regenerating only for current host' % testname, file=sys.stderr)
    10392
    104 
    105                                         # print a warning if the user didn't ask for a specific ast version
    106                                         found_astv = [f.astv for f in found if f.astv]
    107                                         if found_astv and not options.ast:
    108                                                 print('WARNING: "%s", test has ast version specific expected files but --ast was not specified, regenerating only for current ast' % testname, file=sys.stderr)
    109 
    11093                        else :
    11194                                print('ERROR: "%s", tests are not allowed to end with a C/C++/CFA extension, ignoring it' % testname, file=sys.stderr)
     
    127110        # create a parser with the arguments for the tests script
    128111        parser = argparse.ArgumentParser(description='Script which runs cforall tests')
    129         parser.add_argument('--ast', help='Test for specific ast', type=comma_separated(str), default=None)
    130112        parser.add_argument('--arch', help='Test for specific architecture', type=comma_separated(str), default=None)
    131113        parser.add_argument('--debug', help='Run all tests in debug or release', type=comma_separated(yes_no), default='yes')
     
    351333
    352334                # print the possible options
    353                 print("-h --help --debug --dry-run --list --ast=new --ast=old --arch --all --regenerate-expected --archive-errors --install --timeout --global-timeout --timeout-with-gdb -j --jobs -I --include -E --exclude --continue ", end='')
     335                print("-h --help --debug --dry-run --list --arch --all --regenerate-expected --archive-errors --install --timeout --global-timeout --timeout-with-gdb -j --jobs -I --include -E --exclude --continue ", end='')
    354336                print(" ".join(map(lambda t: "%s" % (t.target()), tests)))
    355337
     
    422404        # for each build configurations, run the test
    423405        with Timed() as total_dur:
    424                 for ast, arch, debug, install in itertools.product(settings.all_ast, settings.all_arch, settings.all_debug, settings.all_install):
    425                         settings.ast     = ast
     406                for arch, debug, install in itertools.product(settings.all_arch, settings.all_debug, settings.all_install):
    426407                        settings.arch    = arch
    427408                        settings.debug   = debug
     
    430411                        # filter out the tests for a different architecture
    431412                        # tests are the same across debug/install
    432                         local_tests = settings.ast.filter( tests )
    433                         local_tests = settings.arch.filter( local_tests )
     413                        local_tests = settings.arch.filter( tests )
    434414
    435415                        # check the build configuration works
     
    438418
    439419                        # print configuration
    440                         print('%s %i tests on %i cores (%s:%s - %s)' % (
     420                        print('%s %i tests on %i cores (%s - %s)' % (
    441421                                'Regenerating' if settings.generating else 'Running',
    442422                                len(local_tests),
    443423                                jobs,
    444                                 settings.ast.string,
    445424                                settings.arch.string,
    446425                                settings.debug.string