Changeset 23a08aa0
- Timestamp: Sep 19, 2022, 8:11:02 PM (3 years ago)
- Branches: ADT, ast-experimental, master, pthread-emulation
- Children: aa9f215
- Parents: ebf8ca5 (diff), ae1d151 (diff)

Note: this is a merge changeset; the changes displayed below correspond to the merge itself. Use the (diff) links above to see all the changes relative to each parent.

- Files:
  - 3 added
  - 47 deleted
  - 95 edited
  - 36 moved

Jenkins/FullBuild
      parallel (
-         // gcc_08_x86_new: { trigger_build( 'gcc-8', 'x86' ) },
-         // gcc_07_x86_new: { trigger_build( 'gcc-7', 'x86' ) },
-         // gcc_06_x86_new: { trigger_build( 'gcc-6', 'x86' ) },
+         gcc_08_x86_new: { trigger_build( 'gcc-10', 'x86' ) },
+         gcc_07_x86_new: { trigger_build( 'gcc-9', 'x86' ) },
          gcc_10_x64_new: { trigger_build( 'gcc-10', 'x64' ) },
          gcc_09_x64_new: { trigger_build( 'gcc-9', 'x64' ) },

Jenkinsfile
      sh 'ulimit -a'

-     Tools.BuildStage('Test: short', !Settings.RunAllTests) {
+     jopt = '-j $(nproc)'
+
+     Tools.BuildStage('Test: Debug', true) {
          dir (BuildDir) {
              //Run the tests from the tests directory
-             sh "make --no-print-directory -C tests archiveerrors=${BuildDir}/tests/crashes/short"
+             sh """make ${jopt} --no-print-directory -C tests timeouts="--timeout=600 --global-timeout=14400" all-tests debug=yes archiveerrors=${BuildDir}/tests/crashes/full-debug"""
          }
      }

-     Tools.BuildStage('Test: full', Settings.RunAllTests) {
+     Tools.BuildStage('Test: Release', Settings.RunAllTests) {
          dir (BuildDir) {
-             jopt = '-j $(nproc)'
-             if( Settings.Architecture.node == 'x86' ) {
-                 jopt = '-j2'
-             }
-             //Run the tests from the tests directory
-             sh """make ${jopt} --no-print-directory -C tests timeouts="--timeout=600 --global-timeout=14400" all-tests debug=yes archiveerrors=${BuildDir}/tests/crashes/full-debug"""
-             sh """make ${jopt} --no-print-directory -C tests timeouts="--timeout=600 --global-timeout=14400" all-tests debug=no archiveerrors=${BuildDir}/tests/crashes/full-nodebug"""
+             //Run the tests from the tests directory
+             sh """make ${jopt} --no-print-directory -C tests timeouts="--timeout=600 --global-timeout=14400" all-tests debug=no archiveerrors=${BuildDir}/tests/crashes/full-nodebug"""
          }
      }
…
          ], \
          [$class: 'BooleanParameterDefinition', \
-             description: 'If false, only the quick test suite is ran', \
+             description: 'If false, the test suite is only ran in debug', \
              name: 'RunAllTests', \
              defaultValue: false, \

benchmark/basic/tls_fetch_add.c
  // thread_local Boolean. This means the entire protocol is just to "mov" instructions making it extremely cheap.

- #define thread_local _Thread_local
-
- thread_local volatile bool value;
+ __thread volatile bool value;

  void __attribute__((noinline)) do_call() {

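For reference, the two spellings are equivalent ways of declaring thread-local storage with GCC/Clang: _Thread_local is the C11 keyword the old macro wrapped, and __thread is the GNU extension the benchmark now uses directly. The following is a minimal standalone C sketch, not part of the changeset; the main function and the body of do_call are assumptions for illustration only.

    #include <stdbool.h>
    #include <stdio.h>

    // C11 spelling (what the removed macro expanded to):
    //     _Thread_local volatile bool value;
    // GNU extension spelling (what the changeset switches to):
    __thread volatile bool value;

    // A store to thread-local storage compiles to a plain mov on x86-64,
    // which is the point the benchmark comment above is making.
    void __attribute__((noinline)) do_call() {
        value = true;
    }

    int main() {
        do_call();
        printf("value = %d\n", value);
        return 0;
    }
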
benchmark/io/http/worker.cfa
      if( options.log ) mutex(sout) sout | "=== Accepting connection ===";
      int fd = cfa_accept4( this.sockfd, this.[addr, addrlen, flags], CFA_IO_LAZY );
-     if(fd < 0) {
+     if(fd <= 0) {
          if( errno == ECONNABORTED ) break;
          if( this.done && (errno == EINVAL || errno == EBADF) ) break;
-         abort( "accept error : (%d) %s\n", (int)errno, strerror(errno) );
+         abort( "accept error %d: (%d) %s\n", fd, (int)errno, strerror(errno) );
      }
      if(this.done) break;

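For comparison, the conventional POSIX accept loop uses the same retry-on-ECONNABORTED structure. The sketch below is illustrative only: it uses plain accept(2), which returns -1 on error, rather than the cfa_accept4 wrapper above whose zero return motivates the tightened fd <= 0 check; the accept_loop name and the done flag are assumptions.

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>

    // Illustrative sketch of a classic blocking accept loop.
    static int accept_loop(int sockfd, volatile int * done) {
        while (!*done) {
            int fd = accept(sockfd, NULL, NULL);
            if (fd < 0) {                                   // POSIX accept signals errors with -1
                if (errno == ECONNABORTED) continue;        // aborted connection: just retry
                if (*done && (errno == EINVAL || errno == EBADF)) break; // socket closed on shutdown
                fprintf(stderr, "accept error: (%d) %s\n", errno, strerror(errno));
                return -1;
            }
            // ... hand fd off to a worker here ...
        }
        return 0;
    }
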
configure.ac
  #Trasforming cc1 will break compilation
  M4CFA_PROGRAM_NAME
-
- #==============================================================================
- # New AST toggling support
- AH_TEMPLATE([CFA_USE_NEW_AST],[Sets whether or not to use the new-ast, this is a default value and can be overrided by --old-ast and --new-ast])
- DEFAULT_NEW_AST="True"
- AC_ARG_ENABLE(new-ast,
-     [  --enable-new-ast   whether or not to use new ast as the default AST algorithm],
-     [case "${enableval}" in
-         yes) newast=true ; DEFAULT_NEW_AST="True" ;;
-         no)  newast=false; DEFAULT_NEW_AST="False" ;;
-         *) AC_MSG_ERROR([bad value ${enableval} for --enable-new-ast]) ;;
-     esac],[newast=true])
- AC_DEFINE_UNQUOTED([CFA_USE_NEW_AST], $newast)
- AC_SUBST(DEFAULT_NEW_AST)

  #==============================================================================
…
  \'--enable-gprofiler=*) ;;
  \'--disable-gprofiler) ;;
-
- # skip the target hosts
- \'--enable-new-ast=*) ;;
- \'--disable-new-ast) ;;

  # skip this, it only causes problems

doc/LaTeXmacros/lstlang.sty
  inline, __inline, __inline__, __int128, int128, __label__, monitor, mutex, _Noreturn, one_t, or,
  otype, restrict, __restrict, __restrict__, recover, report, __signed, __signed__, _Static_assert, suspend,
- thread, _Thread_local, throw, throwResume, timeout, trait, try, ttype, typeof, __typeof, __typeof__,
+ thread, __thread, _Thread_local, throw, throwResume, timeout, trait, try, ttype, typeof, __typeof, __typeof__,
  virtual, __volatile, __volatile__, waitfor, when, with, zero_t,
  },

doc/bibliography/pl.bib
      series = {Innovative Technology},
      year = 1991,
+ }
+
+ @mastersthesis{Zulfiqar22,
+     keywords    = {Cforall, memory allocation, threading},
+     contributer = {pabuhr@plg},
+     author      = {Mubeen Zulfiqar},
+     title       = {High-Performance Concurrent Memory Allocation},
+     school      = {School of Computer Science, University of Waterloo},
+     year        = 2022,
+     address     = {Waterloo, Ontario, Canada, N2L 3G1},
+     note        = {\href{https://uwspace.uwaterloo.ca/handle/10012/18329}{https://\-uwspace.uwaterloo.ca/\-handle/\-10012/18329}},
  }

doc/proposals/iterators.md
  returns a range object, which can be used as any other type.

+ It might not cover every single case with the same syntax (the `@` syntax may
+ not translate to operators very well), but should be able to maintain every
+ option with some library range.
+
  Library Enhancements
  --------------------
…
  ------------
  Python has a robust iterator tool set. It also has a `range` built-in which
- does many of the same things as the special for loops.
+ does many of the same things as the special for loops (the finite and
+ half-open ranges).
+
+ In addition, it has many dedicated iterator constructors and transformers,
+ and many containers can both produce and be constructed from iterators.

  + https://docs.python.org/3/reference/datamodel.html#object.__iter__
  + https://docs.python.org/3/library/functions.html#func-range

- C++ has many iterator tools at well, except for the fact it's `iterators` are
+ C++ has many iterator tools at well, except for the fact it's "iterators" are
  not what are usually called iterators (as above) but rather an abstraction of
- pointers.
+ pointers. The notable missing feature is that a single iterator has no
+ concept of being empty or not, instead it must be compared to the end
+ iterator.
+
+ However, C++ ranges have an interface much more similar to iterators.
+ They do appear to be a wrapper around the "pointer" iterators.
+
+ + https://en.cppreference.com/w/cpp/ranges

  Rust also has a imperative implementation of a functional style of iterators,

doc/theses/thierry_delisle_PhD/.gitignore
  thesis/fig/*.fig.bak
  thesis/thesis.pdf
+ thesis/thesis.tty
  thesis/thesis.ps

doc/theses/thierry_delisle_PhD/thesis/Makefile
  LaTeX = TEXINPUTS=${TeXLIB} && export TEXINPUTS && latex -halt-on-error -output-directory=${Build}
  BibTeX = BIBINPUTS=${TeXLIB} && export BIBINPUTS && bibtex
+ DeTeX = TEXINPUTS=${TeXLIB} && export TEXINPUTS && detex -r

  MAKEFLAGS = --no-print-directory # --silent
…
  	${LaTeX} $<

+ %.tty: build/%.dvi
+ 	dvi2tty -w132 $< > $@
+
  ## Define the default recipes.

…
  churn_jax_ops_FLAGS = --MaxY=50000000
  churn_low_jax_ops_FLAGS = --MaxY=50000000
- churn_jax_ns_FLAGS = --MaxY=20000
- churn_low_jax_ns_FLAGS = --MaxY=20000
+ churn_jax_ns_FLAGS = --MaxY=10000
+ churn_low_jax_ns_FLAGS = --MaxY=10000

  churn_nasus_ops_FLAGS = --MaxY=75000000
  churn_low_nasus_ops_FLAGS = --MaxY=75000000
- churn_nasus_ns_FLAGS = --MaxY=20000
- churn_low_nasus_ns_FLAGS = --MaxY=20000
+ churn_nasus_ns_FLAGS = --MaxY=5000
+ churn_low_nasus_ns_FLAGS = --MaxY=5000
+
+ locality_share_jax_ops_FLAGS = --MaxY=40000000
+ locality_noshare_jax_ops_FLAGS = --MaxY=40000000
+ locality_share_jax_ns_FLAGS = --MaxY=10000
+ locality_noshare_jax_ns_FLAGS = --MaxY=10000
+
+ locality_share_nasus_ops_FLAGS = --MaxY=60000000
+ locality_noshare_nasus_ops_FLAGS = --MaxY=60000000
+ locality_share_nasus_ns_FLAGS = --MaxY=10000
+ locality_noshare_nasus_ns_FLAGS = --MaxY=10000

  build/result.%.ns.svg : data/% Makefile ../../../../benchmark/plot.py | ${Build}

doc/theses/thierry_delisle_PhD/thesis/glossary.tex
  % Definitions

- \longnewglossaryentry{thrd}
- {name={thread}}
+ \longnewglossaryentry{at}
+ {name={Thread},text={thread}}
  {
- Threads created and managed inside user-space. Each thread has its own stack and its own thread of execution. User-level threads are invisible to the underlying operating system.
+ A thread is an independent sequential execution path through a program. Each thread is scheduled for execution separately and independently from other threads. Systems offer one or more concrete implementations of this concept, \eg \gls{kthrd}, \gls{job}, task. However, most of the concepts of scheduling are independent of the particular implementations of the thread representation. For this reason, this document uses the term \gls{at} to mean any of these representation that meets the general definition.

- \textit{Synonyms : User threads, Lightweight threads, Green threads, Virtual threads, Tasks.}
+ \textit{Synonyms : Tasks, Jobs, Blocks.}
  }

  \longnewglossaryentry{proc}
- {name={processor}}
+ {name={Processor},text={processor}}
  {
+ Entity that executes a \gls{at}, \ie the resource being scheduled by the scheduler. In kernel-level threading, \ats are kernel threads and \procs are the \glspl{hthrd} on which the kernel threads are scheduled. In user-level threading and thread pools, \procs are kernel threads.

+ \textit{Synonyms : Server, Worker.}
  }

  \longnewglossaryentry{rQ}
- {name={ready-queue}}
+ {name={Ready Queue}, text={ready-queue}}
  {
  Data structure holding \ats that are ready to \glslink{atrun}{run}. Often a \glsxtrshort{fifo} queue for fairness, but can take many different forms, \eg binary tree and priority queue are also common.
  }

  \longnewglossaryentry{uthrding}
- {name={user-level threading}}
+ {name={User-Level Threading},text={user-level threading}}
  {
  Threading model where a scheduler runs in users space and maps threads managed and created inside the user-space onto \glspl{kthrd}.

  \textit{Synonyms : User threads, Lightweight threads, Green threads, Virtual threads, Tasks.}
…
  \longnewglossaryentry{rmr}
- {name={remote memory reference}}
+ {name={Remote Memory Reference},text={remote memory reference}}
  {
  A memory reference to an address not in the current \gls{hthrd}'s cache is a remote reference. Memory references that \emph{are} in the current \gls{hthrd}'s cache is a \newterm{local} memory reference. For example, a cache line that must be updated from the any cache on another socket, or from RAM in a \glsxtrshort{numa} context.
  }
…
  \longnewglossaryentry{hthrd}
- {name={hardware thread}}
+ {name={Hardware Threading},text={hardware thread}}
  {
- Threads representing the underlying hardware directly, \eg the CPU core, or hyper-thread if the hardware supports multiple threads of execution per core. The number of hardware threads is considered to be always fixed to a specific number determined by the hardware.
+ Threads representing the underlying hardware, \eg a CPU core or hyper-thread, if the hardware supports multiple threads of execution per core. The number of hardware threads present is fixed on any given computer.

- \textit{Synonyms : }
+ \textit{Synonyms : Core, Hyper-Thread, Processing Unit, CPU.}
  }

  \longnewglossaryentry{kthrd}
- {name={kernel-level thread}}
+ {name={Kernel-Level Thread},text={kernel-level thread}}
  {
- Threads created and managed inside kernel-space. Each thread has its own stack and its own thread of execution. Kernel-level threads are owned, managed and scheduled by the underlying operating system.
+ Threads created and managed inside kernel space. Each kernel thread has its own stack and its own thread of execution. Kernel-level threads are owned, managed and scheduled by the underlying operating system.

  \textit{Synonyms : OS threads, Hardware threads, Physical threads.}
…
  \longnewglossaryentry{fiber}
- {name={fiber}}
+ {name={Fiber},text={fiber}}
  {
- Fibers are non-preemptive user-level threads. They share most of the caracteristics of user-level threads except that they cannot be preempted by another fiber.
+ Fibers are non-preemptive user-level threads. They share most of the characteristics of user-level threads except that they cannot be preempted by another fiber.

  \textit{Synonyms : Tasks.}
…
  \longnewglossaryentry{job}
- {name={job}}
+ {name={Job},text={job}}
  {
  Unit of work, often sent to a thread pool or worker pool to be executed. Has neither its own stack nor its own thread of execution.
…
  \longnewglossaryentry{pool}
- {name={thread-pool}}
+ {name={Thread Pool},text={thread-pool}}
  {
- Group of homogeneuous threads that loop executing units of works after another.
+ Group of homogeneous threads that loop executing units of works. Often executing \glspl{jobs}.

- \textit{Synonyms : }
+ \textit{Synonyms : Executor.}
  }

  \longnewglossaryentry{preemption}
- {name={preemption}}
+ {name={Preemption},text={preemption}}
  {
  Involuntary context switch imposed on threads at a given rate.
…
  }

- \longnewglossaryentry{at}
- {name={task}}
- {
- Abstract object representing an unit of work. Systems will offer one or more concrete implementations of this concept (\eg \gls{kthrd}, \gls{job}), however, most of the concept of schedulings are independent of the particular implementations of the work representation. For this reason, this document use the term \Gls{at} to mean any representation and not one in particular.
- }
-
  \longnewglossaryentry{atsched}
  {name={Scheduling a \gls{at}}}
  {
- Scheduling an \gls{at} refers to the act of notifying the scheduler that a task is ready to be ran. When representing the scheduler as a queue of tasks, scheduling is the act of pushing a task onto the end of the queue. This doesn't necesserily means the task will ever be allocated CPU time (\gls{atrun}), for example, if the system terminates abruptly, scheduled \glspl{at} will probably never run.
+ Scheduling a \at refers to notifying the scheduler that a \at is ready to run. When representing the scheduler as a queue of \ats, scheduling is the act of pushing a \at onto the end of the queue. This operation does not necessarily mean the \at is guaranteed CPU time (\gls{atrun}), \eg if the program terminates abruptly, scheduled \glspl{at} never run.

- \textit{Synonyms : None.}
+ \textit{Synonyms : Unparking.}
  }
…
  {name={Running a \gls{at}}}
  {
- Running an \gls{at} refers to the act of allocating CPU time to a task that is ready to run. When representing the scheduler as a queue of tasks, running is the act of poping a task from the front of the queue and putting it onto a \gls{proc}. The \gls{at} can than accomplish some or all of the work it is programmed to do.
+ Running a \at refers to allocating CPU time to a \at that is ready to run. When representing the scheduler as a queue of \ats, running is the act of popping a \at from the front of the queue and putting it onto a \gls{proc}. The \gls{at} can then accomplish some or all of the work it is programmed to do.

  \textit{Synonyms : None.}
…
  \longnewglossaryentry{atmig}
- {name={migration of \gls{at}}}
+ {name={\Glspl{at} Migration}}
  {
- Migration refers to the idea of an \gls{at} running on a different worker/processor than the last time it was run. It is generally preferable to minimise migration as it incurs cost but any load balancing among workers requires some amount of migration.
+ Migration refers to the idea of an \gls{at} running on a different \proc than the last time it was run. It is generally preferable to minimize migration as it incurs cost but any load balancing among \proc requires some amount of migration.

  \textit{Synonyms : None.}
…
  \longnewglossaryentry{atpass}
- {name={overtaking \gls{at}}}
+ {name={Overtaking \gls{at}}}
  {
  When representing the scheduler as a queue of \glspl{at}, overtaking is the act breaking the FIFO-ness of the queue by moving a \gls{at} in front of some other \gls{at} when it arrived after. This remains true for schedulers that do not use a FIFO queue, when the order in which the \glspl{at} are \glslink{atsched}{scheduled} and \glslink{atrun}{run} in a different order. A \gls{at} is said to \emph{overtake} another if it is run \emph{before} but was \emph{scheduled} after the other \gls{at}.
…
  \longnewglossaryentry{atblock}
- {name={Blocking an \gls{at}}}
+ {name={\Gls{at} Blocking}}
  {
- Blocking an abstract task refers to the act of taking a task that us running on a CPU off the CPU. Unless no other task is ready, this action is generally immediately followed by running an other task.
+ \Gls{at} blocking means taking a running \at off a CPU. Unless no other \at is ready, this action is immediately followed by running another \at.

- \textit{Synonyms : None.}
+ \textit{Synonyms : Parking.}
  }
…
  {name={Running to completion}}
  {
- Running to completion refers to the entire sequence of : being scheduled, running and blocking, for a given task.
+ Running to completion refers to the entire sequence of : being scheduled, running and blocking, for a given \at.

  See also \gls{atsched}, \gls{atrun}, \gls{atblock}
…
  \longnewglossaryentry{load}
- {name={System Load}}
+ {name={System Load},text={load}}
  {
- The load is refers to the rate at which \glspl{at} are \glslink{atsched}{scheduled} versus the rate at which they are \glslink{atrun}{run}. When \glspl{at} are being scheduled faster than they are run, the system is considered \emph{overloaded}. When \glspl{at} are being run faster than they are scheduled, the system is considered \emph{underloaded}. Conrrespondingly, if both rates are equal, the system is considered \emph{loaded}. Note that the system is considered loaded only of the rate at which \glspl{at} are scheduled/run is non-zero, otherwise the system is empty, it has no load.
+ The system load refers to the rate at which \glspl{at} are \glslink{atsched}{scheduled} versus the rate at which they are \glslink{atrun}{run}. When \glspl{at} are being scheduled faster than they are run, the system is considered \emph{overloaded}. When \glspl{at} are being run faster than they are scheduled, the system is considered \emph{underloaded}. Correspondingly, if both rates are equal, the system is considered \emph{loaded}. Note the system is considered loaded only if the rate at which \glspl{at} are scheduled/run is non-zero, otherwise the system is empty, \ie it has no load.
+
+ \textit{Synonyms : CPU Load, System Load.}

doc/theses/thierry_delisle_PhD/thesis/local.bib
  }

  % Trevor's relaxed FIFO list
+ @inproceedings{alistarh2018relaxed,
+     title={Relaxed schedulers can efficiently parallelize iterative algorithms},
+     author={Alistarh, Dan and Brown, Trevor and Kopinsky, Justin and Nadiradze, Giorgi},
+     booktitle={Proceedings of the 2018 ACM Symposium on Principles of Distributed Computing},
+     pages={377--386},
+     year={2018}
+ }
+
+ @article{zhuravlev2012survey,
+     title={Survey of energy-cognizant scheduling techniques},
+     author={Zhuravlev, Sergey and Saez, Juan Carlos and Blagodurov, Sergey and Fedorova, Alexandra and Prieto, Manuel},
+     journal={IEEE Transactions on Parallel and Distributed Systems},
+     volume={24},
+     number={7},
+     pages={1447--1464},
+     year={2012},
+     publisher={IEEE}
+ }
+
+ @article{vikranth2013topology,
+     title={Topology aware task stealing for on-chip NUMA multi-core processors},
+     author={Vikranth, BRWACRR and Wankar, Rajeev and Rao, C Raghavendra},
+     journal={Procedia Computer Science},
+     volume={18},
+     pages={379--388},
+     year={2013},
+     publisher={Elsevier}
+ }
+
+ @inproceedings{min2011hierarchical,
+     title={Hierarchical work stealing on manycore clusters},
+     author={Min, Seung-Jai and Iancu, Costin and Yelick, Katherine},
+     booktitle={Fifth Conference on Partitioned Global Address Space Programming Models (PGAS11)},
+     volume={625},
+     year={2011},
+     organization={Citeseer}
+ }
+
+ @article{ribic2014energy,
+     title={Energy-efficient work-stealing language runtimes},
+     author={Ribic, Haris and Liu, Yu David},
+     journal={ACM SIGARCH Computer Architecture News},
+     volume={42},
+     number={1},
+     pages={513--528},
+     year={2014},
+     publisher={ACM New York, NY, USA}
+ }
+
+ @inproceedings{torng2016asymmetry,
+     title={Asymmetry-aware work-stealing runtimes},
+     author={Torng, Christopher and Wang, Moyang and Batten, Christopher},
+     booktitle={2016 ACM/IEEE 43rd Annual International Symposium on Computer Architecture (ISCA)},
+     pages={40--52},
+     year={2016},
+     organization={IEEE}
+ }

  % --------------------------------------------------
…
      title = {Mach Scheduling and Thread Interfaces - Kernel Programming Guide},
      organization = {Apple Inc.},
-     howPublish = {\href{https://developer.apple.com/library/archive/documentation/Darwin/Conceptual/KernelProgramming/scheduler/scheduler.html}{https://developer.apple.com/library/archive/documentation/Darwin/Conceptual/KernelProgramming/scheduler/scheduler.html}}
+     note = {\href{https://developer.apple.com/library/archive/documentation/Darwin/Conceptual/KernelProgramming/scheduler/scheduler.html}{https://\-developer.apple.com/\-library/archive/\-documentation/\-Darwin/\-Conceptual/\-KernelProgramming/\-scheduler/\-scheduler.html}}
+ }
+
+ @misc{MemcachedThreading,
+     author = {Oracle},
+     title = {MySQL 5.6 Reference Manual Including MySQL NDB Cluster 7.3-7.4 Reference Guide},
+     howpublished = {\href{https://docs.oracle.com/cd/E17952_01/mysql-5.6-en/ha-memcached-using-threads.html}{https://docs.oracle.com/\-cd/E17952\_01/\-mysql-5.6-en/\-ha-memcached-using-threads.html}},
+     note = "[Online; accessed 5-August-2022]"
  }
…
  }

+ @misc{GITHUB:SchedulingBenchmarks,
+     title = {Scheduling Benchmarks},
+     author = {Thierry Delisle},
+     howpublished = {\href{https://github.com/cforall/SchedulingBenchmarks_PhD22}{https://\-github.com/\-cforall/\-SchedulingBenchmarks\_\-PhD22}},
+ }
+
  % --------------------------------------------------
  % Tech documents
…
  }

+ @manual{MAN:eventfd,
+     key = "eventfd",
+     title = "eventfd(2) Linux User's Manual",
+     year = "2019",
+     month = "MArch",
+ }
+
  @manual{MAN:aio,
      key = "aio",
      title = "aio(7) Linux User's Manual",
      year = "2019",
      month = "March",
+ }
+
+ @manual{MAN:bash,
+     title = {Bash Reference Manual},
+     author = {Chet Ramey and Brian Fox},
+     year = "2020",
+     month = "December",
+     version = {5,1},
+     howpublished = {\href{https://www.gnu.org/software/bash/manual/bash.pdf}{https://\-www.gnu.org/\-software/\-bash/\-manual/\-bash.pdf}}
  }
…
  }

+
  % --------------------------------------------------
  % Wikipedia Entries
…
      howpublished = "\href{https://en.wikipedia.org/wiki/Zipf%27s_law}{https://\-en.wikipedia.org/\-wiki/\-Zipf\%27s\-\_law}",
      note = "[Online; accessed 5-August-2022]"
+ }
+
+ @misc{wiki:htm,
+     author = "{Wikipedia contributors}",
+     title = "Transactional memory --- {W}ikipedia{,} The Free Encyclopedia",
+     year = "2022",
+     howpublished = "\href{https://en.wikipedia.org/wiki/Zipf%27s_law}{https://\-en.wikipedia.org/\-wiki/\-Zipf\%27s\-\_law}",
+     note = "[Online; accessed 7-September-2022]"
  }
…
      note = "[Online; accessed 5-August-2022]"
  }
+
+ @article{reese2008nginx,
+     title = {NGINX: the high-performance web server and reverse proxy},
+     author = {Reese, Will},
+     journal = {Linux Journal},
+     volume = {2008},
+     number = {173},
+     pages = {2},
+     year = {2008},
+     publisher = {Belltown Media}
+ }
+
+ @phdthesis{Harji10,
+     author = {Ashif Harji},
+     title = {Performance Comparison of Uniprocessor and Multiprocessor Web Server Architectures},
+     school = {University of Waterloo},
+     year = 2010,
+     month = feb,
+     address = {Waterloo, Ontario, Canada, N2L 3G1},
+     note = {\textsf{http://uwspace.uwaterloo.ca/\-bitstream/\-10012/\-5040/\-1/\-Harji\_thesis.pdf}},
+ }

doc/theses/thierry_delisle_PhD/thesis/text/conclusion.tex
  \chapter{Conclusion}\label{conclusion}

- \Gls{uthrding} is popular.
- It makes sense for \CFA to use it.
+ Building the \CFA runtime has been a challenging project.
+ The work was divided between high-level concurrency design and a user-level threading runtime (Masters' thesis), and low-level support of the user-level runtime using OS kernel threading and its (multiple) I/O subsystems (Ph.D. thesis).
+ Because I am the main developer for both components of this project, there is strong continuity across the design and implementation.
+ This continuity provides a consistent approach to advanced control flow and concurrency, with easier development, management and maintenance of the runtime in the future.

- \todo{Obivously fix the above}
+ I believed my Masters' work would provide the background to make the Ph.D. work reasonably straightforward.
+ However, I discovered two significant challenges.

- An important aspect of this approach to threading is how threads are scheduled.
- As \CFA aims to increase productivity and safety of C while maintaining its performance, so to should the threading runtime achieve these goals.
- For scheduling, productivity and safety manifest in removing pitfalls in the efficient usage of the threading runtime.
- This thesis contributes to this goal by presenting a low-latency scheduler that offers improved starvation prevention compared to other state-of-the-art schedulers.
- It presents a core algorithm (Chapter~\ref{core}) that provides increased fairness through helping (Section~\ref{heling}) as well as optimizations which virtually remove the cost of this fairness (Section~\ref{relaxedtimes}).
- Building upon the fundamental scheduling algorithm, an implementation of user-level \io blocking is presented (Chapter~\ref{io}) which achieves the same performance and fairness balance as the scheduler itself.
- From these core algorithms, and a low-latency idle-sleep mechanism is presented (Chapter~\ref{practice}) which allows the \CFA runtime to stay viable for workloads that do not consistently saturate the system.
+ First, modern symmetric multiprocessing CPUs have significant performance penalties for communication, often cache-related.
+ An SQMS scheduler (see Section~\ref{sched}), with its \proc-shared ready-queue, has perfect load-balancing but poor affinity resulting in high communication across \procs.
+ An MQMS scheduler, with its \proc-specific ready-queues, has poor load-balancing but perfect affinity often resulting in significantly reduced communication.
+ However, implementing fairness for an MQMS scheduler is difficult, since fairness requires \procs to be aware of each other's ready-queue progress, \ie communicated knowledge.
+ For balanced workloads with little or no data sharing, \ie embarrassingly parallel, an MQMS scheduler is near optimal, \eg a state-of-the-art work-stealing scheduler.
+ For these kinds of fair workloads, adding fairness must be low-cost to hide the communication costs needed for global ready-queue progress or performance suffers.
+ While I was aware of these realities, I underestimated how little performance margin there is for communication.
+ Several of my attempts at building a fair scheduler compared poorly to work-stealing schedulers because of the thin communication margin.
+
+ Second, the kernel locking, threading, and I/O in the Linux operating system offer very little flexibility and are not designed to facilitate user-level threading.
+ There are multiple concurrency aspects in Linux that require carefully following a strict procedure to achieve acceptable performance.
+ To be fair, many of these concurrency aspects were designed 30-40 years ago, when there were few multiprocessor computers and concurrency knowledge was just developing.
+ Unfortunately, little has changed in the intervening years.
+
+ Also, my decision to use @io_uring@ was both positive and negative.
+ The positive is that @io_uring@ supports the panoply of I/O mechanisms in Linux;
+ hence, the \CFA runtime uses one I/O mechanism to provide non-blocking I/O, rather than using @select@ to handle TTY I/O, @epoll@ to handle network I/O, and managing a thread pool to handle disk I/O.
+ Merging all these different \io mechanisms into a coherent scheduling implementation would require much more work than what is present in this thesis, as well as detailed knowledge of multiple I/O mechanisms.
+ The negative is that @io_uring@ is new and developing.
+ As a result, there is limited documentation, few places to find usage examples, and multiple errors that required workarounds.
+
+ Given what I now know about @io_uring@, I would say it is insufficiently coupled with the Linux kernel to properly handle non-blocking I/O.
+ It does not seem to reach deep into the kernel's handling of \io, and as such it must contend with the same realities that users of @epoll@ must contend with.
+ Specifically, in cases where @O_NONBLOCK@ behaves as desired, operations must still be retried.
+ Preserving the illusion of asynchronicity requires delegating these operations to kernel threads.
+ This requirement is also true of cases where @O_NONBLOCK@ does not prevent blocking.
+ Spinning up internal kernel threads to handle blocking scenarios is what developers already do outside of the kernel, and managing these threads adds a significant burden to the system.
+ Nonblocking I/O should not be handled in this way.
+
+ \section{Goals}
+ This work focuses on efficient and fair scheduling of the multiple CPUs, which are ubiquitous on all modern computers.
+ The levels of indirection to the CPUs are:
+ \begin{itemize}
+ \item
+ The \CFA presentation of concurrency through multiple high-level language constructs.
+ \item
+ The OS presentation of concurrency through multiple kernel threads within an application.
+ \item
+ The OS and library presentation of disk and network I/O, and many secondary library routines that directly and indirectly use these mechanisms.
+ \end{itemize}
+ The key aspect of all of these mechanisms is that control flow can block, which immediately hinders any level above from making scheduling decisions as a result.
+ Fundamentally, scheduling needs to understand all the mechanisms used by threads that affect their state changes.
+
+ The underlying goal of this thesis is scheduling the complex hardware components that make up a computer to provide good utilization and fairness.
+ However, direct hardware scheduling is only possible in the OS.
+ Instead, this thesis is performing arms-length application scheduling of the hardware components through a set of OS interfaces that indirectly manipulate the hardware components.
+ This can quickly lead to tensions when the OS interface has different use cases in mind.
+
+ As \CFA aims to increase productivity and safety of C, while maintaining its performance, this places a huge burden on the \CFA runtime to achieve these goals.
+ Productivity and safety manifest in removing scheduling pitfalls in the efficient usage of the threading runtime.
+ Performance manifests in making efficient use of the underlying kernel threads that provide indirect access to the CPUs.
+
+ This thesis achieves its stated contributions by presenting:
+ \begin{enumerate}[leftmargin=*]
+ \item
+ A scalable low-latency scheduler that offers improved starvation prevention (progress guarantee) compared to other state-of-the-art schedulers, including NUMA awareness.
+ \item
+ The scheduler demonstrates a core algorithm that provides increased fairness through helping, as well as optimizations which virtually remove the cost of this fairness.
+ \item
+ An implementation of user-level \io blocking is incorporated into the scheduler, which achieves the same performance and fairness balance as the scheduler itself.
+ \item
+ These core algorithms are further extended with a low-latency idle-sleep mechanism, which allows the \CFA runtime to stay viable for workloads that do not consistently saturate the system.
+ \end{enumerate}
+ Finally, the complete scheduler is fairly simple with low-cost execution, meaning the total cost of scheduling during thread state changes is low.

  \section{Future Work}
- While the \CFA runtime achieves a better compromise in term of performance and fairness than other schedulers, I do believe that further improvements could be made to reduce even further the number of cases where performance deteriorates.
- Furthermore, I believe that achieve performance and starvation freedom simultaneously is generally a challenge even outside of scheduling algorithms.
+ While the \CFA runtime achieves a better compromise than other schedulers, in terms of performance and fairness, I believe further improvements can be made to reduce or eliminate the few cases where performance does deteriorate.
+ Fundamentally, achieving performance and starvation freedom will always be goals with opposing needs even outside of scheduling algorithms.

  \subsection{Idle Sleep}
- A difficult challenge that was not fully address in this thesis is idle-sleep.
- While a correct and somewhat low-cost idle-sleep mechanism was presented, several of the benchmarks show notable performance degradation when too few \ats are present in the system.
+ A difficult challenge, not fully addressed in this thesis, is idle sleep.
+ While a correct and somewhat low-cost idle-sleep mechanism is presented, several of the benchmarks show notable performance degradation when too few \ats are present in the system.
  The idle sleep mechanism could therefore benefit from a reduction of spurious cases of sleeping.
  Furthermore, this thesis did not present any heuristic for when \procs should be put to sleep and when \procs should be woken up.
- It is especially worth noting that relaxed timestamps and topology aware helping lead to notable improvements in performance.
- Neither of these techniques were used for the idle sleep mechanism.
+ While relaxed timestamps and topology awareness made notable performance improvements, neither of these techniques are used for the idle-sleep mechanism.

- There are opportunities where these techniques could be use:
- The mechanism uses a hand-shake between notification and sleep to ensure that no \at is missed.
- The correctness of that hand-shake is cirtical when the last \proc goes to sleep but could be relaxed when several \procs are awake.
- Furthermore, organizing the sleeping \procs as a LIDO stack makes sense to keep cold \procs as cold as possible, but it might be more appropriate to attempt to keep cold CPU sockets instead.
-
- However, using these techniques could require significant investigation.
- For example, keeping a CPU socket cold might be appropriate for power consumption reasons but can affect overall memory bandwith.
- The balance between these is not necessarily obvious.
+ Here are opportunities where these techniques could be used:
+ \begin{itemize}
+ \item
+ The mechanism uses a handshake between notification and sleep to ensure that no \at is missed.
+ \item
+ The handshake correctness is critical when the last \proc goes to sleep but could be relaxed when several \procs are awake.
+ \item
+ Furthermore, organizing the sleeping \procs as a LIFO stack makes sense to keep cold \procs as cold as possible, but it might be more appropriate to attempt to keep cold CPU sockets instead.
+ \end{itemize}
+ However, using these techniques would require significant investigation.
+ For example, keeping a CPU socket cold might be appropriate for power consumption reasons but can affect overall memory bandwidth.
+ The balance between these approaches is not obvious.
+ I am aware there is a host of low-power research that could be tapped here.

  \subsection{Hardware}
- One challenge that needed to be overcome for this thesis was that the modern x86-64 has very few tools to implement fairness.
- \Glspl{proc} attempting to help each other inherently cause cache-coherence traffic.
+ One challenge that needed to be overcome for this thesis is that the modern x86-64 processors have very few tools to implement fairness.
+ \Glspl{proc} attempting to help each other inherently cause cache-coherence traffic.
  However, as mentioned in Section~\ref{helping}, relaxed requirements mean this traffic is not necessarily productive.
  In cases like this one, there is an opportunity to improve performance by extending the hardware.

- Many different extensions would be suitable here.
- For example, when attempting to read remote timestamps when deciding to whether or not to help, it could be useful to allow cancelling the remote read if it will lead to significant latency.
- If the latency is due to a recent cache invalidation, it is unlikely that the timestamp is old and that helping will be needed.
+ Many different extensions are suitable here.
+ For example, when attempting to read remote timestamps for helping, it would be useful to allow cancelling the remote read if it leads to significant latency.
+ If the latency is due to a recent cache invalidation, it is unlikely the timestamp is old and that helping is needed.
  As such, simply moving on without the result is likely to be acceptable.
- Another option would be to attempt to read multiple memory addresses and only wait for \emph{one of} these reads to retire.
- This would have a similar effect, where cache-lines with more traffic would be waited on less often.
- In both of these examples, some care would probably be needed to make sure that the reads to an address \emph{sometimes} retire.
+ Another option is to read multiple memory addresses and only wait for \emph{one of} these reads to retire.
+ This approach has a similar effect, where cache lines with more traffic are waited on less often.
+ In both of these examples, some care is needed to ensure that reads to an address \emph{sometimes} retire.

- Note that this is similar to the feature \newterm{Hardware Transactional Memory}~\cite{HTM}, which allows groups of instructions to be aborted and rolled-back if they encounter memory conflicts when being retired.
+ Note that this idea is similar to \newterm{Hardware Transactional Memory}~\cite{wiki:htm}, which allows groups of instructions to be aborted and rolled back if they encounter memory conflicts when being retired.
  However, I believe this feature is generally aimed at large groups of instructions.
- A more fine-grained approach may be more amenable to carefully picking which aspects of an algorithm require exact correctness and which do not.
+ A more fine-grained approach may be more amenable by carefully picking which aspects of an algorithm require exact correctness and which do not.

doc/theses/thierry_delisle_PhD/thesis/text/core.tex
  Before discussing scheduling in general, where it is important to address systems that are changing states, this document discusses scheduling in a somewhat ideal scenario, where the system has reached a steady state.
- For this purpose, a steady state is loosely defined as a state where there are always \glspl{thrd} ready to run and the system has the resources necessary to accomplish the work, \eg, enough workers.
+ For this purpose, a steady state is loosely defined as a state where there are always \ats ready to run and the system has the resources necessary to accomplish the work, \eg, enough workers.
  In short, the system is neither overloaded nor underloaded.

  It is important to discuss the steady state first because it is the easiest case to handle and, relatedly, the case in which the best performance is to be expected.
- As such, when the system is either overloaded or underloaded, a common approach is to try to adapt the system to this new load and return to the steady state, \eg, by adding or removing workers.
+ As such, when the system is either overloaded or underloaded, a common approach is to try to adapt the system to this new \gls{load} and return to the steady state, \eg, by adding or removing workers.
  Therefore, flaws in scheduling the steady state tend to be pervasive in all states.

  \section{Design Goals}
- As with most of the design decisions behind \CFA, an important goal is to match the expectation of the programmer according to their execution mental-model.
- To match expectations, the design must offer the programmer sufficient guarantees so that, as long as they respect the execution mental-model, the system also respects this model.
-
- For threading, a simple and common execution mental-model is the ``Ideal multi-tasking CPU'':
+ As with most of the design decisions behind \CFA, an important goal is to match the expectation of the programmer according to their execution mental model.
+ To match expectations, the design must offer the programmer sufficient guarantees so that, as long as they respect the execution mental model, the system also respects this model.
+
+ For threading, a simple and common execution mental model is the ``ideal multitasking CPU'':

  \begin{displayquote}[Linux CFS\cite{MAN:linux/cfs}]
- {[The]} ``Ideal multi-tasking CPU'' is a (non-existent :-)) CPU that has 100\% physical power and which can run each task at precise equal speed, in parallel, each at [an equal fraction of the] speed. For example: if there are 2 tasks running, then it runs each at 50\% physical power --- i.e., actually in parallel.
+ {[The]} ``ideal multi-tasking CPU'' is a (non-existent :-)) CPU that has 100\% physical power and which can run each task at precise equal speed, in parallel, each at [an equal fraction of the] speed. For example: if there are 2 running tasks, then it runs each at 50\% physical power --- i.e., actually in parallel.
  \label{q:LinuxCFS}
  \end{displayquote}

- Applied to threads, this model states that every ready \gls{thrd} immediately runs in parallel with all other ready \glspl{thrd}. While a strict implementation of this model is not feasible, programmers still have expectations about scheduling that come from this model.
-
- In general, the expectation at the center of this model is that ready \glspl{thrd} do not interfere with each other but simply share the hardware.
- This assumption makes it easier to reason about threading because ready \glspl{thrd} can be thought of in isolation and the effect of the scheduler can be virtually ignored.
- This expectation of \gls{thrd} independence means the scheduler is expected to offer two guarantees:
+ Applied to \ats, this model states that every ready \at immediately runs in parallel with all other ready \ats. While a strict implementation of this model is not feasible, programmers still have expectations about scheduling that come from this model.
+
+ In general, the expectation at the centre of this model is that ready \ats do not interfere with each other but simply share the hardware.
+ This assumption makes it easier to reason about threading because ready \ats can be thought of in isolation and the effect of the scheduler can be virtually ignored.
+ This expectation of \at independence means the scheduler is expected to offer two guarantees:
  \begin{enumerate}
- \item A fairness guarantee: a \gls{thrd} that is ready to run is not prevented by another thread.
- \item A performance guarantee: a \gls{thrd} that wants to start or stop running is not prevented by other threads wanting to do the same.
+ \item A fairness guarantee: a \at that is ready to run is not prevented by another thread.
+ \item A performance guarantee: a \at that wants to start or stop running is not prevented by other threads wanting to do the same.
  \end{enumerate}

  It is important to note that these guarantees are expected only up to a point.
- \Glspl{thrd} that are ready to run should not be prevented to do so, but they still share the limited hardware resources.
- Therefore, the guarantee is considered respected if a \gls{thrd} gets access to a \emph{fair share} of the hardware resources, even if that share is very small.
+ \Glspl{at} that are ready to run should not be prevented from doing so, but they still share the limited hardware resources.
+ Therefore, the guarantee is considered respected if a \at gets access to a \emph{fair share} of the hardware resources, even if that share is very small.

  Similar to the performance guarantee, the lack of interference among threads is only relevant up to a point.
…
  This demonstration can be made by comparing applications built in \CFA to applications built with other languages or other models.
  Recall programmer expectation is that the impact of the scheduler can be ignored.
- Therefore, if the cost of scheduling is competitive to other popular languages, the guarantee is consider achieved.
+ Therefore, if the cost of scheduling is competitive with other popular languages, the guarantee is considered achieved.
  More precisely the scheduler should be:
  \begin{itemize}
…
  In any running system, a \proc can stop dequeuing \ats if it starts running a \at that never blocks.
  Without preemption, traditional work-stealing schedulers do not have starvation freedom in this case.
- Now this requirement begs the question, what about preemption?
- Generally speaking preemption happens on the timescale of several milliseconds, which brings us to the next requirement: ``fast'' load balancing.
+ Now, this requirement begs the question, what about preemption?
+ Generally speaking, preemption happens on the timescale of several milliseconds, which brings us to the next requirement: ``fast'' load balancing.

  \paragraph{Fast load balancing} means that load balancing should happen faster than preemption would normally allow.
- For interactive applications that need to run at 60, 90, 120 frames per second, \ats having to wait for several milliseconds to run are effectively starved.
+ For interactive applications that need to run at 60, 90 or 120 frames per second, \ats having to wait for several milliseconds to run are effectively starved.
  Therefore load-balancing should be done at a faster pace, one that can detect starvation at the microsecond scale.
- With that said, this is a much fuzzier requirement since it depends on the number of \procs, the number of \ats and the general load of the system.
+ With that said, this is a much fuzzier requirement since it depends on the number of \procs, the number of \ats and the general \gls{load} of the system.

  \subsection{Fairness vs Scheduler Locality} \label{fairnessvlocal}
…
  For a scheduler, having good locality, \ie, having the data local to each \gls{hthrd}, generally conflicts with fairness.
- Indeed, good locality often requires avoiding the movement of cache lines, while fairness requires dynamically moving a \gls{thrd}, and as consequence cache lines, to a \gls{hthrd} that is currently available.
- Note that this section discusses \emph{internal locality}, \ie, the locality of the data used by the scheduler versus \emph{external locality}, \ie, how the data used by the application is affected by scheduling.
+ Indeed, good locality often requires avoiding the movement of cache lines, while fairness requires dynamically moving a \at, and as consequence cache lines, to a \gls{hthrd} that is currently available.
+ Note that this section discusses \emph{internal locality}, \ie, the locality of the data used by the scheduler versus \emph{external locality}, \ie, how scheduling affects the locality of the application's data.
  External locality is a much more complicated subject and is discussed in the next section.

  However, I claim that in practice it is possible to strike a balance between fairness and performance because these goals do not necessarily overlap temporally.
  Figure~\ref{fig:fair} shows a visual representation of this behaviour.
- As mentioned, some unfairness is acceptable; therefore it is desirable to have an algorithm that prioritizes cache locality as long as thread delay does not exceed the execution mental-model.
+ As mentioned, some unfairness is acceptable; therefore it is desirable to have an algorithm that prioritizes cache locality as long as thread delay does not exceed the execution mental model.

  \begin{figure}
…
  \input{fairness.pstex_t}
  \vspace*{-10pt}
- \caption[Fairness vs Locality graph]{Rule of thumb Fairness vs Locality graph \smallskip\newline The importance of Fairness and Locality while a ready \gls{thrd} awaits running is shown as the time the ready \gls{thrd} waits increases, Ready Time, the chances that its data is still in cache decreases, Locality.
- At the same time, the need for fairness increases since other \glspl{thrd} may have the chance to run many times, breaking the fairness model.
+ \caption[Fairness vs Locality graph]{Rule of thumb Fairness vs Locality graph \smallskip\newline The importance of Fairness and Locality while a ready \at awaits running is shown as the time the ready \at waits increases (Ready Time) the chances that its data is still in cache decreases (Locality).
+ At the same time, the need for fairness increases since other \ats may have the chance to run many times, breaking the fairness model.
  Since the actual values and curves of this graph can be highly variable, the graph is an idealized representation of the two opposing goals.}
  \label{fig:fair}
…
  \subsubsection{Scalability}
  The most basic performance challenge of a scheduler is scalability.
- Given a large number of \procs and an even larger number of \ats, scalability measures how fast \procs can enqueue and dequeues \ats.
- One could expect that doubling the number of \procs would double the rate at which \ats are dequeued, but contention on the internal data structure of the scheduler can lead to worst improvements.
- While the ready-queue itself can be sharded to alleviate the main source of contention, auxiliary scheduling features, \eg counting ready \ats, can also be sources of contention.
+ Given a large number of \procs and an even larger number of \ats, scalability measures how fast \procs can enqueue and dequeue \ats.
+ One could expect that doubling the number of \procs would double the rate at which \ats are dequeued, but contention on the internal data structure of the scheduler can diminish the improvements.
+ While the ready queue itself can be sharded to alleviate the main source of contention, auxiliary scheduling features, \eg counting ready \ats, can also be sources of contention.

  \subsubsection{Migration Cost}
- Another important source of scheduling latency is migration.
+ Another important source of scheduling latency is \glslink{atmig}{migration}.
  A \at migrates if it executes on two different \procs consecutively, which is the process discussed in \ref{fairnessvlocal}.
  Migrations can have many different causes, but in certain programs, it can be impossible to limit migration.
…
  The problem is a single point of contention when adding/removing \ats.
  As shown in the evaluation sections, most production schedulers do scale when adding \glspl{hthrd}.
- The solution to this problem is to shard the ready-queue: create multiple \emph{subqueues} forming the logical ready-queue and the subqueues are accessed by multiple \glspl{hthrd} without interfering.
+ The solution to this problem is to shard the ready queue: create multiple \emph{sub-queues} forming the logical ready-queue and the sub-queues are accessed by multiple \glspl{hthrd} without interfering.

  Before going into the design of \CFA's scheduler, it is relevant to discuss two sharding solutions that served as the inspiration scheduler in this thesis.
…
  \subsection{Work-Stealing}

- As mentioned in \ref{existing:workstealing}, a popular sharding approach for the ready-queue is work-stealing.
- In this approach, each \gls{proc} has its own local subqueue and \glspl{proc} only access each other's subqueue if they run out of work on their local ready-queue.
+ As mentioned in \ref{existing:workstealing}, a popular sharding approach for the ready queue is work-stealing.
+ In this approach, each \gls{proc} has its own local sub-queue and \glspl{proc} only access each other's sub-queue if they run out of work on their local ready-queue.
  The interesting aspect of work stealing happens in the steady-state scheduling case, \ie all \glspl{proc} have work and no load balancing is needed.
  In this case, work stealing is close to optimal scheduling: it can achieve perfect locality and have no contention.
  On the other hand, work-stealing schedulers only attempt to do load-balancing when a \gls{proc} runs out of work.
  This means that the scheduler never balances unfair loads unless they result in a \gls{proc} running out of work.
- Chapter~\ref{microbench} shows that pathological cases work stealing can lead to indefinite starvation.
+ Chapter~\ref{microbench} shows that, in pathological cases, work stealing can lead to indefinite starvation.

- Based on these observation, the conclusion is that a \emph{perfect} scheduler should behave similar to work-stealing in the steady-state case, but load balance proactively when the need arises.
+ Based on these observations, the conclusion is that a \emph{perfect} scheduler should behave similarly to work-stealing in the steady-state case, but load balance proactively when the need arises.

  \subsection{Relaxed-FIFO}
- A different scheduling approach is to create a ``relaxed-FIFO'' queue, as in \todo{cite Trevor's paper}.
- This approach forgoes any ownership between \gls{proc} and subqueue, and simply creates a pool of ready-queues from which \glspl{proc} pick.
+ A different scheduling approach is to create a ``relaxed-FIFO'' queue, as in \cite{alistarh2018relaxed}.
+ This approach forgoes any ownership between \gls{proc} and sub-queue, and simply creates a pool of sub-queues from which \glspl{proc} pick.
  Scheduling is performed as follows:
  \begin{itemize}
  \item
- All subqueues are protected by TryLocks.
+ All sub-queues are protected by TryLocks.
  \item
- Timestamps are added to each element of a subqueue.
+ Timestamps are added to each element of a sub-queue.
  \item
- A \gls{proc} randomly tests ready queues until it has acquired one or two queues.
+ A \gls{proc} randomly tests sub-queues until it has acquired one or two queues.
  \item
- If two queues are acquired, the older of the two \ats at the front the acquired queues is dequeued.
+ If two queues are acquired, the older of the two \ats is dequeued from the front of the acquired queues.
  \item
- Otherwise the \ats from the single queue is dequeued.
+ Otherwise, the \at from the single queue is dequeued.
  \end{itemize}
  The result is a queue that has both good scalability and sufficient fairness.
  The lack of ownership ensures that as long as one \gls{proc} is still able to repeatedly dequeue elements, it is unlikely any element will delay longer than any other element.
- This guarantee contrasts with work-stealing, where a \gls{proc} with a long subqueue results in unfairness for its \ats in comparison to a \gls{proc} with a short subqueue.
+ This guarantee contrasts with work-stealing, where a \gls{proc} with a long sub-queue results in unfairness for its \ats in comparison to a \gls{proc} with a short sub-queue.
  This unfairness persists until a \gls{proc} runs out of work and steals.

- An important aspects of this scheme's fairness approach is that the timestamps make it possible to evaluate how long elements have been on the queue.
+ An important aspect of this scheme's fairness approach is that the timestamps make it possible to evaluate how long elements have been in the queue.
  However, \glspl{proc} eagerly search for these older elements instead of focusing on specific queues, which negatively affects locality.
…
  \section{Relaxed-FIFO++}
- The inherent fairness and good performance with many \ats, makes the relaxed-FIFO queue a good candidate to form the basis of a new scheduler.
+ The inherent fairness and good performance with many \ats make the relaxed-FIFO queue a good candidate to form the basis of a new scheduler.
  The problem case is workloads where the number of \ats is barely greater than the number of \procs.
- In these situations, the wide sharding of the ready queue means most of its subqueues are empty.
- Furthermore, the non-empty subqueues are unlikely to hold more than one item.
- The consequence is that a random dequeue operation is likely to pick an empty subqueue, resulting in an unbounded number of selections.
- This state is generally unstable: each subqueue is likely to frequently toggle between being empty and nonempty.
- Indeed, when the number of \ats is \emph{equal} to the number of \procs, every pop operation is expected to empty a subqueue and every push is expected to add to an empty subqueue.
- In the worst case, a check of the subqueues sees all are empty or full.
+ In these situations, the wide sharding of the ready queue means most of its sub-queues are empty.
+ Furthermore, the non-empty sub-queues are unlikely to hold more than one item.
+ The consequence is that a random dequeue operation is likely to pick an empty sub-queue, resulting in an unbounded number of selections.
+ This state is generally unstable: each sub-queue is likely to frequently toggle between being empty and nonempty.
+ Indeed, when the number of \ats is \emph{equal} to the number of \procs, every pop operation is expected to empty a sub-queue and every push is expected to add to an empty sub-queue.
+ In the worst case, a check of the sub-queues sees all are empty or full.

  As this is the most obvious challenge, it is worth addressing first.
- The obvious solution is to supplement each sharded subqueue with data that indicates if the queue is empty/nonempty to simplify finding nonempty queues, \ie ready \glspl{at}.
- This sharded data can be organized in different forms, \eg a bitmask or a binary tree that tracks the nonempty subqueues.
+ The obvious solution is to supplement each sharded sub-queue with data that indicates if the queue is empty/nonempty to simplify finding nonempty queues, \ie ready \glspl{at}.
+ This sharded data can be organized in different forms, \eg a bitmask or a binary tree that tracks the nonempty sub-queues.
  Specifically, many modern architectures have powerful bitmask manipulation instructions or searching a binary tree has good Big-O complexity.
- However, precisely tracking nonempty subqueues is problematic.
- The reason is that the subqueues are initially sharded with a width presumably chosen to avoid contention.
- However, tracking which ready queue is nonempty is only useful if the tracking data is dense, \ie denser than the sharded subqueues.
- Otherwise, it does not provide useful information because reading this new data structure risks being as costly as simply picking a subqueue at random.
- But if the tracking mechanism \emph{is} denser than the shared subqueues, than constant updates invariably create a new source of contention.
+ However, precisely tracking nonempty sub-queues is problematic.
+ The reason is that the sub-queues are initially sharded with a width presumably chosen to avoid contention.
+ However, tracking which ready queue is nonempty is only useful if the tracking data is dense, \ie denser than the sharded sub-queues.
+ Otherwise, it does not provide useful information because reading this new data structure risks being as costly as simply picking a sub-queue at random.
+ But if the tracking mechanism \emph{is} denser than the shared sub-queues, then constant updates invariably create a new source of contention.
  Early experiments with this approach showed that randomly picking, even with low success rates, is often faster than bit manipulations or tree walks.

  The exception to this rule is using local tracking.
- If each \proc locally keeps track of empty subqueues, than this can be done with a very dense data structure without introducing a new source of contention.
+ If each \proc locally keeps track of empty sub-queues, then this can be done with a very dense data structure without introducing a new source of contention.
  However, the consequence of local tracking is that the information is incomplete.
- Each \proc is only aware of the last state it saw about each subqueue so this information quickly becomes stale.
+ Each \proc is only aware of the last state it saw about each sub-queue so this information quickly becomes stale.
  Even on systems with low \gls{hthrd} count, \eg 4 or 8, this approach can quickly lead to the local information being no better than the random pick.
  This result is due in part to the cost of maintaining information and its poor quality.

- However, using a very low cost but inaccurate approach for local tracking can actually be beneficial.
- If the local tracking is no more costly than a random pick, than \emph{any} improvement to the success rate, however low it is, leads to a performance benefits.
- This suggests to the following approach:
+ However, using a very low-cost but inaccurate approach for local tracking can still be beneficial.
+ If the local tracking is no more costly than a random pick, then \emph{any} improvement to the success rate, however low it is, leads to a performance benefit.

183 This suggests the following approach: 184 184 185 185 \subsection{Dynamic Entropy}\cite{xkcd:dynamicentropy} 186 The Relaxed-FIFO approach can be made to handle the case of mostly empty sub queues by tweaking the \glsxtrlong{prng}.187 The \glsxtrshort{prng} state can be seen as containing a list of all the future sub queues that will be accessed.188 While this concept is not particularly useful on its own, the consequence is that if the \glsxtrshort{prng} algorithm can be run \emph{backwards}, then the state also contains a list of all the sub queues that were accessed.186 The Relaxed-FIFO approach can be made to handle the case of mostly empty sub-queues by tweaking the \glsxtrlong{prng}. 187 The \glsxtrshort{prng} state can be seen as containing a list of all the future sub-queues that will be accessed. 188 While this concept is not particularly useful on its own, the consequence is that if the \glsxtrshort{prng} algorithm can be run \emph{backwards}, then the state also contains a list of all the sub-queues that were accessed. 189 189 Luckily, bidirectional \glsxtrshort{prng} algorithms do exist, \eg some Linear Congruential Generators\cite{wiki:lcg} support running the algorithm backwards while offering good quality and performance. 190 190 This particular \glsxtrshort{prng} can be used as follows: 191 191 \begin{itemize} 192 192 \item 193 Each \proc maintains two \glsxtrshort{prng} states, refer eed to as $F$ and $B$.194 \item 195 When a \proc attempts to dequeue a \at, it picks a sub queue by running $B$ backwards.196 \item 197 When a \proc attempts to enqueue a \at, it runs $F$ forward picking a sub queue to enqueue to.198 If the enqueue is successful, thestate $B$ is overwritten with the content of $F$.193 Each \proc maintains two \glsxtrshort{prng} states, referred to as $F$ and $B$. 194 \item 195 When a \proc attempts to dequeue a \at, it picks a sub-queue by running $B$ backwards. 196 \item 197 When a \proc attempts to enqueue a \at, it runs $F$ forward picking a sub-queue to enqueue to. 198 If the enqueue is successful, state $B$ is overwritten with the content of $F$. 199 199 \end{itemize} 200 200 The result is that each \proc tends to dequeue \ats that it has itself enqueued. 201 When most sub queues are empty, this technique increases the odds of finding \ats atvery low cost, while also offering an improvement on locality in many cases.201 When most sub-queues are empty, this technique increases the odds of finding \ats at a very low cost, while also offering an improvement on locality in many cases. 202 202 203 203 Tests showed this approach performs better than relaxed-FIFO in many cases. 204 204 However, it is still not competitive with work-stealing algorithms. 205 205 The fundamental problem is that the constant randomness limits how much locality the scheduler offers. 206 This becomes problematic both because the scheduler is likely to get cache misses on internal data -structures and because migrations become frequent.206 This becomes problematic both because the scheduler is likely to get cache misses on internal data structures and because migrations become frequent. 207 207 Therefore, the attempt to modify the relaxed-FIFO algorithm to behave more like work stealing did not pan out. 208 208 The alternative is to do it the other way around. … … 210 210 \section{Work Stealing++}\label{helping} 211 211 To add stronger fairness guarantees to work stealing a few changes are needed. 
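As an editorial aside to the previous subsection, the bidirectional \glsxtrshort{prng} can be sketched with a 64-bit Linear Congruential Generator: any odd multiplier is invertible modulo $2^{64}$, so one forward step can be undone exactly. The constants and helper names below are illustrative assumptions, not the generator used by the \CFA runtime.
\begin{lstlisting}
// Editorial sketch of a reversible 64-bit LCG (illustrative constants and names).
#include <stdint.h>
static const uint64_t A = 6364136223846793005ULL;   // any odd multiplier is invertible mod 2^64
static const uint64_t C = 1442695040888963407ULL;
static uint64_t inv_A(void) {                        // modular inverse of A, by Newton iteration
	uint64_t x = A;                                  // correct to the low 3 bits
	for (int i = 0; i < 5; i += 1) x *= 2 - A * x;   // each step doubles the number of correct bits
	return x;
}
static uint64_t fwd(uint64_t s) { return s * A + C; }          // run state F forwards (enqueue picks)
static uint64_t bck(uint64_t s) { return (s - C) * inv_A(); }  // run state B backwards (dequeue picks)
\end{lstlisting}
Since @bck@ undoes @fwd@ exactly, a \proc that copies F into B after each successful enqueue can later replay, newest first, the sub-queue indices it used for its own enqueues.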
212 First, the relaxed-FIFO algorithm has fundamentally better fairness because each \proc always monitors all sub queues.212 First, the relaxed-FIFO algorithm has fundamentally better fairness because each \proc always monitors all sub-queues. 213 213 Therefore, the work-stealing algorithm must be prepended with some monitoring. 214 Before attempting to dequeue from a \proc's sub queue, the \proc must make some effort to ensure other subqueues are not being neglected.214 Before attempting to dequeue from a \proc's sub-queue, the \proc must make some effort to ensure other sub-queues are not being neglected. 215 215 To make this possible, \procs must be able to determine which \at has been on the ready queue the longest. 216 216 Second, the relaxed-FIFO approach needs timestamps for each \at to make this possible. … … 219 219 \centering 220 220 \input{base.pstex_t} 221 \caption[Base \CFA design]{Base \CFA design \smallskip\newline A pool of sub queues offers the sharding, two per \glspl{proc}.222 Each \gls{proc} can access all of the sub queues.221 \caption[Base \CFA design]{Base \CFA design \smallskip\newline A pool of sub-queues offers the sharding, two per \proc. 222 Each \gls{proc} can access all of the sub-queues. 223 223 Each \at is timestamped when enqueued.} 224 224 \label{fig:base} … … 226 226 227 227 Figure~\ref{fig:base} shows the algorithm structure. 228 This structure is similar to classic work-stealing except the sub queues are placed in an array so \procs can access them in constant time.228 This structure is similar to classic work-stealing except the sub-queues are placed in an array so \procs can access them in constant time. 229 229 Sharding width can be adjusted based on contention. 230 230 Note, as an optimization, the TS of a \at is stored in the \at in front of it, so the first TS is in the array and the last \at has no TS. 231 231 This organization keeps the highly accessed front TSs directly in the array. 232 When a \proc attempts to dequeue a \at, it first picks a random remote sub queue and compares its timestamp to the timestamps of its local subqueue(s).232 When a \proc attempts to dequeue a \at, it first picks a random remote sub-queue and compares its timestamp to the timestamps of its local sub-queue(s). 233 233 The oldest waiting \at is dequeued to provide global fairness. 234 234 235 However, this na\"ive implement edhas performance problems.235 However, this na\"ive implementation has performance problems. 236 236 First, it is necessary to have some damping effect on helping. 237 237 Random effects like cache misses and preemption can add spurious but short bursts of latency negating the attempt to help. 238 These bursts can cause increased migrations and make this work stealing approach slowdown to the level of relaxed-FIFO.238 These bursts can cause increased migrations and make this work-stealing approach slow down to the level of relaxed-FIFO. 
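For concreteness, the na\"ive helping decision described above might look like the following sketch; every name is hypothetical, and locking and empty sub-queues are omitted.
\begin{lstlisting}
// Editorial sketch of the helping decision (hypothetical names; locking and
// empty sub-queues omitted): sample one random remote sub-queue and dequeue
// from whichever queue holds the oldest waiting thread.
#include <stdint.h>
struct subqueue { uint64_t head_ts; /* lock, FIFO of ready threads ... */ };
struct ready_queue { unsigned nqueues; struct subqueue * queues; };

unsigned pick_queue(struct ready_queue * rq, unsigned local, uint64_t (*prng)(void)) {
	unsigned remote = prng() % rq->nqueues;       // random remote candidate
	uint64_t lts = rq->queues[local ].head_ts;    // enqueue time of the local head
	uint64_t rts = rq->queues[remote].head_ts;    // enqueue time of the remote head
	return rts < lts ? remote : local;            // smaller timestamp = has waited longer
}
\end{lstlisting}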
239 239 240 240 \begin{figure} 241 241 \centering 242 242 \input{base_avg.pstex_t} 243 \caption[\CFA design with Moving Average]{\CFA design with Moving Average \smallskip\newline A moving average is added to each sub queue.}243 \caption[\CFA design with Moving Average]{\CFA design with Moving Average \smallskip\newline A moving average is added to each sub-queue.} 244 244 \label{fig:base-ma} 245 245 \end{figure} 246 246 247 A simple solution to this problem is to use an exponential moving average\cite{wiki:ma} (MA) instead of a raw timestamp s,shown in Figure~\ref{fig:base-ma}.248 Note , this is more complex because the \at at the head of a subqueue is still waiting, so its wait time has not ended.249 Therefore, the exponential moving average is a ctually an exponential movingaverage of how long each dequeued \at has waited.250 To compare sub queues, the timestamp at the head must be compared to the current time, yielding the best-case wait-time for the \at at the head of the queue.247 A simple solution to this problem is to use an exponential moving average\cite{wiki:ma} (MA) instead of a raw timestamp, as shown in Figure~\ref{fig:base-ma}. 248 Note that this is more complex because the \at at the head of a sub-queue is still waiting, so its wait time has not ended. 249 Therefore, the exponential moving average is an average of how long each dequeued \at has waited. 250 To compare sub-queues, the timestamp at the head must be compared to the current time, yielding the best-case wait time for the \at at the head of the queue. 251 251 This new waiting is averaged with the stored average. 252 To further limit migration, a bias can be added to a local subqueue, where a remote subqueue is helped only if its moving average is more than $X$ times the local subqueue's average.252 To further limit \glslink{atmig}{migrations}, a bias can be added to a local sub-queue, where a remote sub-queue is helped only if its moving average is more than $X$ times the local sub-queue's average. 253 253 Tests for this approach indicate the choice of the weight for the moving average or the bias is not important, \ie weights and biases of similar \emph{magnitudes} have similar effects. 254 254 255 255 With these additions to work stealing, scheduling can be made as fair as the relaxed-FIFO approach, avoiding the majority of unnecessary migrations. 
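A sketch of the moving-average variant follows; the 1/8 weight and the bias parameter X are arbitrary placeholders, and all identifiers are assumptions rather than the runtime's actual names.
\begin{lstlisting}
// Editorial sketch of the moving-average comparison (placeholder weight and
// bias, hypothetical names).
#include <stdint.h>
#include <stdbool.h>
struct subqueue { uint64_t head_ts; uint64_t ma; /* lock, FIFO ... */ };

// On dequeue, fold how long the departing thread waited into the sub-queue's average.
void update_average(struct subqueue * q, uint64_t enqueue_ts, uint64_t now) {
	q->ma = (q->ma * 7 + (now - enqueue_ts)) / 8;    // exponential moving average, weight 1/8
}
// Help a remote sub-queue only if it looks at least X times older than the local one.
bool should_help(const struct subqueue * remote, const struct subqueue * local,
                 uint64_t now, uint64_t X) {
	uint64_t head_wait = now - remote->head_ts;      // best-case wait of the remote head
	uint64_t avg = (remote->ma + head_wait) / 2;     // combine with the stored average
	return avg > X * local->ma;
}
\end{lstlisting}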
256 Unfortunately, the work to achieve fairness has a performance cost, especially when the workload is inherently fair, and hence, there is only short-term or no starvation.257 The problem is that the constant polling, \ie reads, of remote sub queues generally entail a cache miss because the TSs are constantly being updated, \ie, writes.258 To make things worst, remote sub queues that are very active, \ie \ats are frequently enqueued and dequeued from them, lead to higher chances that polling will incur a cache-miss.259 Conversely, the active sub queues do not benefit much from helping since starvation is already a non-issue.260 This puts this algorithm in the awkward situation of paying for a cost that is largely unnecessary.261 The good news is that this problem can be mitigated 262 263 \subsection{Redundant Timestamps}\ref{relaxedtimes}264 The problem with polling remote sub queues is that correctness is critical.265 There must be a consensus among \procs on which sub queues hold which \ats, as the \ats are in constant motion.266 Furthermore, since timestamps are use for fairness, it is critical to have consensus on which \at is the oldest.267 However, when deciding if a remote sub queue is worth polling, correctness is less of a problem.268 Since the only requirement is that a sub queue is eventually polled, some data staleness is acceptable.256 Unfortunately, the work to achieve fairness has a performance cost, especially when the workload is inherently fair, and hence, there is only short-term unfairness or no starvation. 257 The problem is that the constant polling, \ie reads, of remote sub-queues generally entails cache misses because the TSs are constantly being updated, \ie, writes. 258 To make things worse, remote sub-queues that are very active, \ie \ats are frequently enqueued and dequeued from them, lead to higher chances that polling will incur a cache-miss. 259 Conversely, the active sub-queues do not benefit much from helping since starvation is already a non-issue. 260 This puts this algorithm in the awkward situation of paying for a largely unnecessary cost. 261 The good news is that this problem can be mitigated. 262 263 \subsection{Redundant Timestamps}\label{relaxedtimes} 264 The problem with polling remote sub-queues is that correctness is critical. 265 There must be a consensus among \procs on which sub-queues hold which \ats, as the \ats are in constant motion. 266 Furthermore, since timestamps are used for fairness, it is critical to have a consensus on which \at is the oldest. 267 However, when deciding if a remote sub-queue is worth polling, correctness is less of a problem. 268 Since the only requirement is that a sub-queue is eventually polled, some data staleness is acceptable. 269 269 This leads to a situation where stale timestamps are only problematic in some cases. 270 Furthermore, stale timestamps can be desirable since lower freshness requirements mean less cache invalidations. 270 Furthermore, stale timestamps can be desirable since lower freshness requirements mean fewer cache invalidations. 271 271 272 272 Figure~\ref{fig:base-ts2} shows a solution with a second array containing a copy of the timestamps and average. 273 This copy is updated \emph{after} the sub queue's critical sections using relaxed atomics.273 This copy is updated \emph{after} the sub-queue's critical sections using relaxed atomics. 274 274 \Glspl{proc} now check if polling is needed by comparing the copy of the remote timestamp instead of the actual timestamp.
275 275 The result is that since there is no fencing, the writes can be buffered in the hardware and cause fewer cache invalidations. … … 279 279 \input{base_ts2.pstex_t} 280 280 \caption[\CFA design with Redundant Timestamps]{\CFA design with Redundant Timestamps \smallskip\newline An array is added containing a copy of the timestamps. 281 These timestamps are written 281 These timestamps are written-to with relaxed atomics, so there is no order among concurrent memory accesses, leading to fewer cache invalidations.} 282 282 \label{fig:base-ts2} 283 283 \end{figure} … … 285 285 The correctness argument is somewhat subtle. 286 286 The data used for deciding whether or not to poll a queue can be stale as long as it does not cause starvation. 287 Therefore, it is acceptable if stale data makes queues appear older than they reallyare but appearing fresher can be a problem.288 For the timestamps, this means missing writes to the timestamp is acceptablesince they make the head \at look older.287 Therefore, it is acceptable if stale data makes queues appear older than they are but appearing fresher can be a problem. 288 For the timestamps, this means it is acceptable to miss writes to the timestamp since they make the head \at look older. 289 289 For the moving average, as long as the operations are just atomic reads/writes, the average is guaranteed to yield a value that is between the oldest and newest values written. 290 Therefore, this unprotected read of the timestamp and average satisf ythe limited correctness that is required.290 Therefore, this unprotected read of the timestamp and average satisfies the limited correctness that is required. 291 291 292 292 With redundant timestamps, this scheduling algorithm achieves both the fairness and performance requirements on most machines. 293 293 The problem is that the cost of polling and helping is not necessarily consistent across each \gls{hthrd}. 294 For example , on machines with a CPU containing multiple hyperthreads and cores and multiple CPU sockets, cache misses can be satisfied from the caches onsame (local) CPU, or by a CPU on a different (remote) socket.294 For example on machines with a CPU containing multiple hyper threads and cores and multiple CPU sockets, cache misses can be satisfied from the caches on the same (local) CPU, or by a CPU on a different (remote) socket. 295 295 Cache misses satisfied by a remote CPU have significantly higher latency than from the local CPU. 296 296 However, these delays are not specific to systems with multiple CPUs. … … 313 313 In Figure~\ref{fig:cache-share}, all cache misses are either private to a CPU or shared with another CPU. 314 314 This means latency due to cache misses is fairly consistent. 315 In contrast, in Figure~\ref{fig:cache-noshare} misses in the L2 cache can be satisfied by either instance of L3 cache.315 In contrast, in Figure~\ref{fig:cache-noshare} misses in the L2 cache can be satisfied by either instance of the L3 cache. 316 316 However, the memory-access latency to the remote L3 is higher than the memory-access latency to the local L3. 317 317 The impact of these different designs on this algorithm is that scheduling only scales well on architectures with a wide L3 cache, similar to Figure~\ref{fig:cache-share}, and less well on architectures with many narrower L3 cache instances, similar to Figure~\ref{fig:cache-noshare}. 
318 Hence, as the number of L3 instances grow , so too does the chance that the random helping causes significant cache latency.319 The solution is for the scheduler be aware of the cache topology.318 Hence, as the number of L3 instances grows, so too does the chance that the random helping causes significant cache latency. 319 The solution is for the scheduler to be aware of the cache topology. 320 320 321 321 \subsection{Per CPU Sharding} … … 323 323 Unfortunately, there is no portable way to discover cache topology, and it is outside the scope of this thesis to solve this problem. 324 324 This work uses the cache topology information from Linux's @/sys/devices/system/cpu@ directory. 325 This leaves the challenge of matching \procs to cache structure, or more precisely identifying which sub queues of the ready queue are local to which subcomponents of the cache structure.326 Once a match ing is generated, the helping algorithm is changed to add bias so that \procs more often help subqueues local to the same cache substructure.\footnote{325 This leaves the challenge of matching \procs to cache structure, or more precisely identifying which sub-queues of the ready queue are local to which subcomponents of the cache structure. 326 Once a match is generated, the helping algorithm is changed to add bias so that \procs more often help sub-queues local to the same cache substructure.\footnote{ 327 327 Note that like other biases mentioned in this section, the actual bias value does not appear to need precise tuning.} 328 328 329 The simplest approach for mapping sub queues to cache structure is to statically tie subqueues to CPUs.330 Instead of having each sub queue local to a specific \proc, the system is initialized with subqueues for each hardware hyperthread/core up front.331 Then \procs dequeue and enqueue by first asking which CPU id they are executing on, in order to identify which subqueues are the local ones.329 The simplest approach for mapping sub-queues to cache structure is to statically tie sub-queues to CPUs. 330 Instead of having each sub-queue local to a specific \proc, the system is initialized with sub-queues for each hardware hyperthread/core up front. 331 Then \procs dequeue and enqueue by first asking which CPU id they are executing on, to identify which sub-queues are the local ones. 332 332 \Glspl{proc} can get the CPU id from @sched_getcpu@ or @librseq@. 333 333 334 334 This approach solves the performance problems on systems with topologies with narrow L3 caches, similar to Figure \ref{fig:cache-noshare}. 335 335 However, it can still cause some subtle fairness problems in systems with few \procs and many \glspl{hthrd}. 336 In this case, the large number of sub queues and the bias against subqueues tied to different cache substructures make it unlikely that every subqueue is picked.337 To make things worst, the small number of \procs mean that few helping attempts are made.338 This combination of low selection and few helping attempts allow a \at to become stranded on a sub queue for a long time until it gets randomly helped.339 On a system with 2 \procs, 256 \glspl{hthrd} with narrow cache sharing, and a 100:1 bias, it can actuallytake multiple seconds for a \at to get dequeued from a remote queue.340 Therefore, a more dynamic match ing of subqueues to cache instanceis needed.336 In this case, the large number of sub-queues and the bias against sub-queues tied to different cache substructures make it unlikely that every sub-queue is picked. 
337 To make things worse, the small number of \procs means that few helping attempts are made. 338 This combination of low selection and few helping attempts allows a \at to become stranded on a sub-queue for a long time until it gets randomly helped. 339 On a system with 2 \procs, 256 \glspl{hthrd} with narrow cache sharing, and a 100:1 bias, it can take multiple seconds for a \at to get dequeued from a remote queue. 340 Therefore, a more dynamic match of sub-queues to cache instances is needed. 341 341 342 342 \subsection{Topological Work Stealing} 343 343 \label{s:TopologicalWorkStealing} 344 Therefore, the approach used in the \CFA scheduler is to have per-\proc subqueues, but have an explicit data-structure track which cache substructure each subqueue is tied to.344 The approach used in the \CFA scheduler is to have per-\proc sub-queues, but have an explicit data structure to track which cache substructure each sub-queue is tied to. 345 345 This tracking requires some finesse because reading this data structure must lead to fewer cache misses than not having the data structure in the first place. 346 A key element however is that, like the timestamps for helping, reading the cache instance mapping only needs to give the correct result \emph{often enough}. 346 A key element, however, is that, like the timestamps for helping, reading the cache instance mapping only needs to give the correct result \emph{often enough}. 347 347 Therefore the algorithm can be built as follows: before enqueueing or dequeuing a \at, each \proc queries the CPU id and the corresponding cache instance. 348 Since sub queues are tied to \procs, each \proc can then update the cache instance mapped to the local subqueue(s).349 To avoid unnecessary cache line invalidation, the map is only written 348 Since sub-queues are tied to \procs, each \proc can then update the cache instance mapped to the local sub-queue(s). 349 To avoid unnecessary cache line invalidation, the map is only written-to if the mapping changes. 350 350 351 351 This scheduler is used in the remainder of the thesis for managing CPU execution, but additional scheduling is needed to handle long-term blocking and unblocking, such as I/O.
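To make the preceding description concrete, the sketch below assumes a table @cpu_to_llc@ built once at startup from @/sys/devices/system/cpu@, mapping each CPU id to its last-level-cache instance; the array and function names are hypothetical, and only @sched_getcpu@ is an actual Linux call.
\begin{lstlisting}
// Editorial sketch of the per-sub-queue cache mapping (hypothetical names;
// cpu_to_llc[] is assumed to be filled once from /sys/devices/system/cpu).
#define _GNU_SOURCE
#include <sched.h>                       // sched_getcpu()

extern unsigned cpu_to_llc[];            // CPU id -> last-level-cache instance id
extern unsigned subq_llc[];              // sub-queue id -> last seen cache instance id

// Called by a processor before enqueueing or dequeuing on its local sub-queue.
static void refresh_mapping(unsigned my_subqueue) {
	int cpu = sched_getcpu();                 // which hardware thread is this processor on?
	if (cpu < 0) return;                      // query failed: keep the stale, but safe, mapping
	unsigned llc = cpu_to_llc[cpu];
	if (subq_llc[my_subqueue] != llc)         // write only when the mapping actually changes,
		subq_llc[my_subqueue] = llc;          // avoiding needless cache-line invalidations
}
\end{lstlisting}
The helping bias can then prefer sub-queues whose recorded cache instance matches the helper's own, falling back to the usual policy when the recorded mapping is stale.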
doc/theses/thierry_delisle_PhD/thesis/text/eval_macro.tex
rebf8ca5 r23a08aa0 2 2 The previous chapter demonstrated the \CFA scheduler achieves its equivalent performance goal in small and controlled \at-scheduling scenarios. 3 3 The next step is to demonstrate performance stays true in more realistic and complete scenarios. 4 Therefore, this chapter exercises both \at and I/O scheduling using two flavours of web servers that demonstrate \CFA performs competitively withproduction environments.5 6 Web servers are chosen because they offer fairly simple applications that perform complex I/O, both network and disk, and are useful as standalone products.7 Furthermore, web servers are generally amenable to parallelization since their workloads are mostly homogeneous.8 Therefore, web servers offer a stringent performance benchmark for \CFA.9 Indeed, existing web servers have close to optimal performance, while the homogeneity of the workload means fairness may not be a problem.10 As such, these experiments should highlight the overhead tue to any \CFA fairness cost in realistic scenarios.4 Therefore, this chapter exercises both \at and I/O scheduling using two flavours of web servers that demonstrate \CFA performs competitively compared to web servers used in production environments. 5 6 Web servers are chosen because they offer fairly simple applications that perform complex I/O, both network and disk, and are useful as standalone products. 7 Furthermore, web servers are generally amenable to parallelization since their workloads are mostly homogeneous. 8 Therefore, web servers offer a stringent performance benchmark for \CFA. 9 Indeed, existing web servers have close to optimal performance, while the homogeneity of the workload means fairness may not be a problem. 10 As such, these experiments should highlight the overhead due to any \CFA fairness cost in realistic scenarios. 11 11 12 12 \section{Memcached} 13 13 Memcached~\cite{memcached} is an in-memory key-value store used in many production environments, \eg \cite{atikoglu2012workload}. 14 In fact, the Memcached server is so popular there exists a full-featured front-end for performance testing, called @mutilate@~\cite{GITHUB:mutilate}.15 Experimenting on Memcached allows for a simple test of the \CFA runtime as a whole, exercising the scheduler, the idle-sleep mechanism, as well the \io subsystem for sockets.16 Note , this experiment does not exercise the \io subsystem with regardsto disk operations because Memcached is an in-memory server.14 The Memcached server is so popular there exists a full-featured front-end for performance testing, called @mutilate@~\cite{GITHUB:mutilate}. 15 Experimenting on Memcached allows for a simple test of the \CFA runtime as a whole, exercising the scheduler, the idle-sleep mechanism, as well as the \io subsystem for sockets. 16 Note that this experiment does not exercise the \io subsystem with regard to disk operations because Memcached is an in-memory server. 17 17 18 18 \subsection{Benchmark Environment} … … 24 24 Each node has 2 Intel(R) Xeon(R) CPU E5-2620 v2 running at 2.10GHz. 25 25 \item 26 These CPUs have 6 cores per CPUs and 2 \glspl{hthrd} per core, for a total of 24 \glspl{hthrd}.27 \item 28 The CPUs each have 384 KB, 3 MB and 30 MB of L1, L2 and L3 cachesrespectively.29 \item 30 Each node isconnected to the network through a Mellanox 10 Gigabit Ethernet port.26 Each CPU has 6 cores and 2 \glspl{hthrd} per core, for a total of 24 \glspl{hthrd}. 27 \item 28 A CPU has 384 KB, 3 MB and 30 MB of L1, L2 and L3 caches, respectively. 
29 \item 30 The compute nodes are connected to the network through a Mellanox 10 Gigabit Ethernet port. 31 31 \item 32 32 Network routing is performed by a Mellanox SX1012 10/40 Gigabit Ethernet switch. … … 35 35 \subsection{Memcached threading}\label{memcd:thrd} 36 36 Memcached can be built to use multiple threads in addition to its @libevent@ subsystem to handle requests. 37 When enabled, the threading implementation operates as follows~\cite {https://docs.oracle.com/cd/E17952_01/mysql-5.6-en/ha-memcached-using-threads.html}:37 When enabled, the threading implementation operates as follows~\cite[\S~16.2.2.8]{MemcachedThreading}: 38 38 \begin{itemize} 39 39 \item … … 48 48 For UDP connections, all the threads listen to a single UDP socket for incoming requests. 49 49 Threads that are not currently dealing with another request ignore the incoming packet. 50 One of the remaining, non busy, threads reads the request and sends the response.51 This implementation can lead to increased CPU loadas threads wake from sleep to potentially process the request.52 \end{itemize} 53 Here, Memcached is based on an event-based web server architecture~\cite{Pai99Flash}, using \gls{kthrd}ing to run multiple largely independent event engines, and if needed, spinning up additional kernel threads to handle blocking I/O.54 Alternative web server architectureare:50 One of the remaining, non-busy, threads reads the request and sends the response. 51 This implementation can lead to increased CPU \gls{load} as threads wake from sleep to potentially process the request. 52 \end{itemize} 53 Here, Memcached is based on an event-based web server architecture~\cite{Pai99Flash}, using \gls{kthrd}ing to run multiple largely independent event engines, and if needed, spinning up additional kernel threads to handle blocking I/O. 54 Alternative web server architectures are: 55 55 \begin{itemize} 56 56 \item … … 74 74 \item \emph{vanilla}: the official release of Memcached, version~1.6.9. 75 75 \item \emph{fibre}: a modification of vanilla using the thread-per-connection model on top of the libfibre runtime. 76 \item \emph{cfa}: a modification of the fibre web server that replaces the libfibre runtime with \CFA.76 \item \emph{cfa}: a modification of the fibre web server that replaces the libfibre runtime with \CFA. 77 77 \end{itemize} 78 78 … … 80 80 This experiment is done by having the clients establish 15,360 total connections, which persist for the duration of the experiment. 81 81 The clients then send read and write queries with only 3\% writes (updates), attempting to follow a desired query rate, and the server responds to the desired rate as best as possible. 82 Figure~\ref{fig:memcd:rate:qps} shows the 3 server versions at different client rates, ``Target \underline{Q}ueries \underline{P}er \underline{S}econd'', and the actual rate, ``Actual QPS'', for all three web servers.83 84 Like the experimental setup in Chapter~\ref{microbench}, each experiment is run 15 times, and for each client rate, the measured web server rate is plotted.82 Figure~\ref{fig:memcd:rate:qps} shows the 3 server versions at different client rates, ``Target \underline{Q}ueries \underline{P}er \underline{S}econd'', and the actual rate, ``Actual QPS'', for all three web servers. 83 84 Like the experimental setup in Chapter~\ref{microbench}, each experiment is run 15 times, and for each client rate, the measured web server rate is plotted. 
85 85 The solid line represents the median while the dashed and dotted lines represent the maximum and minimum respectively. 86 For rates below 500K queries per second s, all three webservers match the client rate.87 Beyond 500K, the web servers cannot match the client rate.88 During this interval, vanilla Memcached achieves the highest web server throughput, with libfibre and \CFA slightly lower but very similar throughput.89 Overall the performance of all three web servers is very similar, especially considering that at 500K the servers have reached saturation, which is discussed more in the next section.86 For rates below 500K queries per second, all three web servers match the client rate. 87 Beyond 500K, the web servers cannot match the client rate. 88 During this interval, vanilla Memcached achieves the highest web server throughput, with libfibre and \CFA slightly lower but very similar throughput. 89 Overall the performance of all three web servers is very similar, especially considering that at 500K the servers have reached saturation, which is discussed more in the next section. 90 90 91 91 \begin{figure} 92 92 \centering 93 93 \resizebox{0.83\linewidth}{!}{\input{result.memcd.rate.qps.pstex_t}} 94 \caption[Memcached Benchmark: Throughput]{Memcached Benchmark: Throughput\smallskip\newline Desired vs Actual query rate for 15,360 connections. Target QPS is the query rate that the clients are attempting to maintain and Actual QPS is the rate at which the server is able torespond.}94 \caption[Memcached Benchmark: Throughput]{Memcached Benchmark: Throughput\smallskip\newline Desired vs Actual query rate for 15,360 connections. Target QPS is the query rate that the clients are attempting to maintain and Actual QPS is the rate at which the server can respond.} 95 95 \label{fig:memcd:rate:qps} 96 96 %\end{figure} … … 99 99 \centering 100 100 \resizebox{0.83\linewidth}{!}{\input{result.memcd.rate.99th.pstex_t}} 101 \caption[Memcached Benchmark : 99th Percentile Lantency]{Memcached Benchmark : 99th Percentile Lantency\smallskip\newline 99th Percentile of the response latency as a function of \emph{desired} query rate for 15,360 connections. }101 \caption[Memcached Benchmark: 99th Percentile Latency]{Memcached Benchmark: 99th Percentile Latency\smallskip\newline 99th Percentile of the response latency as a function of \emph{desired} query rate for 15,360 connections. } 102 102 \label{fig:memcd:rate:tail} 103 103 \end{figure} 104 104 105 105 \subsection{Tail Latency} 106 Another popular performance metric is \newterm{tail} latency, which indicates some notion of fairness among requests across the experiment, \ie do some requests wait longer than other requests for service .106 Another popular performance metric is \newterm{tail} latency, which indicates some notion of fairness among requests across the experiment, \ie do some requests wait longer than other requests for service? 107 107 Since many web applications rely on a combination of different queries made in parallel, the latency of the slowest response, \ie tail latency, can dictate a performance perception. 108 108 Figure~\ref{fig:memcd:rate:tail} shows the 99th percentile latency results for the same Memcached experiment. 109 109 110 110 Again, each experiment is run 15 times with the median, maximum and minimum plotted with different lines. 
111 As expected, the latency starts low and increases as the server gets close to saturation, at which point, the latency increases dramatically because the web servers cannot keep up with the connection rate so client requests are disproportionally delayed.112 Because of this dramatic increase, the Y axis is presented usinglog scale.113 Note that the graph shows \emph{target} query rate, the actual response rate is given in Figure~\ref{fig:memcd:rate:qps} as this is the same underlying experiment.114 115 For all three servers, the saturation point is reached before 500K queries per second, which is when throughput starts to decline among the web servers.116 In this experiment, all three web servers are much more distinguishable than the throughput experiment.117 Vanilla Memcached achieves the lowest latency until 600K, after which all the web servers are struggling to respond to client requests.111 As expected, the latency starts low and increases as the server gets close to saturation, at which point, the latency increases dramatically because the web servers cannot keep up with the connection rate so client requests are disproportionally delayed. 112 Because of this dramatic increase, the Y-axis is presented using a log scale. 113 Note that the graph shows the \emph{target} query rate, the actual response rate is given in Figure~\ref{fig:memcd:rate:qps} as this is the same underlying experiment. 114 115 For all three servers, the saturation point is reached before 500K queries per second, which is when throughput starts to decline among the web servers. 116 In this experiment, all three web servers are much more distinguishable than in the throughput experiment. 117 Vanilla Memcached achieves the lowest latency until 600K, after which all the web servers are struggling to respond to client requests. 118 118 \CFA begins to decline at 600K, indicating some bottleneck after saturation. 119 Overall, all three web servers achieve micro-second latencies and the increases in latency mostly follow each other.119 Overall, all three web servers achieve microsecond latencies and the increases in latency mostly follow each other. 120 120 121 121 \subsection{Update rate} 122 Since Memcached is effectively a simple database, the information that is cachedcan be written to concurrently by multiple queries.122 Since Memcached is effectively a simple database, the cache information can be written to concurrently by multiple queries. 123 123 And since writes can significantly affect performance, it is interesting to see how varying the update rate affects performance. 124 124 Figure~\ref{fig:memcd:updt} shows the results for the same experiment as the throughput and latency experiment but increasing the update percentage to 5\%, 10\% and 50\%, respectively, versus the original 3\% update percentage. 
125 125 126 126 \begin{figure} 127 \hspace{-15pt} 127 128 \subfloat[][\CFA: Throughput]{ 128 129 \resizebox{0.5\linewidth}{!}{ … … 132 133 } 133 134 \subfloat[][\CFA: Latency]{ 134 \resizebox{0.5 \linewidth}{!}{135 \resizebox{0.52\linewidth}{!}{ 135 136 \input{result.memcd.forall.lat.pstex_t} 136 137 } … … 138 139 } 139 140 141 \hspace{-15pt} 140 142 \subfloat[][LibFibre: Throughput]{ 141 143 \resizebox{0.5\linewidth}{!}{ … … 145 147 } 146 148 \subfloat[][LibFibre: Latency]{ 147 \resizebox{0.5 \linewidth}{!}{149 \resizebox{0.52\linewidth}{!}{ 148 150 \input{result.memcd.fibre.lat.pstex_t} 149 151 } … … 151 153 } 152 154 155 \hspace{-15pt} 153 156 \subfloat[][Vanilla: Throughput]{ 154 157 \resizebox{0.5\linewidth}{!}{ … … 158 161 } 159 162 \subfloat[][Vanilla: Latency]{ 160 \resizebox{0.5 \linewidth}{!}{163 \resizebox{0.52\linewidth}{!}{ 161 164 \input{result.memcd.vanilla.lat.pstex_t} 162 165 } 163 166 \label{fig:memcd:updt:vanilla:lat} 164 167 } 165 \caption[Throughput and Latency results at different update rates (percentage of writes).]{Throughput and Latency results at different update rates (percentage of writes).\smallskip\newline Description} 168 \caption[Throughput and Latency results at different update rates (percentage of writes).]{Throughput and Latency results at different update rates (percentage of writes).\smallskip\newline On the left, throughput as Desired vs Actual query rate. 169 Target QPS is the query rate that the clients are attempting to maintain and Actual QPS is the rate at which the server can respond. 170 On the right, tail latency, \ie 99th Percentile of the response latency as a function of \emph{desired} query rate. 171 For throughput, higher is better, for tail-latency, lower is better. 172 Each series represent 15 independent runs, the dashed lines are the maximums of each series while the solid lines are the median and the dotted lines are the minimums.} 173 All runs have 15,360 client connections. 166 174 \label{fig:memcd:updt} 167 175 \end{figure} … … 175 183 \section{Static Web-Server} 176 184 The Memcached experiment does not exercise two key aspects of the \io subsystem: accept\-ing new connections and interacting with disks. 177 On the other hand, a web server servicing static web-pages does stress both accepting connections and disk \io by accepting tens of thousands of client requests per second where these requests return static data serviced from the file-system cache or disk.\footnote{178 Webservers servicing dynamic requests, which read from multiple locations and construct a response, are not as interesting since creating the response takes more time and does not exercise the runtime in a meaningfully different way.}179 The static web server experiment compares NGINX~\cite{nginx} with a custom \CFA-based webserver developed for this experiment.185 On the other hand, a web server servicing static web pages does stress both accepting connections and disk \io by accepting tens of thousands of client requests per second where these requests return static data serviced from the file-system cache or disk.\footnote{ 186 web servers servicing dynamic requests, which read from multiple locations and construct a response, are not as interesting since creating the response takes more time and does not exercise the runtime in a meaningfully different way.} 187 The static web server experiment compares NGINX~\cite{nginx} with a custom \CFA-based web server developed for this experiment. 
180 188 181 189 \subsection{NGINX threading} 182 Like memcached, NGINX can be makde to use multiple \glspl{kthrd}. 183 It has a very similar architecture to the memcached architecture decscribed in Section~\ref{memcd:thrd}, where multiple \glspl{kthrd} each run a mostly independent network logic. 184 While it does not necessarily use a dedicated listening thread, each connection is arbitrarily assigned to one of the \newterm{worker} threads. 185 Each worker threads handles multiple connections exclusively, effectively dividing the connections into distinct sets. 186 Again, this is effectively the \emph{event-based server} approach. 187 188 \cit{https://www.nginx.com/blog/inside-nginx-how-we-designed-for-performance-scale/} 189 190 191 \subsection{\CFA webserver} 192 The \CFA webserver is a straightforward thread-per-connection webserver, where a fixed number of \ats are created upfront. 190 NGINX is a high-performance, \emph{full-service}, event-driven web server. 191 It can handle both static and dynamic web content, as well as serve as a reverse proxy and a load balancer~\cite{reese2008nginx}. 192 This wealth of capabilities comes with a variety of potential configurations, dictating available features and performance. 193 The NGINX server runs a master process that performs operations such as reading configuration files, binding to ports, and controlling worker processes. 194 When running as a static web server, it uses an event-driven architecture to service incoming requests. 195 Incoming connections are assigned a \emph{stackless} HTTP state machine and worker processes can handle thousands of these state machines. 196 For the following experiment, NGINX is configured to use @epoll@ to listen for events on these state machines and have each worker process independently accept new connections. 197 Because of the realities of Linux, see Subsection~\ref{ononblock}, NGINX also maintains a pool of auxiliary threads to handle blocking \io. 198 The configuration can set the number of worker processes desired, as well as the size of the auxiliary pool. 199 However, for the following experiments, NGINX is configured to let the master process decide the appropriate number of threads. 200 201 \subsection{\CFA web server} 202 The \CFA web server is a straightforward thread-per-connection web server, where a fixed number of \ats are created upfront. 193 203 Each \at calls @accept@, through @io_uring@, on the listening port and handles the incoming connection once accepted. 194 204 Most of the implementation is fairly straightforward; 195 205 however, the inclusion of file \io found an @io_uring@ problem that required an unfortunate workaround. 196 206 197 Normally, web servers use @sendfile@~\cite{MAN:sendfile} to send files over a socket because it performs a direct move in the kernel from the file-system cache to the NIC, eliminating reading/writing the file into the webserver.198 While @io_uring@ does not support @sendfile@, it does support s@splice@~\cite{MAN:splice}, which is strictly more powerful.199 However, because of how Linux implements file \io, see Subsection~\ref{ononblock}, @io_uring@ must delegate splice calls to worker threads insidethe kernel.207 Normally, web servers use @sendfile@~\cite{MAN:sendfile} to send files over a socket because it performs a direct move in the kernel from the file-system cache to the NIC, eliminating reading/writing the file into the web server. 
208 While @io_uring@ does not support @sendfile@, it does support @splice@~\cite{MAN:splice}, which is strictly more powerful. 209 However, because of how Linux implements file \io, see Subsection~\ref{ononblock}, @io_uring@ must delegate splice calls to worker threads \emph{inside} the kernel. 200 210 As of Linux 5.13, @io_uring@ had no mechanism to restrict the number of worker threads, and therefore, when tens of thousands of splice requests are made, it correspondingly creates tens of thousands of internal \glspl{kthrd}. 201 211 Such a high number of \glspl{kthrd} slows Linux significantly. 202 Rather than abandon the experiment, the \CFA web server was switched to @sendfile@.203 204 With a blocking @sendfile@ the\CFA achieves acceptable performance until saturation is reached.205 At saturation, latency increases so some client connectionstimeout.212 Rather than abandon the experiment, the \CFA web server was switched to @sendfile@. 213 214 Starting with \emph{blocking} @sendfile@, \CFA achieves acceptable performance until saturation is reached. 215 At saturation, latency increases and client connections begin to timeout. 206 216 As these clients close their connection, the server must close its corresponding side without delay so the OS can reclaim the resources used by these connections. 207 217 Indeed, until the server connection is closed, the connection lingers in the CLOSE-WAIT TCP state~\cite{rfc:tcp} and the TCP buffers are preserved. 208 However, this poses a problem using nonblocking @sendfile@ calls:218 However, this poses a problem using blocking @sendfile@ calls: 209 219 when @sendfile@ blocks, the \proc rather than the \at blocks, preventing other connections from closing their sockets. 210 220 The call can block if there is insufficient memory, which can be caused by having too many connections in the CLOSE-WAIT state.\footnote{ 211 221 \lstinline{sendfile} can always block even in nonblocking mode if the file to be sent is not in the file-system cache, because Linux does not provide nonblocking disk I/O.} 212 This effect results in a negative feedback where more timeouts lead to more @sendfile@ calls running out of resources. 213 214 Normally, this is address by using @select@/@epoll@ to wait for sockets to have sufficient resources. 215 However, since @io_uring@ respects nonblocking semantics, marking all sockets as non-blocking effectively circumvents the @io_uring@ subsystem entirely: 216 all calls would simply immediately return @EAGAIN@ and all asynchronicity would be lost. 217 218 For this reason, the \CFA webserver sets and resets the @O_NONBLOCK@ flag before and after any calls to @sendfile@. 222 This effect results in a negative feedback loop where more timeouts lead to more @sendfile@ calls running out of resources. 223 224 Normally, this problem is addressed by using @select@/@epoll@ to wait for sockets to have sufficient resources. 225 However, since @io_uring@ does not support @sendfile@ but does respect non\-blocking semantics, marking all sockets as non-blocking effectively circumvents the @io_uring@ subsystem entirely: 226 all calls simply immediately return @EAGAIN@ and all asynchronicity is lost. 227 228 Switching the entire \CFA runtime to @epoll@ for this experiment is unrealistic and does not help in the evaluation of the \CFA runtime. 229 For this reason, the \CFA web server sets and resets the @O_NONBLOCK@ flag before and after any calls to @sendfile@. 
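A minimal sketch of this flag toggling, together with the @EAGAIN@ retry discussed in the next paragraph, is shown below; @yield()@ stands in for the runtime's user-level yield and the helper name is invented, so this is not the actual \CFA web-server code.
\begin{lstlisting}
// Editorial sketch, not the actual CFA web-server code; yield() stands in
// for the runtime's user-level yield.
#include <sys/types.h>
#include <sys/sendfile.h>
#include <fcntl.h>
#include <errno.h>
extern void yield(void);

static ssize_t send_file_nonblock(int sock, int file, off_t * off, size_t len) {
	int flags = fcntl(sock, F_GETFL);
	fcntl(sock, F_SETFL, flags | O_NONBLOCK);    // set O_NONBLOCK around the call
	ssize_t ret;
	for (;;) {
		ret = sendfile(sock, file, off, len);    // in-kernel file-to-socket transfer
		if (ret >= 0 || errno != EAGAIN) break;  // progress made, or a real error
		yield();                                 // cannot block the processor: yield and retry
	}
	fcntl(sock, F_SETFL, flags);                 // restore the original flags
	return ret;
}
\end{lstlisting}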
219 230 However, when the nonblocking @sendfile@ returns @EAGAIN@, the \CFA server cannot block the \at because its I/O subsystem uses @io_uring@. 220 Therefore, the \at must spin performing the @sendfile@ and yield if the call returns @EAGAIN@. 221 Normally @epoll@ would also be used when these calls to @sendfile@ return @EAGAIN@, but since this would not help in the evaluation of the \CFA runtime, the \CFA webserver simply yields and retries in these cases. 222 223 Interestingly, Linux 5.15 @io_uring@ introduces the ability to limit the number of worker threads that are created, through the @IORING_REGISTER_IOWQ_MAX_WORKERS@ option. 224 Presumably, this limit could prevent the explosion of \glspl{kthrd} which justified using @sendfile@ over @io_uring@ and @splice@. 231 Therefore, the \at spins performing the @sendfile@, yields if the call returns @EAGAIN@ and retries in these cases. 232 233 Interestingly, Linux 5.15 @io_uring@ introduces the ability to limit the number of worker threads that are created through the @IORING_REGISTER_IOWQ_MAX_WORKERS@ option. 234 Presumably, this limit would prevent the explosion of \glspl{kthrd}, which justified using @sendfile@ over @io_uring@ and @splice@. 225 235 However, recall from Section~\ref{iouring} that @io_uring@ maintains two pools of workers: bounded workers and unbounded workers. 226 In the particular case of the webserver, we would want the unbounded workers to handle accepts and reads on socket and bounded workers to handle reading the files from disk. 227 This would allow fine grained countrol over the number of workers needed for each operation type and would presumably lead to good performance. 236 For a web server, the unbounded workers should handle accepts and reads on sockets, and the bounded workers should handle reading files from disk. 237 This setup allows fine-grained control over the number of workers needed for each operation type and presumably leads to good performance. 238 228 239 However, @io_uring@ must contend with another reality of Linux: the versatility of @splice@. 229 Indeed, @splice@ can be used both for reading and writing ,to or from any type of file descriptor.230 This makes it moreambiguous which pool @io_uring@ should delegate @splice@ calls to.231 In the case of splicing from a socket to pipe, @splice@ will behavelike an unbounded operation, but when splicing from a regular file to a pipe, @splice@ becomes a bounded operation.232 To make things more complicated, @splice@ can read from a pipe and write outto a regular file.240 Indeed, @splice@ can be used both for reading and writing to or from any type of file descriptor. 241 This generality makes it ambiguous which pool @io_uring@ should delegate @splice@ calls to. 242 In the case of splicing from a socket to a pipe, @splice@ behaves like an unbounded operation, but when splicing from a regular file to a pipe, @splice@ becomes a bounded operation. 243 To make things more complicated, @splice@ can read from a pipe and write to a regular file. 233 244 In this case, the read is an unbounded operation but the write is a bounded one. 234 245 This leaves @io_uring@ in a difficult situation where it can be very difficult to delegate splice operations to the appropriate type of worker. 
235 Since there is little to no context available to @io_uring@, I believe it makes the decisionto always delegate @splice@ operations to the unbounded workers.236 This is unfortunate for this specific experiment, since it prevents the webserver from limiting the number of calls to @splice@ happening in parallelwithout affecting the performance of @read@ or @accept@.246 Since there is little or no context available to @io_uring@, it seems to always delegate @splice@ operations to the unbounded workers. 247 This decision is unfortunate for this specific experiment since it prevents the web server from limiting the number of parallel calls to @splice@ without affecting the performance of @read@ or @accept@. 237 248 For this reason, the @sendfile@ approach described above is still the most performant solution in Linux 5.15. 238 249 239 Note that it could be possible to workaround this problem, for example by creating more @io_uring@ instances so @splice@ operations can be issued to a different instance than the @read@ and @accept@ operations. 240 However, I do not believe this solution is appropriate in general, it simply replaces a hack in the webserver with a different, equivalent hack. 250 One possible workaround is to create more @io_uring@ instances so @splice@ operations can be issued to a different instance than the @read@ and @accept@ operations. 251 However, I do not believe this solution is appropriate in general; 252 it simply replaces my current web server hack with a different, equivalent hack. 241 253 242 254 \subsection{Benchmark Environment} 243 Unlike the Memcached experiment, the web server experiment is run on a heterogeneous environment.255 Unlike the Memcached experiment, the web server experiment is run on a heterogeneous environment. 244 256 \begin{itemize} 245 257 \item 246 258 The server runs Ubuntu 20.04.4 LTS on top of Linux Kernel 5.13.0-52. 247 259 \item 248 It has an AMD Opteron(tm) Processor 6380 running at 2.5GHz. 260 The server computer has four AMD Opteron\texttrademark Processor 6380 with 16 cores running at 2.5GHz, for a total of 64 \glspl{hthrd}. 261 \item 262 The computer is booted with only 8 CPUs enabled, which is sufficient to achieve line rate. 249 263 \item 250 264 Each CPU has 64 KB, 256 KiB and 8 MB of L1, L2 and L3 caches respectively. 251 265 \item 252 The computer is booted with only 8 CPUs enabled, which is sufficient to achieve line rate.253 \item254 266 The computer is booted with only 25GB of memory to restrict the file-system cache. 255 267 \end{itemize} … … 257 269 \begin{itemize} 258 270 \item 259 A client runs a 2.6.11-1 SMP Linux kernel, which permits each client load -generator to run on a separate CPU.271 A client runs a 2.6.11-1 SMP Linux kernel, which permits each client load generator to run on a separate CPU. 260 272 \item 261 273 It has two 2.8 GHz Xeon CPUs, and four one-gigabit Ethernet cards. 262 274 \item 263 \todo{switch} 275 Network routing is performed by an HP 2530 10 Gigabit Ethernet switch. 264 276 \item 265 277 A client machine runs two copies of the workload generator. 266 278 \end{itemize} 267 279 The clients and network are sufficiently provisioned to drive the server to saturation and beyond. 
268 Hence, any server effects are attributable solely to the runtime system and web server.269 Finally, without restricting the server hardware resources, it is impossible to determine if a runtime system or the web server using it has any specific design restrictions, \eg using space to reduce time.270 Trying to determine these restriction with large numbers of processors or memory simply means running equally large experiments, which takeslonger and are harder to set up.280 Hence, any server effects are attributable solely to the runtime system and web server. 281 Finally, without restricting the server hardware resources, it is impossible to determine if a runtime system or the web server using it has any specific design restrictions, \eg using space to reduce time. 282 Trying to determine these restrictions with large numbers of processors or memory simply means running equally large experiments, which take longer and are harder to set up. 271 283 272 284 \subsection{Throughput} 273 To measure web server throughput, the server computer is loaded with 21,600 files, sharded across 650 directories, occupying about 2.2GB of disk, distributed over the server's RAID-5 4-drives to achieve high throughput for disk I/O.285 To measure web server throughput, the server computer is loaded with 21,600 files, sharded across 650 directories, occupying about 2.2GB of disk, distributed over the server's RAID-5 4-drives to achieve high throughput for disk I/O. 274 286 The clients run httperf~\cite{httperf} to request a set of static files. 275 The httperf load -generator is used with session files to simulate a large number of users and to implement a partially open-loop system.287 The httperf load generator is used with session files to simulate a large number of users and to implement a partially open-loop system. 276 288 This permits httperf to produce overload conditions, generate multiple requests from persistent HTTP/1.1 connections, and include both active and inactive off periods to model browser processing times and user think times~\cite{Barford98}. 277 289 278 290 The experiments are run with 16 clients, each running a copy of httperf (one copy per CPU), requiring a set of 16 log files with requests conforming to a Zipf distribution. 279 This distribution is representative of users accessing static data through a web -browser.280 Each request reads a file name from its trace, establishes a connection, performs an HTTP get-request for the file name, receive the file data, close the connection, and repeatthe process.291 This distribution is representative of users accessing static data through a web browser. 292 Each request reads a file name from its trace, establishes a connection, performs an HTTP GET request for the file name, receives the file data, closes the connection, and repeats the process. 281 293 Some trace elements have multiple file names that are read across a persistent connection. 282 A client times -out if the server does not complete a request within 10 seconds.294 A client times out if the server does not complete a request within 10 seconds. 283 295 284 296 An experiment consists of running a server with request rates ranging from 10,000 to 70,000 requests per second; 285 297 each rate takes about 5 minutes to complete. 286 There is 20 secondsidle time between rates and between experiments to allow connections in the TIME-WAIT state to clear.298 There are 20 seconds of idle time between rates and between experiments to allow connections in the TIME-WAIT state to clear. 
287 299 Server throughput is measured both at peak and after saturation (\ie after peak). 288 300 Peak indicates the level of client requests the server can handle and after peak indicates if a server degrades gracefully. 289 Throughput is measured by aggregating the results from httperf ofall the clients.301 Throughput is measured by aggregating the results from httperf for all the clients. 290 302 291 303 This experiment can be done for two workload scenarios by reconfiguring the server with different amounts of memory: 25 GB and 2.5 GB. … … 305 317 \end{table} 306 318 307 Figure~\ref{fig:swbsrv} shows the results comparing \CFA to NGINX in terms of throughput.308 These results are fairly straightforward.309 Both servers achieve the same throughput until around 57,500 requests per seconds.310 Since the clients are asking for the same files, the fact that the throughput matches exactly is expected as long as both servers are able to serve the desired rate.311 Once the saturation point is reached, both servers are still very close.312 NGINX achieves slightly better throughput.313 However, Figure~\ref{fig:swbsrv:err} shows the rate of errors, a gross approximation of tail latency, where \CFA achieves notably fewer errors once the machine reaches saturation.314 This suggest that \CFA is slightly more fair and NGINX may slightly sacrifice some fairness for improved throughput.315 It demonstrate that the \CFA webserver described above is able to match the performance of NGINX up-to and beyond the saturation point of the machine.316 317 319 \begin{figure} 320 \centering 318 321 \subfloat[][Throughput]{ 319 322 \resizebox{0.85\linewidth}{!}{\input{result.swbsrv.25gb.pstex_t}} … … 325 328 \label{fig:swbsrv:err} 326 329 } 327 \caption[Static Webserver Benchmark : Throughput]{Static Webserver Benchmark : Throughput\smallskip\newline Throughput vs request rate for short lived connectionsconnections.}330 \caption[Static web server Benchmark: Throughput]{Static web server Benchmark: Throughput\smallskip\newline Throughput vs request rate for short-lived connections.} 328 331 \label{fig:swbsrv} 329 332 \end{figure} 330 333 334 Figure~\ref{fig:swbsrv} shows the results comparing \CFA to NGINX in terms of throughput. 335 These results are fairly straightforward. 336 Both servers achieve the same throughput until around 57,500 requests per second. 337 Since the clients are asking for the same files, the fact that the throughput matches exactly is expected as long as both servers are able to serve the request rate. 338 Once the saturation point is reached, both servers are still very close. 339 NGINX achieves slightly better throughput. 340 However, Figure~\ref{fig:swbsrv:err} shows the rate of errors, a gross approximation of tail latency, where \CFA achieves notably fewer errors once the servers reach saturation. 341 This suggests \CFA is slightly fairer with less throughput, while NGINX sacrifices fairness for more throughput. 342 This experiment demonstrates that the \CFA web server is able to match the performance of NGINX up to and beyond the saturation point of the machine. 343 331 344 \subsection{Disk Operations} 332 The throughput was made using a server with 25gb of memory, this was sufficient to hold the entire fileset in addition to all the code and data needed to run the webserver and the rest of the machine. 
333 Previous work like \cit{Cite Ashif's stuff} demonstrate that an interesting follow-up experiment is to rerun the same throughput experiment but allowing significantly less memory on the machine. 334 If the machine is constrained enough, it will force the OS to evict files from the file cache and cause calls to @sendfile@ to have to read from disk. 335 However, in this configuration, the problem with @splice@ and @io_uring@ rears its ugly head again. 345 With 25GB of memory, the entire experimental file-set plus the web server and OS fit in memory. 346 If memory is constrained, the OS must evict files from the file cache, which causes @sendfile@ to read from disk.\footnote{ 347 For the in-memory experiments, the file-system cache was warmed by running an experiment three times before measuring started to ensure all files are in the file-system cache.} 348 Web servers can behave very differently once file I/O begins and increases. 349 Hence, prior work~\cite{Harji10} suggests running both kinds of experiments to test overall web server performance. 350 351 However, after reducing memory to 2.5GB, the problem with @splice@ and @io_uring@ rears its ugly head again. 336 352 Indeed, in the in-memory configuration, replacing @splice@ with calls to @sendfile@ works because the bounded side basically never blocks. 337 353 Like @splice@, @sendfile@ is in a situation where the read side requires bounded blocking, \eg reading from a regular file, while the write side requires unbounded blocking, \eg blocking until the socket is available for writing. 338 354 The unbounded side can be handled by yielding when it returns @EAGAIN@, as mentioned above, but this trick does not work for the bounded side. 339 355 The only solution for the bounded side is to spawn more threads and let these handle the blocking. 340 356 341 357 Supporting this case in the web server would require creating more \procs or creating a dedicated thread pool. 342 However, since what I am to evaluate in this thesis is the runtime of \CFA, I decided to forgo experiments on low memory server. 343 The implementation of the webserver itself is simply too impactful to be an interesting evaluation of the underlying runtime. 357 358 However, I felt this kind of modification moves too far away from my goal of evaluating the \CFA runtime, \ie it begins writing another runtime system; 358 359 hence, I decided to forgo experiments on low-memory performance.
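To make the yield-and-retry trick concrete, the following is an illustrative reconstruction, in plain C, of how the unbounded (socket) side is handled; it is not the actual webserver code, and @yield()@ stands in for the \CFA user-level thread yield.
\begin{cfa}
#include <sys/types.h>
#include <sys/sendfile.h>
#include <errno.h>
// Illustrative sketch: retry a sendfile on a nonblocking socket,
// yielding the user-level thread whenever the socket is not writable.
static ssize_t sendfile_yield( int sock, int file, off_t * off, size_t len ) {
	for ( ;; ) {
		ssize_t ret = sendfile( sock, file, off, len );  // may still touch disk (bounded side)
		if ( ret >= 0 ) return ret;                      // full or partial transfer
		if ( errno != EAGAIN && errno != EWOULDBLOCK ) return -1;  // real error
		yield();                                         // socket full: run other threads, then retry
	}
}
\end{cfa}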
doc/theses/thierry_delisle_PhD/thesis/text/eval_micro.tex
rebf8ca5 r23a08aa0 4 4 This chapter presents five different experimental setups for evaluating the basic features of the \CFA, libfibre~\cite{libfibre}, Go, and Tokio~\cite{Tokio} schedulers. 5 5 All of these systems have a \gls{uthrding} model. 6 The goal in this chapter is show the \CFA scheduler obtains equivalent performance to other less fairschedulers through the different experiments.7 Note ,only the code of the \CFA tests is shown;8 all tests in the other systems are functionally identical and available online~\cite{ SchedulingBenchmarks}.6 The goal of this chapter is to show that the \CFA scheduler obtains equivalent performance to other, less fair, schedulers through the different experiments. 7 Note that only the code of the \CFA tests is shown; 8 all tests in the other systems are functionally identical and available online~\cite{GITHUB:SchedulingBenchmarks}. 9 9 10 10 \section{Benchmark Environment}\label{microenv} … … 13 13 \begin{description} 14 14 \item[AMD] is a server with two AMD EPYC 7662 CPUs and 256GB of DDR4 RAM. 15 The EPYC CPU has 64 cores with 2 \glspl{hthrd} per core, for 128 \glspl{hthrd} per socket with 2 sockets for a total of 256 \glspl{hthrd}.15 The EPYC CPU has 64 cores with 2 \glspl{hthrd} per core, for a total of 128 \glspl{hthrd} per socket with 2 sockets for a total of 256 \glspl{hthrd}. 16 16 Each CPU has 4 MB, 64 MB and 512 MB of L1, L2 and L3 caches, respectively. 17 Each L1 and L2 instance areonly shared by \glspl{hthrd} on a given core, but each L3 instance is shared by 4 cores, therefore 8 \glspl{hthrd}.17 Each L1 and L2 instance is only shared by \glspl{hthrd} on a given core, but each L3 instance is shared by 4 cores, therefore 8 \glspl{hthrd}. 18 18 The server runs Ubuntu 20.04.2 LTS on top of Linux Kernel 5.8.0-55. 19 19 … … 25 25 \end{description} 26 26 27 For all benchmarks, @taskset@ is used to limit the experiment to 1 NUMA node with no hyper 27 For all benchmarks, @taskset@ is used to limit the experiment to 1 NUMA node with no hyperthreading. 28 28 If more \glspl{hthrd} are needed, then 1 NUMA node with hyperthreading is used. 29 29 If still more \glspl{hthrd} are needed, then the experiment is limited to as few NUMA nodes as needed. … … 32 32 On AMD, the same algorithm is used, but the machine only has 2 sockets. 33 33 So hyperthreading\footnote{ 34 Hyperthreading normally refers specifically to the technique used by Intel, however it is often used generically to refer to any equivalent feature.}35 is used when the \proc count reach 65 and 193.36 37 The limited sharing of the last-level cache on the AMD machine is markedly different thanthe Intel machine.34 Hyperthreading normally refers specifically to the technique used by Intel, however, it is often used generically to refer to any equivalent feature.} 35 is used when the \proc count reaches 65 and 193. 36 37 The limited sharing of the last-level cache on the AMD machine is markedly different from the Intel machine. 38 38 Indeed, while on both architectures L2 cache misses that are served by L3 caches on a different CPU incur a significant latency, on the AMD it is also the case that cache misses served by a different L3 instance on the same CPU also incur high latency. 39 39 … … 42 42 Each experiment is run 15 times varying the number of processors depending on the two different computers. 43 43 All experiments gather throughput data and secondary data for scalability or latency. 
44 The data is graphed using a solid and two dashed lines representing the median, maximum and minimum resultrespectively, where the minimum/maximum lines are referred to as the \emph{extremes}.\footnote{44 The data is graphed using a solid, a dashed, and a dotted line, representing the median, maximum and minimum results respectively, where the minimum/maximum lines are referred to as the \emph{extremes}.\footnote{ 45 45 An alternative display is to use error bars with min/max as the bottom/top for the bar. 46 46 However, this approach is not truly an error bar around a mean value and I felt the connected lines are easier to read.} … … 48 48 49 49 For each experiment, four graphs are generated showing traditional throughput on the top row and \newterm{scalability} or \newterm{latency} on the bottom row (peek ahead to Figure~\ref{fig:cycle:jax}). 50 Scalability uses the same data as throughput but the Y 50 Scalability uses the same data as throughput but the Y-axis is calculated as the number of \procs over the throughput. 51 51 In this representation, perfect scalability should appear as a horizontal line, \eg, if doubling the number of \procs doubles the throughput, then the relation stays the same. 52 52 53 The left column shows results for 100 cycles per \proc, enough cyclesto always keep every \proc busy.54 The right column shows results for 1 cycleper \proc, where the ready queues are expected to be near empty most of the time.55 The distinction between 100 and 1 cycles is meaningful because the idle sleep subsystem is expected to matter only in the right column, where spurious effects can cause a \proc to run out of work temporarily.53 The left column shows results for hundreds of \ats per \proc, enough to always keep every \proc busy. 54 The right column shows results for very few \ats per \proc, where the ready queues are expected to be near empty most of the time. 55 The distinction between many and few \ats is meaningful because the idle sleep subsystem is expected to matter only in the right column, where spurious effects can cause a \proc to run out of work temporarily. 56 56 57 57 \section{Cycle} … … 62 62 Hence, systems that perform this optimization have an artificial performance benefit because the yield becomes a \emph{nop}. 63 63 For this reason, I designed a different push/pop benchmark, called \newterm{Cycle Benchmark}. 64 This benchmark arranges a number of\ats into a ring, as seen in Figure~\ref{fig:cycle}, where the ring is a circular singly-linked list.65 At runtime, each \at unparks the next \at before parkingitself.66 Unparking the next \at pushes that \at onto the ready queue while the ensuing park leads to a \at being popped from the ready queue.64 This benchmark arranges several \ats into a ring, as seen in Figure~\ref{fig:cycle}, where the ring is a circular singly-linked list. 65 At runtime, each \at unparks the next \at before \glslink{atblock}{parking} itself. 66 Unparking the next \at pushes that \at onto the ready queue while the ensuing \park leads to a \at being popped from the ready queue. 
67 67 68 68 \begin{figure} 69 69 \centering 70 70 \input{cycle.pstex_t} 71 \caption[Cycle benchmark]{Cycle benchmark\smallskip\newline Each \at unparks the next \at in the cycle before parkingitself.}71 \caption[Cycle benchmark]{Cycle benchmark\smallskip\newline Each \at unparks the next \at in the cycle before \glslink{atblock}{parking} itself.} 72 72 \label{fig:cycle} 73 73 \end{figure} 74 74 75 75 Therefore, the underlying runtime cannot rely on the number of ready \ats staying constant over the duration of the experiment. 76 In fact, the total number of \ats waiting on the ready queue is expected to vary because of the race between the next \at unparking and the current \at parking.76 In fact, the total number of \ats waiting on the ready queue is expected to vary because of the race between the next \at \glslink{atsched}{unparking} and the current \at \glslink{atblock}{parking}. 77 77 That is, the runtime cannot anticipate that the current task immediately parks. 78 78 As well, the size of the cycle is also decided based on this race, \eg a small cycle may see the chain of unparks go full circle before the first \at parks because of time-slicing or multiple \procs. 79 79 If this happens, the scheduler push and pop are avoided and the results of the experiment are skewed. 80 (Note, an unpark is like a V on a semaphore, so the subsequentpark (P) may not block.)81 Every runtime system must handle this race and cannot optimize daway the ready-queue pushes and pops.82 To prevent any attempt of silently omitting ready-queue operations, the ring of \ats is made big enough so the \ats have time to fully park before being unparked again.80 (Note, an \unpark is like a V on a semaphore, so the subsequent \park (P) may not block.) 81 Every runtime system must handle this race and cannot optimize away the ready-queue pushes and pops. 82 To prevent any attempt of silently omitting ready-queue operations, the ring of \ats is made big enough so the \ats have time to fully \park before being unparked again. 83 83 Finally, to further mitigate any underlying push/pop optimizations, especially on SMP machines, multiple rings are created in the experiment. 84 84 85 85 Figure~\ref{fig:cycle:code} shows the pseudo code for this benchmark, where each cycle has 5 \ats. 86 There is additional complexity to handle termination (not shown), which requires a binary semaphore or a channel instead of raw @park@/@unpark@and carefully picking the order of the @P@ and @V@ with respect to the loop condition.86 There is additional complexity to handle termination (not shown), which requires a binary semaphore or a channel instead of raw \park/\unpark and carefully picking the order of the @P@ and @V@ with respect to the loop condition. 87 87 88 88 \begin{figure} … … 99 99 } 100 100 \end{cfa} 101 \caption[Cycle Benchmark : Pseudo Code]{Cycle Benchmark: Pseudo Code}101 \caption[Cycle Benchmark: Pseudo Code]{Cycle Benchmark: Pseudo Code} 102 102 \label{fig:cycle:code} 103 %\end{figure}104 105 103 \bigskip 106 107 %\begin{figure}108 104 \subfloat[][Throughput, 100 cycles per \proc]{ 109 105 \resizebox{0.5\linewidth}{!}{ … … 131 127 \label{fig:cycle:jax:low:ns} 132 128 } 133 \caption[Cycle Benchmark on Intel]{Cycle Benchmark on Intel\smallskip\newline Throughput and scalability as a function of \proc count, 5 \ats per cycle, and different cycle count. For throughput, higher is better, for scalability, lower is better. 
Each series represent 15 independent runs, the dotted lines are maximums while the solid line is the medium.} 129 \caption[Cycle Benchmark on Intel]{Cycle Benchmark on Intel\smallskip\newline Throughput and scalability as a function of \proc count, 5 \ats per cycle, and different cycle counts. 130 For throughput, higher is better, for scalability, lower is better. 131 Each series represent 15 independent runs, the dashed lines are the maximums of each series while the solid lines are the median and the dotted lines are the minimums.} 134 132 \label{fig:cycle:jax} 135 133 \end{figure} … … 161 159 \label{fig:cycle:nasus:low:ns} 162 160 } 163 \caption[Cycle Benchmark on AMD]{Cycle Benchmark on AMD\smallskip\newline Throughput and scalability as a function of \proc count, 5 \ats per cycle, and different cycle count. For throughput, higher is better, for scalability, lower is better. Each series represent 15 independent runs, the dotted lines are extremes while the solid line is the medium.} 161 \caption[Cycle Benchmark on AMD]{Cycle Benchmark on AMD\smallskip\newline Throughput and scalability as a function of \proc count, 5 \ats per cycle, and different cycle counts. 162 For throughput, higher is better, for scalability, lower is better. 163 Each series represent 15 independent runs, the dashed lines are the maximums of each series while the solid lines are the median and the dotted lines are the minimums.} 164 164 \label{fig:cycle:nasus} 165 165 \end{figure} … … 167 167 \subsection{Results} 168 168 169 For the Intel architecture, Figure~\ref{fig:cycle:jax}: 170 \begin{itemize} 171 \item 172 For 100 cycles per \proc (first column), \CFA, Go and Tokio all obtain effectively the same throughput performance. 169 Figures~\ref{fig:cycle:jax} and \ref{fig:cycle:nasus} show the results for the cycle experiment on Intel and AMD, respectively. 170 Looking at the left column on Intel, Figures~\ref{fig:cycle:jax:ops} and \ref{fig:cycle:jax:ns} show the results for 100 cycles of 5 \ats for each \proc. 171 \CFA, Go and Tokio all obtain effectively the same throughput performance. 173 172 Libfibre is slightly behind in this case but still scales decently. 174 As a result of the \gls{kthrd} placement, additional \procs from 25 to 48 offer less performance improvement (flatting of the line) for all runtimes. 175 As expected, this pattern repeats again between \proc count 72 and 96. 176 \item 177 For 1 cycle per \proc, \CFA and Tokio obtain very similar results overall, but Tokio shows more variations in the results. 178 Go achieves slightly better performance. 173 As a result of the \gls{kthrd} placement, additional \procs from 25 to 48 offer less performance improvement for all runtimes, which can be seen as a flattening of the line. 174 This effect even causes a decrease in throughput in libfibre's case. 175 As expected, this pattern repeats between \proc count 72 and 96. 176 177 Looking next at the right column on Intel, Figures~\ref{fig:cycle:jax:low:ops} and \ref{fig:cycle:jax:low:ns} show the results for 1 cycle of 5 \ats for each \proc. 178 \CFA and Tokio obtain very similar results overall, but Tokio shows more variations in the results. 179 Go achieves slightly better performance than \CFA and Tokio, but all three display significantly worst performance compared to the left column. 180 This decrease in performance is likely due to the additional overhead of the idle-sleep mechanism. 
181 This can either be the result of \procs actually running out of work or simply additional overhead from tracking whether or not there is work available. 182 Indeed, unlike the left column, it is likely that the ready queue is transiently empty, which likely triggers additional synchronization steps. 179 183 Interestingly, libfibre achieves better performance with 1 cycle. 180 \end{itemize} 181 182 For the AMD architecture, Figure~\ref{fig:cycle:nasus}, the results show the same story as on the Intel, with close to double the performance overall but with slightly increased variation. 183 The different performance improvements and plateaus are due to cache topology and appear at the expected \proc counts of 64, 128 and 192, for the same reasons as on Intel. 184 \begin{itemize} 185 \item 186 For 100 cycles per \proc, unlike Intel, all 4 runtimes achieve very similar throughput and scalability. 187 \item 188 For 1 cycle per \proc, unlike on Intel, Tokio and Go have the same throughput performance, while \CFA is slightly slower. 189 Again, the same performance increase for libfibre is visible. 190 \end{itemize} 184 185 Looking now at the results for the AMD architecture, Figure~\ref{fig:cycle:nasus}, the results are overall similar to the Intel results, but with close to double the performance, slightly increased variation, and some differences in the details. 186 Note the maximum of the Y-axis on Intel and AMD differ significantly. 187 Looking at the left column on AMD, Figures~\ref{fig:cycle:nasus:ops} and \ref{fig:cycle:nasus:ns} all 4 runtimes achieve very similar throughput and scalability. 188 However, as the number of \procs grows higher, the results on AMD show notably more variability than on Intel. 189 The different performance improvements and plateaus are due to cache topology and appear at the expected: \proc counts of 64, 128 and 192, for the same reasons as on Intel. 190 Looking next at the right column on AMD, Figures~\ref{fig:cycle:nasus:low:ops} and \ref{fig:cycle:nasus:low:ns}, Tokio and Go have the same throughput performance, while \CFA is slightly slower. 191 This result is different than on Intel, where Tokio behaved like \CFA rather than behaving like Go. 192 Again, the same performance increase for libfibre is visible when running fewer \ats. 191 193 Note, I did not investigate the libfibre performance boost for 1 cycle in this experiment. 192 194 193 The conclusion from both architectures is that all of the compared runtime have fairly equivalent performance for this micro-benchmark.194 Clearly, the pathological case with 1 \at per \proc,can affect fairness algorithms managing mostly idle processors, \eg \CFA, but only at high core counts.195 Forthis case, \emph{any} helping is likely to cause a cascade of \procs running out of work and attempting to steal.196 For this experiment, the \CFA scheduler has achieved the goal of obtaining equivalent performance to other less fair schedulers, except for very unusual workloads.195 The conclusion from both architectures is that all of the compared runtimes have fairly equivalent performance for this micro-benchmark. 196 Clearly, the pathological case with 1 cycle per \proc can affect fairness algorithms managing mostly idle processors, \eg \CFA, but only at high core counts. 197 In this case, \emph{any} helping is likely to cause a cascade of \procs running out of work and attempting to steal. 
198 For this experiment, the \CFA scheduler has achieved the goal of obtaining equivalent performance to other, less fair, schedulers. 197 199 198 200 \section{Yield} 199 201 200 For complet ion, the classic yield benchmark is included.202 For completeness, the classic yield benchmark is included. 201 203 Here, the throughput is dominated by the mechanism used to handle the @yield@ function. 202 204 Figure~\ref{fig:yield:code} shows pseudo code for this benchmark, where the cycle @wait/next.wake@ is replaced by @yield@. … … 216 218 } 217 219 \end{cfa} 218 \caption[Yield Benchmark : Pseudo Code]{Yield Benchmark: Pseudo Code}220 \caption[Yield Benchmark: Pseudo Code]{Yield Benchmark: Pseudo Code} 219 221 \label{fig:yield:code} 220 222 %\end{figure} … … 227 229 \label{fig:yield:jax:ops} 228 230 } 229 \subfloat[][Throughput, 1 \at sper \proc]{231 \subfloat[][Throughput, 1 \at per \proc]{ 230 232 \resizebox{0.5\linewidth}{!}{ 231 233 \input{result.yield.low.jax.ops.pstex_t} … … 240 242 \label{fig:yield:jax:ns} 241 243 } 242 \subfloat[][Scalability, 1 \at sper \proc]{244 \subfloat[][Scalability, 1 \at per \proc]{ 243 245 \resizebox{0.5\linewidth}{!}{ 244 246 \input{result.yield.low.jax.ns.pstex_t} … … 246 248 \label{fig:yield:jax:low:ns} 247 249 } 248 \caption[Yield Benchmark on Intel]{Yield Benchmark on Intel\smallskip\newline Throughput and scalability as a function of \proc count, using 1 \ats per \proc. For throughput, higher is better, for scalability, lower is better. Each series represent 15 independent runs, the dotted lines are extremes while the solid line is the medium.} 250 \caption[Yield Benchmark on Intel]{Yield Benchmark on Intel\smallskip\newline Throughput and scalability as a function of \proc count. 251 For throughput, higher is better, for scalability, lower is better. 252 Each series represent 15 independent runs, the dashed lines are the maximums of each series while the solid lines are the median and the dotted lines are the minimums.} 249 253 \label{fig:yield:jax} 250 254 \end{figure} … … 252 256 \subsection{Results} 253 257 254 Figures~\ref{fig:yield:jax} and~\ref{fig:yield:nasus} show the same throughput graphs as @cycle@ on Intel and AMD, respectively. 255 Note, the Y-axis on the yield graph for Intel is twice as large as the Intel cycle-graph. 256 A visual glance between the cycle and yield graphs confirms my claim that the yield benchmark is unreliable. 257 258 For the Intel architecture, Figure~\ref{fig:yield:jax}: 259 \begin{itemize} 260 \item 258 Figures~\ref{fig:yield:jax} and \ref{fig:yield:nasus} show the results for the yield experiment on Intel and AMD, respectively. 259 Looking at the left column on Intel, Figures~\ref{fig:yield:jax:ops} and \ref{fig:yield:jax:ns} show the results for 100 \ats for each \proc. 260 Note that the Y-axis on this graph is twice as large as the Intel cycle graph. 261 A visual glance between the left columns of the cycle and yield graphs confirms my claim that the yield benchmark is unreliable. 261 262 \CFA has no special handling for @yield@, but this experiment requires less synchronization than the @cycle@ experiment. 262 Hence, the @yield@ throughput and scalability graphs for both 100 and 1 cycles/tasks per processor have similar shapes to the corresponding @cycle@ graphs. 263 The only difference is sightly better performance for @yield@ because of less synchronization. 264 As for @cycle@, the cost of idle sleep also comes into play in a very significant way in Figure~\ref{fig:yield:jax:low:ns}, where the scaling is not flat. 
265 \item 266 libfibre has special handling for @yield@ using the fact that the number of ready fibres does not change, and therefore, by-passing the idle-sleep mechanism entirely. 267 Additionally, when only running 1 \at per \proc, libfibre optimizes further, and forgoes the context-switch entirely. 268 Hence, libfibre behaves very differently in the cycle and yield benchmarks, with a 4 times increase in performance for 100 cycles/tasks and an 8 times increase for 1 cycle/task. 269 \item 270 Go has special handling for @yield@ by putting a yielding goroutine on a secondary global ready-queue, giving it lower priority. 263 Hence, the @yield@ throughput and scalability graphs have similar shapes to the corresponding @cycle@ graphs. 264 The only difference is slightly better performance for @yield@ because of less synchronization. 265 Libfibre has special handling for @yield@ using the fact that the number of ready fibres does not change, and therefore, bypassing the idle-sleep mechanism entirely. 266 Hence, libfibre behaves very differently in the cycle and yield benchmarks, with a 4 times increase in performance on the left column. 267 Go has special handling for @yield@ by putting a yielding goroutine on a secondary global ready-queue, giving it a lower priority. 271 268 The result is that multiple \glspl{hthrd} contend for the global queue and performance suffers drastically. 272 Hence, Go behaves very differently in the cycle and yield benchmarks, with a complete performance collapse in @yield@ for both 100 and 1 cycles/tasks. 273 \item 269 Hence, Go behaves very differently in the cycle and yield benchmarks, with a complete performance collapse in @yield@. 274 270 Tokio has a similar performance collapse after 16 processors, and therefore, its special @yield@ handling is probably related to a Go-like scheduler problem and/or a \CFA idle-sleep problem. 275 271 (I did not dig through the Rust code to ascertain the exact reason for the collapse.) 276 \end{itemize} 272 Note that since there is no communication among \ats, locality problems are much less likely than for the cycle benchmark. 273 This lack of communication is probably why the plateaus due to topology are not present. 274 275 Looking next at the right column on Intel, Figures~\ref{fig:yield:jax:low:ops} and \ref{fig:yield:jax:low:ns} show the results for 1 \at for each \proc. 276 As for @cycle@, \CFA's cost of idle sleep comes into play in a very significant way in Figure~\ref{fig:yield:jax:low:ns}, where the scaling is not flat. 277 This result is to be expected since fewer \ats mean \procs are more likely to run out of work. 278 On the other hand, when only running 1 \at per \proc, libfibre optimizes further and forgoes the context switch entirely. 279 This results in libfibre outperforming other runtimes, even more, achieving 8 times more throughput than for @cycle@. 280 Finally, Go and Tokio's performance collapse is still the same with fewer \ats. 281 The only exception is Tokio running on 24 \procs, deepening the mystery of its yielding mechanism further. 277 282 278 283 \begin{figure} … … 302 307 \label{fig:yield:nasus:low:ns} 303 308 } 304 \caption[Yield Benchmark on AMD]{Yield Benchmark on AMD\smallskip\newline Throughput and scalability as a function of \proc count, using 1 \ats per \proc. For throughput, higher is better, for scalability, lower is better. 
Each series represent 15 independent runs, the dotted lines are extremes while the solid line is the medium.} 309 \caption[Yield Benchmark on AMD]{Yield Benchmark on AMD\smallskip\newline Throughput and scalability as a function of \proc count. 310 For throughput, higher is better, for scalability, lower is better. 311 Each series represent 15 independent runs, the dashed lines are the maximums of each series while the solid lines are the median and the dotted lines are the minimums.} 305 312 \label{fig:yield:nasus} 306 313 \end{figure} 307 314 308 For the AMD architecture, Figure~\ref{fig:yield:nasus}, the results show the same story as on the Intel, with slightly increased variations. 309 Also, some transition points on the X-axis differ because of the architectures, like at 16 versus 24 processors. 310 311 It is difficult to draw conclusions for this benchmark when runtime system treat @yield@ so differently. 315 Looking now at the results for the AMD architecture, Figure~\ref{fig:yield:nasus}, the results again show a story that is overall similar to the results on the Intel, with increased variation and some differences in the details. 316 Note that the maximum of the Y-axis on Intel and AMD differ less in @yield@ than @cycle@. 317 Looking at the left column first, Figures~\ref{fig:yield:nasus:ops} and \ref{fig:yield:nasus:ns}, \CFA achieves very similar throughput and scaling. 318 Libfibre still outpaces all other runtimes, but it encounters a performance hit at 64 \procs. 319 This anomaly suggests some amount of communication between the \procs that the Intel machine is able to mask where the AMD is not once hyperthreading is needed. 320 Go and Tokio still display the same performance collapse as on Intel. 321 Looking next at the right column on AMD, Figures~\ref{fig:yield:nasus:low:ops} and \ref{fig:yield:nasus:low:ns}, all runtime systems effectively behave the same as they did on the Intel machine. 322 At the high \ats count, the only difference is Libfibre's scaling and this difference disappears on the right column. 323 This behaviour suggests whatever communication issue it encountered on the left is completely circumvented on the right. 324 325 It is difficult to draw conclusions for this benchmark when runtime systems treat @yield@ so differently. 312 326 The win for \CFA is its consistency between the cycle and yield benchmarks making it simpler for programmers to use and understand, \ie the \CFA semantics match with programmer intuition. 313 327 … … 315 329 \section{Churn} 316 330 317 The Cycle and Yield benchmark represent an \emph{easy} scenario for a scheduler, \eg an embarrassingly parallel application.318 In these benchmarks ,\ats can be easily partitioned over the different \procs upfront and none of the \ats communicate with each other.319 320 The Churn benchmark represents more chaotic executions, where there is more communication among \ats but no relationship between the last \proc on which a \at ran and blocked and the \proc that subsequently unblocks it.331 The Cycle and Yield benchmarks represent an \emph{easy} scenario for a scheduler, \eg an embarrassingly parallel application. 332 In these benchmarks \ats can be easily partitioned over the different \procs upfront and none of the \ats communicate with each other. 333 334 The Churn benchmark represents more chaotic executions, where there is more communication among \ats but no relationship between the last \proc on which a \at ran and blocked, and the \proc that subsequently unblocks it. 
321 335 With processor-specific ready-queues, when a \at is unblocked by a different \proc that means the unblocking \proc must either ``steal'' the \at from another processor or find it on a remote queue. 322 336 This dequeuing results in either contention on the remote queue and/or \glspl{rmr} on the \at data structure. 323 Hence, this benchmark has performance dominated by the cache traffic as \proc are constantly accessing theeach other's data.324 In either case, this benchmark aims to measure how well a scheduler handles these cases ,since both cases can lead to performance degradation if not handled correctly.337 Hence, this benchmark has performance dominated by the cache traffic as \procs are constantly accessing each other's data. 338 In either case, this benchmark aims to measure how well a scheduler handles these cases since both cases can lead to performance degradation if not handled correctly. 325 339 326 340 This benchmark uses a fixed-size array of counting semaphores. 327 Each \at picks a random semaphore, @V@s it to unblock any waiting \at, and then @P@s (maybe blocks) the \at son the semaphore.341 Each \at picks a random semaphore, @V@s it to unblock any waiting \at, and then @P@s (maybe blocks) the \at on the semaphore. 328 342 This creates a flow where \ats push each other out of the semaphores before being pushed out themselves. 329 For this benchmark to work, the number of \ats must be equal or greater than the number of semaphores plus the number of \procs;330 \eg if there are 10 semaphores and 5 \procs, but only 3 \ats, all 3 \ats can block (P) on a random semaphore and now there isno \ats to unblock (V) them.331 Note , the nature of these semaphores meanthe counter can go beyond 1, which can lead to nonblocking calls to @P@.343 For this benchmark to work, the number of \ats must be equal to or greater than the number of semaphores plus the number of \procs; 344 \eg if there are 10 semaphores and 5 \procs, but only 3 \ats, all 3 \ats can block (P) on a random semaphore and now there are no \ats to unblock (V) them. 345 Note that the nature of these semaphores means the counter can go beyond 1, which can lead to nonblocking calls to @P@. 332 346 Figure~\ref{fig:churn:code} shows pseudo code for this benchmark, where the @yield@ is replaced by @V@ and @P@. 333 347 … … 346 360 } 347 361 \end{cfa} 348 \caption[Churn Benchmark : Pseudo Code]{Churn Benchmark: Pseudo Code}362 \caption[Churn Benchmark: Pseudo Code]{Churn Benchmark: Pseudo Code} 349 363 \label{fig:churn:code} 350 364 %\end{figure} … … 364 378 } 365 379 366 \subfloat[][ Latency, 100 \ats per \proc]{380 \subfloat[][Scalability, 100 \ats per \proc]{ 367 381 \resizebox{0.5\linewidth}{!}{ 368 382 \input{result.churn.jax.ns.pstex_t} … … 370 384 \label{fig:churn:jax:ns} 371 385 } 372 \subfloat[][ Latency, 2 \ats per \proc]{386 \subfloat[][Scalability, 2 \ats per \proc]{ 373 387 \resizebox{0.5\linewidth}{!}{ 374 388 \input{result.churn.low.jax.ns.pstex_t} … … 376 390 \label{fig:churn:jax:low:ns} 377 391 } 378 \caption[Churn Benchmark on Intel]{\centering Churn Benchmark on Intel\smallskip\newline Throughput and latency of the Churn on the benchmark on the Intel machine. For throughput, higher is better, for scalability, lower is better. Each series represent 15 independent runs, the dotted lines are extremes while the solid line is the medium.} 392 \caption[Churn Benchmark on Intel]{Churn Benchmark on Intel\smallskip\newline Throughput and scalability as a function of \proc count. 
393 For throughput, higher is better, for scalability, lower is better. 394 Each series represent 15 independent runs, the dashed lines are the maximums of each series while the solid lines are the median and the dotted lines are the minimums.} 379 395 \label{fig:churn:jax} 380 396 \end{figure} … … 382 398 \subsection{Results} 383 399 384 Figures~\ref{fig:churn:jax} and Figure~\ref{fig:churn:nasus} show the throughput on Intel and AMD respectively. 385 386 The performance cost of crossing the cache boundaries is still visible at the same \proc count. 387 388 Scalability is notably worst than the previous benchmarks since there is inherently more communication between processors. 389 Indeed, once the number of \glspl{hthrd} goes beyond a single socket, performance ceases to improve. 400 Figures~\ref{fig:churn:jax} and \ref{fig:churn:nasus} show the results for the churn experiment on Intel and AMD, respectively. 401 Looking at the left column on Intel, Figures~\ref{fig:churn:jax:ops} and \ref{fig:churn:jax:ns} show the results for 100 \ats for each \proc, and all runtimes obtain fairly similar throughput for most \proc counts. 402 \CFA does very well on a single \proc but quickly loses its advantage over the other runtimes. 403 As expected, it scales decently up to 48 \procs, drops from 48 to 72 \procs, and then plateaus. 404 Tokio achieves very similar performance to \CFA, with the starting boost, scaling decently until 48 \procs, drops from 48 to 72 \procs, and starts increasing again to 192 \procs. 405 Libfibre obtains effectively the same results as Tokio with slightly less scaling, \ie the scaling curve is the same but with slightly lower values. 406 Finally, Go gets the most peculiar results, scaling worse than the other runtimes until 48 \procs. 407 At 72 \procs, the results of the Go runtime vary significantly, sometimes scaling, sometimes plateauing. 408 However, beyond this point Go keeps this level of variation but does not scale further in any of the runs. 409 410 Throughput and scalability are notably worse for all runtimes than in the previous benchmarks since there is inherently more communication between processors. 411 Indeed, none of the runtimes reach 40 million operations per second while in the cycle benchmark all but libfibre reached 400 million operations per second. 412 Figures~\ref{fig:churn:jax:ns} and \ref{fig:churn:jax:low:ns} show that for all \proc counts, all runtimes produce poor scaling. 413 However, once the number of \glspl{hthrd} goes beyond a single socket, at 48 \procs, scaling goes from bad to worse and performance completely ceases to improve. 414 At this point, the benchmark is dominated by inter-socket communication costs for all runtimes. 415 390 416 An interesting aspect to note here is that the runtimes differ in how they handle this situation. 391 Indeed, when a \proc unparks a \at that was last run on a different \proc, the \at could be appended to the ready-queue local \proc or to the ready-queue of the remote \proc, which previously ran the \at. 392 \CFA, Tokio and Go all use the approach of unparking to the local \proc while Libfibre unparks to the remote \proc. 393 In this particular benchmark, the inherent chaos of the benchmark in addition to small memory footprint means neither approach wins over the other. 417 Indeed, when a \proc unparks a \at that was last run on a different \proc, the \at could be appended to the ready queue of the local \proc or to the ready queue of the remote \proc, which previously ran the \at.
418 \CFA, Tokio and Go all use the approach of \glslink{atsched}{unparking} to the local \proc, while Libfibre unparks to the remote \proc. 419 In this particular benchmark, the inherent chaos of the benchmark, in addition to the small memory footprint, means neither approach wins over the other. 420 421 Looking next at the right column on Intel, Figures~\ref{fig:churn:jax:low:ops} and \ref{fig:churn:jax:low:ns} show the results for 1 \at for each \proc, and many of the differences between the runtimes disappear. 422 \CFA outperforms other runtimes by a minuscule margin. 423 Libfibre follows very closely behind with basically the same performance and scaling. 424 Tokio maintains effectively the same curve shapes as \CFA and libfibre, but it incurs extra costs for all \proc counts. 425 While Go maintains overall similar results to the others, it again encounters significant variation at high \proc counts. 426 Inexplicably resulting in super-linear scaling for some runs, \ie the scalability curves display a negative slope. 427 428 Interestingly, unlike the cycle benchmark, running with fewer \ats does not produce drastically different results. 429 In fact, the overall throughput stays almost exactly the same on the left and right columns. 394 430 395 431 \begin{figure} … … 407 443 } 408 444 409 \subfloat[][ Latency, 100 \ats per \proc]{445 \subfloat[][Scalability, 100 \ats per \proc]{ 410 446 \resizebox{0.5\linewidth}{!}{ 411 447 \input{result.churn.nasus.ns.pstex_t} … … 413 449 \label{fig:churn:nasus:ns} 414 450 } 415 \subfloat[][ Latency, 2 \ats per \proc]{451 \subfloat[][Scalability, 2 \ats per \proc]{ 416 452 \resizebox{0.5\linewidth}{!}{ 417 453 \input{result.churn.low.nasus.ns.pstex_t} … … 419 455 \label{fig:churn:nasus:low:ns} 420 456 } 421 \caption[Churn Benchmark on AMD]{\centering Churn Benchmark on AMD\smallskip\newline Throughput and latency of the Churn on the benchmark on the AMD machine. 422 For throughput, higher is better, for scalability, lower is better. Each series represent 15 independent runs, the dotted lines are extremes while the solid line is the medium.} 457 \caption[Churn Benchmark on AMD]{Churn Benchmark on AMD\smallskip\newline Throughput and scalability as a function of \proc count. 458 For throughput, higher is better, for scalability, lower is better. 459 Each series represent 15 independent runs, the dashed lines are the maximums of each series while the solid lines are the median and the dotted lines are the minimums.} 423 460 \label{fig:churn:nasus} 424 461 \end{figure} 425 462 426 Like for the cycle benchmark, here all runtimes achieve fairly similar performance. 427 Performance improves as long as all \procs fit on a single socket. 428 Beyond that performance starts to suffer from increased caching costs. 429 430 Indeed on Figures~\ref{fig:churn:jax:ops} and \ref{fig:churn:jax:ns} show that with 1 and 100 \ats per \proc, \CFA, libfibre, Go and Tokio achieve effectively equivalent performance for most \proc count. 431 432 However, Figure~\ref{fig:churn:nasus} again shows a somewhat different story on AMD. 433 While \CFA, libfibre, and Tokio achieve effectively equivalent performance for most \proc count, Go starts with better scaling at very low \proc counts but then performance quickly plateaus, resulting in worse performance at higher \proc counts. 434 This performance difference is visible at both high and low \at counts. 
435 436 One possible explanation for this difference is that since Go has very few available concurrent primitives, a channel was used instead of a semaphore. 437 On paper a semaphore can be replaced by a channel and with zero-sized objects passed along equivalent performance could be expected. 438 However, in practice there can be implementation difference between the two. 439 This is especially true if the semaphore count can get somewhat high. 440 Note that this replacement is also made in the cycle benchmark, however in that context it did not seem to have a notable impact. 441 442 As second possible explanation is that Go may sometimes use the heap when allocating variables based on the result of escape analysis of the code. 443 It is possible that variables that should be placed on the stack are placed on the heap. 444 This could cause extra pointer chasing in the benchmark, heightening locality effects. 445 Depending on how the heap is structure, this could also lead to false sharing. 446 447 The objective of this benchmark is to demonstrate that unparking \ats from remote \procs do not cause too much contention on the local queues. 448 Indeed, the fact all runtimes achieve some scaling at lower \proc count demonstrate that migrations do not need to be serialized. 449 Again these result demonstrate \CFA achieves satisfactory performance. 463 464 Looking now at the results for the AMD architecture, Figure~\ref{fig:churn:nasus}, the results show a somewhat different story. 465 Looking at the left column first, Figures~\ref{fig:churn:nasus:ops} and \ref{fig:churn:nasus:ns}, \CFA, Libfibre and Tokio all produce decent scalability. 466 \CFA suffers particularly from larger variations at higher \proc counts, but largely outperforms the other runtimes. 467 Go still produces intriguing results in this case and even more intriguingly, the results have fairly low variation. 468 469 One possible explanation for Go's difference is that it has very few available concurrent primitives, so a channel is substituted for a semaphore. 470 On paper, a semaphore can be replaced by a channel, and with zero-sized objects passed through the channel, equivalent performance could be expected. 471 However, in practice, there are implementation differences between the two, \eg if the semaphore count can get somewhat high so objects accumulate in the channel. 472 Note that this substitution is also made in the cycle benchmark; 473 however, in that context, it did not have a notable impact. 474 475 A second possible explanation is that Go may use the heap when allocating variables based on the result of the escape analysis of the code. 476 It is possible for variables that could be placed on the stack to instead be placed on the heap. 477 This placement could cause extra pointer chasing in the benchmark, heightening locality effects. 478 Depending on how the heap is structured, this could also lead to false sharing. 479 I did not investigate what causes these unusual results. 480 481 Looking next at the right column, Figures~\ref{fig:churn:nasus:low:ops} and \ref{fig:churn:nasus:low:ns}, as for Intel, all runtimes obtain overall similar throughput between the left and right column. 482 \CFA, Libfibre and Tokio all have very close results. 483 Go still suffers from poor scalability but is now unusual in a different way. 484 While it obtains effectively constant performance regardless of \proc count, this ``sequential'' performance is higher than the other runtimes for low \proc count. 
485 This advantage holds up to 32 \procs, after which the other runtimes manage to outscale Go. 486 487 In conclusion, the objective of this benchmark is to demonstrate that \glslink{atsched}{unparking} \ats from remote \procs does not cause too much contention on the local queues. 488 Indeed, the fact that most runtimes achieve some scaling between various \proc counts demonstrates migrations do not need to be serialized. 489 Again these results demonstrate that \CFA achieves satisfactory performance compared to the other runtimes. 450 490 451 491 \section{Locality} 492 493 As mentioned in the churn benchmark, when \glslink{atsched}{unparking} a \at, it is possible to either \unpark to the local or remote ready-queue.\footnote{ 494 It is also possible to \unpark to a third unrelated ready-queue, but without additional knowledge about the situation, it is likely to degrade performance.} 495 The locality experiment includes two variations of the churn benchmark, where a data array is added. 496 In both variations, before @V@ing the semaphore, each \at calls a @work@ function which increments random cells inside the data array. 497 In the noshare variation, the array is not passed on and each thread continuously accesses its private array. 498 In the share variation, the array is passed to another thread via the semaphore's shadow queue (each blocking thread can save a word of user data in its blocking node), transferring ownership of the array to the woken thread. 499 Figure~\ref{fig:locality:code} shows the pseudo code for this benchmark. 500 501 The objective here is to highlight the different decisions made by the runtime when \glslink{atsched}{unparking}. 502 Since each thread unparks a random semaphore, it is unlikely that a \at is unparked from the last \proc it ran on. 503 In the noshare variation, \glslink{atsched}{unparking} the \at on the local \proc is an appropriate choice since the data was last modified on that \proc. 504 In the share variation, \glslink{atsched}{unparking} the \at on a remote \proc is an appropriate choice. 505 506 The expectation for this benchmark is to see a performance inversion, where runtimes fare notably better in the variation which matches their \glslink{atsched}{unparking} policy. 507 This decision should lead to \CFA, Go and Tokio achieving better performance in the share variation while libfibre achieves better performance in noshare. 508 Indeed, \CFA, Go and Tokio have the default policy of \glslink{atsched}{unparking} \ats on the local \proc, whereas libfibre has the default policy of \glslink{atsched}{unparking} \ats wherever they last ran.
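The shadow queue mentioned above is the only non-standard piece of this setup, so a rough sketch of the idea is given below; this is not the \CFA library implementation, all internal synchronization is elided, and every identifier is hypothetical: the point is simply that each blocked thread's node carries one word of user data, which @V@ fills in before waking the thread.
\begin{cfa}
// Hypothetical sketch of a counting semaphore with a shadow queue;
// synchronization elided for brevity.
struct waiter { struct waiter * next; void * payload; struct thread * owner; };
struct shadow_sem { int count; struct waiter * queue; };

void * P( struct shadow_sem * s, void * mine ) {   // returns the array this thread now owns
	if ( s->count > 0 ) { s->count -= 1; return mine; }  // nonblocking case: keep the private array
	struct waiter w = { 0, mine, self() };
	enqueue( &s->queue, &w );                 // save one word of user data in the blocking node
	park();                                   // block until a matching V
	return w.payload;                         // array handed over by the waker
}

void V( struct shadow_sem * s, void * give ) {
	struct waiter * w = dequeue( &s->queue );
	if ( ! w ) { s->count += 1; return; }     // no waiter: just bump the count
	w->payload = give;                        // transfer ownership of the array
	unpark( w->owner );                       // wake the blocked thread
}
\end{cfa}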
452 509 453 510 \begin{figure} … … 493 550 \end{lrbox} 494 551 495 \subfloat[ Thread$_1$]{\label{f:CFibonacci}\usebox\myboxA}552 \subfloat[Noshare]{\label{fig:locality:code:T1}\usebox\myboxA} 496 553 \hspace{3pt} 497 554 \vrule 498 555 \hspace{3pt} 499 \subfloat[ Thread$_2$]{\label{f:CFAFibonacciGen}\usebox\myboxB}500 501 \caption[Locality Benchmark : Pseudo Code]{Locality Benchmark: Pseudo Code}556 \subfloat[Share]{\label{fig:locality:code:T2}\usebox\myboxB} 557 558 \caption[Locality Benchmark: Pseudo Code]{Locality Benchmark: Pseudo Code} 502 559 \label{fig:locality:code} 503 560 \end{figure} 504 561 505 As mentioned in the churn benchmark, when unparking a \at, it is possible to either unpark to the local or remote ready-queue.506 \footnote{It is also possible to unpark to a third unrelated ready-queue, but without additional knowledge about the situation, there is little to suggest this would not degrade performance.}507 The locality experiment includes two variations of the churn benchmark, where an array of data is added.508 In both variations, before @V@ing the semaphore, each \at increment random cells inside the array.509 The @share@ variation then passes the array to the shadow-queue of the semaphore, transferring ownership of the array to the woken thread.510 In the @noshare@ variation the array is not passed on and each thread continuously accesses its private array.511 512 The objective here is to highlight the different decision made by the runtime when unparking.513 Since each thread unparks a random semaphore, it means that it is unlikely that a \at will be unparked from the last \proc it ran on.514 In the @share@ version, this means that unparking the \at on the local \proc is appropriate since the data was last modified on that \proc.515 In the @noshare@ version, the unparking the \at on the remote \proc is the appropriate approach.516 517 The expectation for this benchmark is to see a performance inversion, where runtimes will fare notably better in the variation which matches their unparking policy.518 This should lead to \CFA, Go and Tokio achieving better performance in @share@ while libfibre achieves better performance in @noshare@.519 Indeed, \CFA, Go and Tokio have the default policy of unpark \ats on the local \proc, where as libfibre has the default policy of unparks \ats wherever they last ran.520 521 562 \subsection{Results} 522 563 564 Figures~\ref{fig:locality:jax} and \ref{fig:locality:nasus} show the results for the locality experiment on Intel and AMD, respectively. 565 In both cases, the graphs on the left column show the results for the share variation and the graphs on the right column show the results for the noshare. 566 Looking at the left column on Intel, Figures~\ref{fig:locality:jax:share:ops} and \ref{fig:locality:jax:share:ns} show the results for the share variation. 567 \CFA and Tokio slightly outperform libfibre, as expected, based on their \ats placement approach. 568 \CFA and Tokio both \unpark locally and do not suffer cache misses on the transferred array. 569 Libfibre, on the other hand, unparks remotely, and as such the unparked \at is likely to miss on the shared data. 570 Go trails behind in this experiment, presumably for the same reasons that were observable in the churn benchmark. 571 Otherwise, the results are similar to the churn benchmark, with lower throughput due to the array processing. 
572 As for most previous results, all runtimes suffer a performance hit after 48 \procs, which is the socket boundary, and climb again from 96 to 192 \procs. 573 523 574 \begin{figure} 524 575 \subfloat[][Throughput share]{ … … 547 598 \label{fig:locality:jax:noshare:ns} 548 599 } 549 \caption[Locality Benchmark on Intel]{Locality Benchmark on Intel\smallskip\newline Throughput and scalability as a function of \proc count. For throughput, higher is better, for scalability, lower is better. Each series represent 15 independent runs, the dotted lines are extremes while the solid line is the medium.} 600 \caption[Locality Benchmark on Intel]{Locality Benchmark on Intel\smallskip\newline Throughput and scalability as a function of \proc count. 601 For throughput, higher is better, for scalability, lower is better. 602 Each series represent 15 independent runs, the dashed lines are the maximums of each series while the solid lines are the median and the dotted lines are the minimums.} 550 603 \label{fig:locality:jax} 551 604 \end{figure} 605 552 606 \begin{figure} 553 607 \subfloat[][Throughput share]{ … … 576 630 \label{fig:locality:nasus:noshare:ns} 577 631 } 578 \caption[Locality Benchmark on AMD]{Locality Benchmark on AMD\smallskip\newline Throughput and scalability as a function of \proc count. For throughput, higher is better, for scalability, lower is better. Each series represent 15 independent runs, the dotted lines are extremes while the solid line is the medium.} 632 \caption[Locality Benchmark on AMD]{Locality Benchmark on AMD\smallskip\newline Throughput and scalability as a function of \proc count. 633 For throughput, higher is better, for scalability, lower is better. 634 Each series represent 15 independent runs, the dashed lines are the maximums of each series while the solid lines are the median and the dotted lines are the minimums.} 579 635 \label{fig:locality:nasus} 580 636 \end{figure} 581 637 582 Figures~\ref{fig:locality:jax} and \ref{fig:locality:nasus} shows the results on Intel and AMD respectively. 583 In both cases, the graphs on the left column show the results for the @share@ variation and the graphs on the right column show the results for the @noshare@. 584 585 On Intel, Figure~\ref{fig:locality:jax} shows Go trailing behind the 3 other runtimes. 586 On the left of the figure showing the results for the shared variation, where \CFA and Tokio slightly outperform libfibre as expected. 587 And correspondingly on the right, we see the expected performance inversion where libfibre now outperforms \CFA and Tokio. 588 Otherwise the results are similar to the churn benchmark, with lower throughput due to the array processing. 589 Presumably the reason why Go trails behind are the same as in Figure~\ref{fig:churn:nasus}. 590 591 Figure~\ref{fig:locality:nasus} shows the same experiment on AMD. 592 \todo{why is cfa slower?} 593 Again, we see the same story, where Tokio and libfibre swap places and Go trails behind. 638 Looking at the right column on Intel, Figures~\ref{fig:locality:jax:noshare:ops} and \ref{fig:locality:jax:noshare:ns} show the results for the noshare variation. 639 The graphs show the expected performance inversion where libfibre now outperforms \CFA and Tokio. 640 Indeed, in this case, unparking remotely means the unparked \at is less likely to suffer a cache miss on the array, which leaves the \at data structure and the remote queue as the only source of likely cache misses. 641 Results show both are amortized fairly well in this case. 
642 \CFA and Tokio both \unpark locally and, as a result, suffer a marginal performance degradation from the cache miss on the array. 643 644 Looking at the results for the AMD architecture, Figure~\ref{fig:locality:nasus} shows results similar to those on Intel. 645 Again, the overall performance is higher and slightly more variation is visible. 646 Looking at the left column first, Figures~\ref{fig:locality:nasus:share:ops} and \ref{fig:locality:nasus:share:ns}, \CFA and Tokio still outperform libfibre, this time more significantly. 647 This advantage is expected from the AMD server with its smaller and narrower caches that magnify the costs of processing the array. 648 Go still has the same poor performance as on Intel. 649 650 Finally, looking at the right column, Figures~\ref{fig:locality:nasus:noshare:ops} and \ref{fig:locality:nasus:noshare:ns}, like on Intel, the same performance inversion is present between libfibre and \CFA/Tokio. 651 Go still has the same poor performance. 652 653 Overall, this benchmark mostly demonstrates the two options available when \glslink{atsched}{unparking} a \at. 654 Depending on the workload, either of these options can be the appropriate one. 655 Since it is prohibitively difficult to dynamically detect which approach is appropriate, all runtimes must choose one of the two and live with the consequences. 656 657 Once again, these experiments demonstrate that \CFA achieves equivalent performance to the other runtimes, in this case matching the faster Tokio rather than Go, which is trailing behind. 594 658 595 659 \section{Transfer} 596 660 The last benchmark is more of an experiment than a benchmark. 597 661 It tests the behaviour of the schedulers for a misbehaved workload. 598 In this workload, one of the\at is selected at random to be the leader. 662 In this workload, one \at is selected at random to be the leader. 599 663 The leader then spins in a tight loop until it has observed that all other \ats have acknowledged its leadership. 600 664 The leader \at then picks a new \at to be the next leader and the cycle repeats. 601 The benchmark comes in two flavours for the non-leader \ats: 665 The benchmark comes in two variations for the non-leader \ats: 602 666 once they acknowledged the leader, they either block on a semaphore or spin yielding. 603 604 The experiment is designed to evaluate the short-term load-balancing of a scheduler. 605 Indeed, schedulers where the runnable \ats are partitioned on the \procs may need to balance the \ats for this experiment to terminate. 606 This problem occurs because the spinning \at is effectively preventing the \proc from running any other \at. 607 In the semaphore flavour, the number of runnable \ats eventually dwindles down to only the leader. 608 This scenario is a simpler case to handle for schedulers since \procs eventually run out of work. 609 In the yielding flavour, the number of runnable \ats stays constant. 610 This scenario is a harder case to handle because corrective measures must be taken even when work is available. 611 Note, runtime systems with preemption circumvent this problem by forcing the spinner to yield. 612 613 In both flavours, the experiment effectively measures how long it takes for all \ats to run once after a given synchronization point.
614 In an ideal scenario where the scheduler is strictly FIFO, every thread would run once after the synchronization and therefore the delay between leaders would be given by: 615 $ \frac{CSL + SL}{NP - 1}$, where $CSL$ is the context switch latency, $SL$ is the cost for enqueueing and dequeuing a \at and $NP$ is the number of \procs. 616 However, if the scheduler allows \ats to run many times before other \ats are able to run once, this delay will increase. 617 The semaphore version is an approximation of the strictly FIFO scheduling, where none of the \ats \emph{attempt} to run more than once. 618 The benchmark effectively provides the fairness guarantee in this case. 619 In the yielding version however, the benchmark provides no such guarantee, which means the scheduler has full responsibility and any unfairness will be measurable. 620 621 While this is a fairly artificial scenario, it requires only a few simple pieces. 622 The yielding version of this simply creates a scenario where a \at runs uninterrupted in a saturated system, and starvation has an easily measured impact. 623 However, \emph{any} \at that runs uninterrupted for a significant period of time in a saturated system could lead to this kind of starvation. 667 Figure~\ref{fig:transfer:code} shows pseudo code for this benchmark. 624 668 625 669 \begin{figure} … … 641 685 // pick next leader 642 686 leader := threads[ prng() % len(threads) ] 643 // wake every 687 // wake everyone 644 688 if ! exhaust { 645 689 for t in threads { … … 660 704 } 661 705 \end{cfa} 662 \caption[Transfer Benchmark : Pseudo Code]{Transfer Benchmark: Pseudo Code}706 \caption[Transfer Benchmark: Pseudo Code]{Transfer Benchmark: Pseudo Code} 663 707 \label{fig:transfer:code} 664 708 \end{figure} 665 709 710 The experiment is designed to evaluate the short-term load balancing of a scheduler. 711 Indeed, schedulers where the runnable \ats are partitioned on the \procs may need to balance the \ats for this experiment to terminate. 712 This problem occurs because the spinning \at is effectively preventing the \proc from running any other \at. 713 In the semaphore variation, the number of runnable \ats eventually dwindles to only the leader. 714 This scenario is a simpler case to handle for schedulers since \procs eventually run out of work. 715 In the yielding variation, the number of runnable \ats stays constant. 716 This scenario is a harder case to handle because corrective measures must be taken even when work is available. 717 Note that runtimes with preemption circumvent this problem by forcing the spinner to yield. 718 In \CFA preemption was disabled as it only obfuscates the results. 719 I am not aware of a method to disable preemption in Go. 720 721 In both variations, the experiment effectively measures how long it takes for all \ats to run once after a given synchronization point. 722 In an ideal scenario where the scheduler is strictly FIFO, every thread would run once after the synchronization and therefore the delay between leaders would be given by, $(CSL + SL) / (NP - 1)$, 723 where $CSL$ is the context-switch latency, $SL$ is the cost for enqueueing and dequeuing a \at, and $NP$ is the number of \procs. 724 However, if the scheduler allows \ats to run many times before other \ats can run once, this delay increases. 725 The semaphore version is an approximation of strictly FIFO scheduling, where none of the \ats \emph{attempt} to run more than once. 726 The benchmark effectively provides the fairness guarantee in this case. 
727 In the yielding version, however, the benchmark provides no such guarantee, which means the scheduler has full responsibility and any unfairness is measurable. 728 729 While this is an artificial scenario, in real life it requires only a few simple pieces. 730 The yielding version simply creates a scenario where a \at runs uninterrupted in a saturated system and the starvation has an easily measured impact. 731 Hence, \emph{any} \at that runs uninterrupted for a significant time in a saturated system could lead to this kind of starvation. 732 666 733 \subsection{Results} 667 \begin{figure} 734 735 \begin{table} 736 \caption[Transfer Benchmark on Intel and AMD]{Transfer Benchmark on Intel and AMD\smallskip\newline Average measurement of how long it takes for all \ats to acknowledge the leader \at. 737 DNC stands for ``did not complete'', meaning that after 5 seconds of a new leader being decided, some \ats still had not acknowledged the new leader.} 738 \label{fig:transfer:res} 739 \setlength{\extrarowheight}{2pt} 740 \setlength{\tabcolsep}{5pt} 668 741 \begin{centering} 669 \begin{tabular}{r | c c c c | c c c c } 670 Machine & \multicolumn{4}{c |}{Intel} & \multicolumn{4}{c}{AMD} \\ 671 Variation & \multicolumn{2}{c}{Park} & \multicolumn{2}{c |}{Yield} & \multicolumn{2}{c}{Park} & \multicolumn{2}{c}{Yield} \\ 742 \begin{tabular}{r | c | c | c | c | c | c | c | c} 743 Machine & \multicolumn{4}{c |}{Intel} & \multicolumn{4}{c}{AMD} \\ 744 \cline{2-9} 745 Variation & \multicolumn{2}{c|}{Park} & \multicolumn{2}{c |}{Yield} & \multicolumn{2}{c|}{Park} & \multicolumn{2}{c}{Yield} \\ 746 \cline{2-9} 672 747 \procs & 2 & 192 & 2 & 192 & 2 & 256 & 2 & 256 \\ 673 748 \hline … … 678 753 \end{tabular} 679 754 \end{centering} 680 \caption[Transfer Benchmark on Intel and AMD]{Transfer Benchmark on Intel and AMD\smallskip\newline Average measurement of how long it takes for all \ats to acknowledge the leader \at. DNC stands for ``did not complete'', meaning that after 5 seconds of a new leader being decided, some \ats still had not acknowledged the new leader.} 681 \label{fig:transfer:res} 682 \end{figure} 683 684 Figure~\ref{fig:transfer:res} shows the result for the transfer benchmark with 2 \procs and all \procs, where each experiment runs 100 \at per \proc. 755 \end{table} 756 757 Table~\ref{fig:transfer:res} shows the result for the transfer benchmark with 2 \procs and all \procs on the computer, where each experiment runs 100 \ats per \proc. 685 758 Note that the results here are only meaningful as a coarse measurement of fairness, beyond which small cost differences in the runtime and concurrent primitives begin to matter. 686 As such, data points that are the on the same order of magnitude as each other should be basicallyconsidered equal.687 Th e takeaway of this experiment is the presence of very large differences.688 The semaphore variation is denoted ``Park'', where the number of \ats dwindles downas the new leader is acknowledged.759 As such, data points within the same order of magnitude are considered equal. 760 That is, the takeaway of this experiment is the presence of very large differences. 761 The semaphore variation is denoted ``Park'', where the number of \ats dwindles as the new leader is acknowledged. 689 762 The yielding variation is denoted ``Yield''. 690 The experiment was only run for the extremes of the number of cores since the scaling per core behaves like previous experiments. 
691 This experiments clearly demonstrate that while the other runtimes achieve similar performance in previous benchmarks, here \CFA achieves significantly better fairness. 692 The semaphore variation serves as a control group, where all runtimes are expected to transfer leadership fairly quickly. 693 Since \ats block after acknowledging the leader, this experiment effectively measures how quickly \procs can steal \ats from the \proc running leader. 694 Figure~\ref{fig:transfer:res} shows that while Go and Tokio are slower, all runtime achieve decent latency. 763 The experiment is only run with a few and with many \procs, since scaling is not the focus of this experiment. 764 765 The first two columns show the results for the semaphore variation on Intel. 766 While there are some differences in latencies, with \CFA consistently the fastest and Tokio the slowest, all runtimes achieve fairly close results. 767 Again, this experiment is meant to highlight major differences, so latencies within $10\times$ of each other are considered equal. 768 769 Looking at the next two columns, the results for the yield variation on Intel, the story is very different. 770 \CFA achieves better latencies, presumably because the yield requires no synchronization. 771 Go does complete the experiment, but with drastically higher latency: 772 latency at 2 \procs is $350\times$ higher than \CFA and $70\times$ higher at 192 \procs. 773 This difference is because Go has a classic work-stealing scheduler, but it adds coarse-grained preemption, 774 which interrupts the spinning leader after a period. 775 Neither Libfibre nor Tokio complete the experiment. 776 Both runtimes also use classical work-stealing scheduling without preemption, and therefore, none of the work queues are ever emptied, so no load balancing occurs. 777 778 Looking now at the AMD architecture, the results show effectively the same story. 779 The first two columns show all runtimes obtaining results well within $10\times$ of each other. 780 The next two columns again show \CFA producing low latencies, while Go still has notably higher latency, but the difference is less drastic at 2 \procs, where it produces a $15\times$ difference, as opposed to a $100\times$ difference at 256 \procs. 781 Neither Libfibre nor Tokio complete the experiment. 782 783 This experiment clearly demonstrates that \CFA achieves significantly better fairness. 784 The semaphore variation serves as a control, where all runtimes are expected to transfer leadership fairly quickly. 785 Since \ats block after acknowledging the leader, this experiment effectively measures how quickly \procs can steal \ats from the \proc running the leader. 786 Table~\ref{fig:transfer:res} shows that while Go and Tokio are slower using the semaphore, all runtimes achieve decent latency. 787 695 788 However, the yielding variation shows an entirely different picture. 696 Since libfibre and Tokio have a traditional work-stealing scheduler, \procs that have \ats on their local queues willnever steal from other \procs. 697 The result is that the experiment simply does not complete for these runtime . 698 Without \procs stealing from the \proc running the leader, the experiment will simply neverterminate. 789 Since libfibre and Tokio have a traditional work-stealing scheduler, \procs that have \ats on their local queues never steal from other \procs. 790 The result is that the experiment simply does not complete for these runtimes.
791 Without \procs stealing from the \proc running the leader, the experiment cannot terminate. 699 792 Go manages to complete the experiment because it adds preemption on top of classic work-stealing. 700 However, since preemption is fairly costlyit achieves significantly worst performance. 793 However, since preemption is fairly infrequent, it achieves significantly worse performance. 701 794 In contrast, \CFA achieves equivalent performance in both variations, demonstrating very good fairness. 702 Interestingly \CFA achieves better delays in the yielding version than the semaphore version, however, that is likely due to fairness being equivalent but removing the cost of the semaphores and idle -sleep. 795 Interestingly, \CFA achieves better delays in the yielding version than in the semaphore version; however, that is likely due to fairness being equivalent while removing the cost of the semaphores and idle sleep. -
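To make the ``Park'' and ``Yield'' variations in Table~\ref{fig:transfer:res} concrete, the following is a minimal C sketch of the non-leader side of the benchmark, assuming C11 atomics and POSIX semaphores; @leader@, @acks@, @sleep_sem@ and @follower@ are illustrative stand-ins rather than the benchmark's actual code, and the case where a \at is itself chosen as the next leader is omitted.
\begin{cfa}
#include <stdatomic.h>
#include <sched.h>
#include <semaphore.h>

extern _Atomic int leader;        // index of the current leader
extern _Atomic int acks;          // how many threads have acknowledged it
extern sem_t sleep_sem[];         // one semaphore per thread, posted by the leader

static void follower( int me, int park ) {
	for ( ;; ) {
		int lead = atomic_load( &leader );
		atomic_fetch_add( &acks, 1 );             // acknowledge the new leader
		if ( park ) {                             // Park: block until the leader wakes everyone
			sem_wait( &sleep_sem[ me ] );
		} else {                                  // Yield: spin, yielding, until leadership moves
			while ( atomic_load( &leader ) == lead ) {
				sched_yield();
			}
		}
	}
}
\end{cfa}
In the Park variation, every \proc except the leader's eventually runs out of work, whereas in the Yield variation every \proc always has a runnable \at, so any rebalancing must happen while local work is still available.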
doc/theses/thierry_delisle_PhD/thesis/text/existing.tex
rebf8ca5 r23a08aa0 5 5 6 6 In general, \emph{selecting} a scheduling algorithm depends on how much information is available to the scheduler. 7 Workloads that are well -known, consistent, and homogeneous can benefit from a scheduler that is optimized to use this information, while ill-defined, inconsistent, heterogeneous workloads require general non-optimal algorithms.7 Workloads that are well known, consistent, and homogeneous can benefit from a scheduler that is optimized to use this information, while ill-defined, inconsistent, heterogeneous workloads require general non-optimal algorithms. 8 8 A secondary aspect is how much information can be gathered versus how much information must be given as part of the scheduler input. 9 9 This information adds to the spectrum of scheduling algorithms, going from static schedulers that are well informed from the start, to schedulers that gather most of the information needed, to schedulers that can only rely on very limited information. 10 Note, this description includes both information about each request s, \eg time to complete or resources needed, and information about the relationships among request, \eg whether or not some requestmust be completed before another request starts.11 12 Scheduling physical resources, \eg in an assembly line, is generally amenable to using well-informed scheduling , since information can be gathered much faster than the physical resources can be assigned and workloads are likely to stay stable for long periods of time.10 Note, this description includes both information about each request, \eg time to complete or resources needed, and information about the relationships among requests, \eg whether some requests must be completed before another request starts. 11 12 Scheduling physical resources, \eg in an assembly line, is generally amenable to using well-informed scheduling since information can be gathered much faster than the physical resources can be assigned and workloads are likely to stay stable for long periods. 13 13 When a faster pace is needed and changes are much more frequent gathering information on workloads, up-front or live, can become much more limiting and more general schedulers are needed. 14 14 15 15 \section{Naming Convention} 16 Scheduling has been studied by various communities concentrating on different incarnation of the same problems.17 As a result, there are no standard naming conventions for scheduling that isrespected across these communities.16 Scheduling has been studied by various communities concentrating on different incarnations of the same problems. 17 As a result, there are no standard naming conventions for scheduling that are respected across these communities. 18 18 This document uses the term \newterm{\Gls{at}} to refer to the abstract objects being scheduled and the term \newterm{\Gls{proc}} to refer to the concrete objects executing these \ats. 19 19 20 20 \section{Static Scheduling} 21 \newterm{Static schedulers} require \ats dependencies and costs be explicitly and exhaustively specified prior to scheduling.21 \newterm{Static schedulers} require \ats dependencies and costs to be explicitly and exhaustively specified prior to scheduling. 22 22 The scheduler then processes this input ahead of time and produces a \newterm{schedule} the system follows during execution. 23 23 This approach is popular in real-time systems since the need for strong guarantees justifies the cost of determining and supplying this information. 
24 In general, static schedulers are less relevant to this project because they require input from the programmers that the programming language does not have as part of its concurrency semantic.24 In general, static schedulers are less relevant to this project because they require input from the programmers that the \CFA programming language does not have as part of its concurrency semantics. 25 25 Specifying this information explicitly adds a significant burden to the programmer and reduces flexibility. 26 26 For this reason, the \CFA scheduler does not require this information. 27 27 28 28 \section{Dynamic Scheduling} 29 \newterm{Dynamic schedulers} determine \at sdependencies and costs during scheduling, if at all.30 Hence, unlike static scheduling, \at sdependencies are conditional and detected at runtime.31 This detection takes the form of observing new \ats (s) in the system and determining dependencies from their behaviour, including suspending or halting a \atsthat dynamically detects unfulfilled dependencies.32 Furthermore, each \at shas the responsibility of adding dependent \ats back into the system once dependencies are fulfilled.29 \newterm{Dynamic schedulers} determine \at dependencies and costs during scheduling, if at all. 30 Hence, unlike static scheduling, \at dependencies are conditional and detected at runtime. 31 This detection takes the form of observing new \ats in the system and determining dependencies from their behaviour, including suspending or halting a \at that dynamically detects unfulfilled dependencies. 32 Furthermore, each \at has the responsibility of adding dependent \ats back into the system once dependencies are fulfilled. 33 33 As a consequence, the scheduler often has an incomplete view of the system, seeing only \ats with no pending dependencies. 34 34 35 35 \subsection{Explicitly Informed Dynamic Schedulers} 36 While dynamic schedulers may not have an exhaustive list of dependencies for a \ats, some information may be available about each \ats, \eg expected duration, required resources, relative importance, \etc. 37 When available, a scheduler can then use this information to direct the scheduling decisions. \cit{Examples of schedulers with more information} 38 However, most programmers do not determine or even \emph{predict} this information; 39 at best, the scheduler has only some imprecise information provided by the programmer, \eg, indicating a \ats takes approximately 3--7 seconds to complete, rather than exactly 5 seconds. 40 Providing this kind of information is a significant programmer burden especially if the information does not scale with the number of \ats and their complexity. 41 For example, providing an exhaustive list of files read by 5 \ats is an easier requirement then providing an exhaustive list of memory addresses accessed by 10,000 independent \ats. 36 While dynamic schedulers may not have an exhaustive list of dependencies for a \at, some information may be available about each \at, \eg expected duration, required resources, relative importance, \etc. 37 When available, a scheduler can then use this information to direct the scheduling decisions. 38 For example, when scheduling in a cloud computing context, \ats will commonly have extra information that was manually entered, \eg caps on compute time or \io usage. 
39 However, in the context of user-level threading, most programmers do not determine or even \emph{predict} this information; 40 at best, the scheduler has only some imprecise information provided by the programmer, \eg, indicating a \at takes approximately 3--7 seconds to complete, rather than exactly 5 seconds. 41 Providing this kind of information is a significant programmer burden, especially if the information does not scale with the number of \ats and their complexity. 42 For example, providing an exhaustive list of files read by 5 \ats is an easier requirement than providing an exhaustive list of memory addresses accessed by 10,000 independent \ats. 42 43 43 44 Since the goal of this thesis is to provide a scheduler as a replacement for \CFA's existing \emph{uninformed} scheduler, explicitly informed schedulers are less relevant to this project. Nevertheless, some strategies are worth mentioning. … … 45 46 \subsubsection{Priority Scheduling} 46 47 Common information used by schedulers to direct their algorithm is priorities. 47 Each \at s is given a priorityand higher-priority \ats are preferred to lower-priority ones.48 The simplest priority scheduling algorithm is to require that every \at shave a distinct pre-established priority and always run the available \ats with the highest priority.48 Each \at is given a priority, and higher-priority \ats are preferred to lower-priority ones. 49 The simplest priority scheduling algorithm is to require that every \at have a distinct pre-established priority and always run the available \ats with the highest priority. 49 50 Asking programmers to provide an exhaustive set of unique priorities can be prohibitive when the system has a large number of \ats. 50 It can therefore be desirable for schedulers to support \ats with identical priorities and/or automatically set ting and adjustingpriorities for \ats.51 It can therefore be desirable for schedulers to support \ats with identical priorities and/or automatically set and adjust priorities for \ats. 51 52 Most common operating systems use some variant on priorities with overlaps and dynamic priority adjustments. 52 53 For example, Microsoft Windows uses a pair of priorities~\cite{win:priority}, one specified by users out of ten possible options and one adjusted by the system. 53 54 54 55 \subsection{Uninformed and Self-Informed Dynamic Schedulers} 55 Several scheduling algorithms do not require programmers to provide additional information on each \at s, and insteadmake scheduling decisions based solely on internal state and/or information implicitly gathered by the scheduler.56 Several scheduling algorithms do not require programmers to provide additional information on each \at, and instead, make scheduling decisions based solely on internal state and/or information implicitly gathered by the scheduler. 56 57 57 58 58 59 \subsubsection{Feedback Scheduling} 59 As mentioned, schedulers may also gather information about each \at sto direct their decisions.60 As mentioned, schedulers may also gather information about each \at to direct their decisions. 60 61 This design effectively moves the scheduler into the realm of \newterm{Control Theory}~\cite{wiki:controltheory}. 61 62 This information gathering does not generally involve programmers, and as such, does not increase programmer burden the same way explicitly provided information may. 
62 However, some feedback schedulers do allow programmers to offer additional information on certain \ats, in orderto direct scheduling decisions.63 The important distinction being whether or notthe scheduler can function without this additional information.63 However, some feedback schedulers do allow programmers to offer additional information on certain \ats, to direct scheduling decisions. 64 The important distinction is whether the scheduler can function without this additional information. 64 65 65 66 66 67 \section{Work Stealing}\label{existing:workstealing} 67 One of the most popular scheduling algorithm in practice (see~\ref{existing:prod}) is work stealing.68 This idea, introduce by \cite{DBLP:conf/fpca/BurtonS81}, effectively has each worker process its local \ats first,but allows the possibility for other workers to steal local \ats if they run out of \ats.69 \cite{DBLP:conf/focs/Blumofe94} introduced the more familiar incarnation of this, where each worker s has a queue of \ats and workers without \ats steal \ats from random workers\footnote{The Burton and Sleep algorithm had trees of \ats and stealonly among neighbours.}.70 Blumofe and Leiserson also prove worst 68 One of the most popular scheduling algorithms in practice (see~\ref{existing:prod}) is work stealing. 69 This idea, introduced by \cite{DBLP:conf/fpca/BurtonS81}, effectively has each worker process its local \ats first but allows the possibility for other workers to steal local \ats if they run out of \ats. 70 \cite{DBLP:conf/focs/Blumofe94} introduced the more familiar incarnation of this, where each worker has a queue of \ats and workers without \ats steal \ats from random workers\footnote{The Burton and Sleep algorithm has trees of \ats and steals only among neighbours.}. 71 Blumofe and Leiserson also prove worst-case space and time requirements for well-structured computations. 71 72 72 73 Many variations of this algorithm have been proposed over the years~\cite{DBLP:journals/ijpp/YangH18}, both optimizations of existing implementations and approaches that account for new metrics. … … 76 77 In general, fine granularity is better for load balancing and coarse granularity reduces communication overhead. 77 78 The best performance generally means finding a middle ground between the two. 78 Several methods can be employed, but I believe these are less relevant for threads, which are generally explicit and more coarse grained. 79 80 \paragraph{Task Placement} Since modern computers rely heavily on cache hierarchies\cit{Do I need a citation for this}, migrating \ats from one core to another can be . \cite{DBLP:journals/tpds/SquillanteL93} 81 82 \todo{The survey is not great on this subject} 83 84 \paragraph{Complex Machine Architecture} Another aspect that has been examined is how well work stealing is applicable to different machine architectures. 79 Several methods can be employed, but I believe these are less relevant for threads, which are generally explicit and more coarse-grained. 80 81 \paragraph{Task Placement} Another aspect of work stealing that has been studied extensively is the mapping between \at and \proc. 82 In its simplest form, work stealing assumes that all \procs are interchangeable and therefore the mapping between \at and \proc is not interesting. 83 However, in real-life architectures there are contexts where different \procs can have different characteristics, which makes some mapping more interesting than others. 
84 A common example where this is statically true is architectures with \glsxtrshort{numa}. 85 In these cases, it can be relevant to change the scheduler to be cognizant of the topology~\cite{vikranth2013topology,min2011hierarchical}. 86 Another example is energy usage, where the scheduler is modified to optimize for energy efficiency in addition/instead of performance~\cite{ribic2014energy,torng2016asymmetry}. 87 88 \paragraph{Complex Machine Architecture} Another aspect that has been examined is how applicable work stealing is to different machine architectures. 89 This is arguably strongly related to Task Placement but extends into more heterogeneous architectures. 90 As \CFA offers no particular support for heterogeneous architecture, this is also an area that is less relevant to this thesis. 91 Although it could be an interesting avenue for future work. 85 92 86 93 \subsection{Theoretical Results} 87 There is also a large body of research on the theoretical aspects of work stealing. These evaluate, for example, the cost of migration~\cite{DBLP:conf/sigmetrics/SquillanteN91,DBLP:journals/pe/EagerLZ86}, how affinity affects performance~\cite{DBLP:journals/tpds/SquillanteL93,DBLP:journals/mst/AcarBB02,DBLP:journals/ipl/SuksompongLS16} and theoretical models for heterogeneous systems~\cite{DBLP:journals/jpdc/MirchandaneyTS90,DBLP:journals/mst/BenderR02,DBLP:conf/sigmetrics/GastG10}.94 There is also a large body of research on the theoretical aspects of work stealing. These evaluate, for example, the cost of \glslink{atmig}{migration}~\cite{DBLP:conf/sigmetrics/SquillanteN91,DBLP:journals/pe/EagerLZ86}, how affinity affects performance~\cite{DBLP:journals/tpds/SquillanteL93,DBLP:journals/mst/AcarBB02,DBLP:journals/ipl/SuksompongLS16} and theoretical models for heterogeneous systems~\cite{DBLP:journals/jpdc/MirchandaneyTS90,DBLP:journals/mst/BenderR02,DBLP:conf/sigmetrics/GastG10}. 88 95 \cite{DBLP:journals/jacm/BlellochGM99} examines the space bounds of work stealing and \cite{DBLP:journals/siamcomp/BerenbrinkFG03} shows that for under-loaded systems, the scheduler completes its computations in finite time, \ie is \newterm{stable}. 89 Others show that work stealing is applicableto various scheduling contexts~\cite{DBLP:journals/mst/AroraBP01,DBLP:journals/anor/TchiboukdjianGT13,DBLP:conf/isaac/TchiboukdjianGTRB10,DBLP:conf/ppopp/AgrawalLS10,DBLP:conf/spaa/AgrawalFLSSU14}.96 Others show that work stealing applies to various scheduling contexts~\cite{DBLP:journals/mst/AroraBP01,DBLP:journals/anor/TchiboukdjianGT13,DBLP:conf/isaac/TchiboukdjianGTRB10,DBLP:conf/ppopp/AgrawalLS10,DBLP:conf/spaa/AgrawalFLSSU14}. 90 97 \cite{DBLP:conf/ipps/ColeR13} also studied how randomized work-stealing affects false sharing among \ats. 91 98 92 However, as \cite{DBLP:journals/ijpp/YangH18} highlights, it is worth mentioning that this theoretical research has mainly focused on ``fully -strict'' computations, \ie workloads that can be fully represented with a direct acyclic graph.93 It is unclear how well these distributions represent workloads in real 99 However, as \cite{DBLP:journals/ijpp/YangH18} highlights, it is worth mentioning that this theoretical research has mainly focused on ``fully strict'' computations, \ie workloads that can be fully represented with a direct acyclic graph. 100 It is unclear how well these distributions represent workloads in real-world scenarios. 
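As a reference point for the work-stealing discussion above, the following is a minimal C sketch of the basic randomized work-stealing loop; @Deque@, @deque_pop_bottom@, @deque_steal_top@ and the other names are illustrative placeholders rather than any existing library's API.
\begin{cfa}
#include <stdlib.h>

typedef struct Task Task;
typedef struct Deque Deque;                    // one double-ended queue per worker

extern Task * deque_pop_bottom( Deque * );     // owner end, LIFO
extern Task * deque_steal_top( Deque * );      // thief end, FIFO
extern void run( Task * );
extern Deque * queues[];                       // queues[w] belongs to worker w
extern int nworkers;

static void worker_loop( int me, unsigned int seed ) {
	for ( ;; ) {
		Task * t = deque_pop_bottom( queues[ me ] );     // local work first
		while ( t == NULL ) {                            // out of local work: become a thief
			int victim = rand_r( &seed ) % nworkers;
			if ( victim == me ) continue;                // do not steal from ourselves
			t = deque_steal_top( queues[ victim ] );     // steal from a random victim
		}
		run( t );
	}
}
\end{cfa}
Most of the variations cited above keep this basic loop and adjust the queue implementation, the victim-selection policy, or where newly created \ats are pushed.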
94 101 95 102 \section{Preemption} 96 103 One last aspect of scheduling is preemption since many schedulers rely on it for some of their guarantees. 97 104 Preemption is the idea of interrupting \ats that have been running too long, effectively injecting suspend points into the application. 98 There are multiple techniques to achieve this effect but they all aim to guarantee that the suspend points in a \atsare never further apart than some fixed duration.99 While this helps schedulers guarantee that no \ats unfairly monopolize sa worker, preemption can effectively be added to any scheduler.100 Therefore, the only interesting aspect of preemption for the design of scheduling is whether or notto require it.105 There are multiple techniques to achieve this effect, but they all aim to guarantee that the suspend points in a \at are never further apart than some fixed duration. 106 While this helps schedulers guarantee that no \ats unfairly monopolize a worker, preemption can effectively be added to any scheduler. 107 Therefore, the only interesting aspect of preemption for the design of scheduling is whether to require it. 101 108 102 109 \section{Production Schedulers}\label{existing:prod} … … 104 111 While these schedulers do not necessarily represent the most recent advances in scheduling, they are what is generally accessible to programmers. 105 112 As such, I believe these schedulers are at least as relevant as those presented in published work. 106 Schedulers that operate in kernel space and inuser space are considered, as both can offer relevant insight for this project.113 Both Schedulers that operate in kernel space and user space are considered, as both can offer relevant insight for this project. 107 114 However, real-time schedulers are not considered, as these have constraints that are much stricter than what is needed for this project. 108 115 109 116 \subsection{Operating System Schedulers} 110 Operating System Schedulers tend to be fairly complex as they generally support some amount of real -time, aim to balance interactive and non-interactive \ats and support multiple users sharing hardware without requiring these users to cooperate.117 Operating System Schedulers tend to be fairly complex as they generally support some amount of real time, aim to balance interactive and non-interactive \ats and support multiple users sharing hardware without requiring these users to cooperate. 111 118 Here are more details on a few schedulers used in the common operating systems: Linux, FreeBSD, Microsoft Windows and Apple's OS X. 112 The information is less complete for operating systems with closed source.119 The information is less complete for closed source operating systems. 113 120 114 121 \paragraph{Linux's CFS} 115 122 The default scheduler used by Linux, the Completely Fair Scheduler~\cite{MAN:linux/cfs,MAN:linux/cfs2}, is a feedback scheduler based on CPU time. 116 123 For each processor, it constructs a Red-Black tree of \ats waiting to run, ordering them by the amount of CPU time used. 117 The \at sthat has used the least CPU time is scheduled.124 The \at that has used the least CPU time is scheduled. 118 125 It also supports the concept of \newterm{Nice values}, which are effectively multiplicative factors on the CPU time used. 
119 The ordering of \ats is also affected by a group 120 Linux achieves load-balancing by regularly monitoring the system state~\cite{MAN:linux/cfs/balancing} and using some heuristic on the load, currently CPU time used in the last millisecond plus a decayed version of the previous time slots~\cite{MAN:linux/cfs/pelt}.121 122 \cite{DBLP:conf/eurosys/LoziLFGQF16} shows that Linux's CFS also does work stealing to balance the workload of each processors, but the paper argues this aspect can be improved significantly.123 The issues highlighted stem from Linux's need to support fairness across \ats \emph{and} across users\footnote{Enforcing fairness across users means that given two users, one with a single \at s and the other with one thousand \ats, the user with a single \ats does not receive onethousandth of the CPU time.}, increasing the complexity.124 125 Linux also offers a FIFO scheduler, a real-time scheduler, which runs the highest-priority \ats, and a round-robin scheduler, which is an extension of the FIFO -scheduler that adds fixed time slices. \cite{MAN:linux/sched}126 The ordering of \ats is also affected by a group-based notion of fairness, where \ats belonging to groups having used less CPU time are preferred to \ats belonging to groups having used more CPU time. 127 Linux achieves load-balancing by regularly monitoring the system state~\cite{MAN:linux/cfs/balancing} and using some heuristic on the \gls{load}, currently CPU time used in the last millisecond plus a decayed version of the previous time slots~\cite{MAN:linux/cfs/pelt}. 128 129 \cite{DBLP:conf/eurosys/LoziLFGQF16} shows that Linux's CFS also does work stealing to balance the workload of each \proc, but the paper argues this aspect can be improved significantly. 130 The issues highlighted stem from Linux's need to support fairness across \ats \emph{and} across users\footnote{Enforcing fairness across users means that given two users, one with a single \at and the other with one thousand \ats, the user with a single \at does not receive one-thousandth of the CPU time.}, increasing the complexity. 131 132 Linux also offers a FIFO scheduler, a real-time scheduler, which runs the highest-priority \ats, and a round-robin scheduler, which is an extension of the FIFO scheduler that adds fixed time slices. \cite{MAN:linux/sched} 126 133 127 134 \paragraph{FreeBSD} 128 135 The ULE scheduler used in FreeBSD\cite{DBLP:conf/bsdcon/Roberson03} is a feedback scheduler similar to Linux's CFS. 129 136 It uses different data structures and heuristics but also schedules according to some combination of CPU time used and niceness values. 130 It also periodically balances the load of the system (according to a different heuristic) ,but uses a simpler work stealing approach.137 It also periodically balances the load of the system (according to a different heuristic) but uses a simpler work stealing approach. 131 138 132 139 \paragraph{Windows(OS)} 133 140 Microsoft's Operating System's Scheduler~\cite{MAN:windows/scheduler} is a feedback scheduler with priorities. 134 141 It supports 32 levels of priorities, some of which are reserved for real-time and privileged applications. 135 It schedules \ats based on the highest priorities (lowest number) and how much CPU time each \at shas used.142 It schedules \ats based on the highest priorities (lowest number) and how much CPU time each \at has used. 136 143 The scheduler may also temporarily adjust priorities after certain effects like the completion of I/O requests. 
137 144 138 In~\cite{russinovich2009windows}, Chapter 1 section ``Processes, Threads, and Jobs''\todo{Look up section number.} discusses the scheduling policy more indepth.139 Multicore scheduling is based on a combination of priorities and preferred \proc.145 In~\cite{russinovich2009windows}, Chapter 1 section 2.3 ``Processes, Threads, and Jobs'' discusses the scheduling policy more in-depth. 146 Multicore scheduling is based on a combination of priorities and \proc preference. 140 147 Each \at is assigned an initial processor using a round-robin policy, called the \at's \newterm{ideal} \proc. 141 148 \Glspl{at} are distributed among the \procs according to their priority, preferring to match \ats to their ideal \proc and then to the last \proc they ran on. 142 This approach is a variation of work stealing, where the stealing \proc restore the \at to its original \proc after running it, but mixed with priorities.149 This approach is a variation of work stealing, where the stealing \proc restores the \at to its original \proc after running it, but mixed with priorities. 143 150 144 151 \paragraph{Apple OS X} … … 152 159 \end{displayquote} 153 160 154 \todo{load balancing} 161 There is very little documentation on the internals of this scheduler. 162 However, the documentation does describe a feature set that is very similar to the Windows and Linux OS schedulers. 163 Presumably, this means that the internals are also fairly similar overall. 155 164 156 165 \subsection{User-Level Schedulers} 157 By comparison, user level schedulers tend to be simpler, gatheringfewer metrics and avoid complex notions of fairness. Part of the simplicity is due to the fact that all \ats have the same user, and therefore cooperation is both feasible and probable.166 By comparison, user-level schedulers tend to be simpler, gather fewer metrics and avoid complex notions of fairness. Part of the simplicity is due to the fact that all \ats have the same user, and therefore cooperation is both feasible and probable. 158 167 159 168 \paragraph{Go}\label{GoSafePoint} 160 169 Go's scheduler uses a randomized work-stealing algorithm that has a global run-queue (\emph{GRQ}) and each processor (\emph{P}) has both a fixed-size run-queue (\emph{LRQ}) and a high-priority next ``chair'' holding a single element~\cite{GITHUB:go,YTUBE:go}. 161 Preemption is present, but only at safe -points,~\cite{go:safepoints} which are inserted detection codeat various frequent access boundaries.170 Preemption is present, but only at safe points,~\cite{go:safepoints} which are detection code inserted at various frequent access boundaries. 162 171 163 172 The algorithm is as follows : … … 175 184 Erlang is a functional language that supports concurrency in the form of processes: threads that share no data. 176 185 It uses a kind of round-robin scheduler, with a mix of work sharing and stealing to achieve load balancing~\cite{:erlang}, where under-loaded workers steal from other workers, but overloaded workers also push work to other workers. 177 This migration logic is directed by monitoring logic that evaluates the load a few times per seconds.186 This \glslink{atmig}{migration} logic is directed by monitoring logic that evaluates the load a few times per second. 178 187 179 188 \paragraph{Intel\textregistered ~Threading Building Blocks} 180 189 \newterm{Thread Building Blocks} (TBB) is Intel's task parallelism \cite{wiki:taskparallel} framework. 
181 It runs \newterm{jobs}, which are uninterrupt able \ats that must always run to completion, on a pool of worker threads.190 It runs \newterm{jobs}, which are uninterruptible \ats that must always run to completion, on a pool of worker threads. 182 191 TBB's scheduler is a variation of randomized work-stealing that also supports higher-priority graph-like dependencies~\cite{MAN:tbb/scheduler}. 183 It schedules \ats as follows (where \textit{t} is the last \at scompleted):192 It schedules \ats as follows (where \textit{t} is the last \at completed): 184 193 \begin{displayquote} 185 194 \begin{enumerate} 186 195 \item The task returned by \textit{t}@.execute()@ 187 196 \item The successor of t if \textit{t} was its last completed predecessor. 188 \item A task popped from the end of the thread's own deque.189 \item A task with a ffinity for the thread.197 \item A task popped from the end of the thread's own queue. 198 \item A task with an affinity for the thread. 190 199 \item A task popped from approximately the beginning of the shared queue. 191 \item A task popped from the beginning of another randomly chosen thread's deque.200 \item A task popped from the beginning of another randomly chosen thread's queue. 192 201 \end{enumerate} 193 202 … … 208 217 While the documentation only gives limited insight into the scheduling and load balancing approach, \cite{apple:gcd2} suggests a fairly classic approach. 209 218 Each \proc has a queue of \ats to run, called \newterm{blocks}, which are drained in \glsxtrshort{fifo}. 210 \todo{update: They seem to add the concept of dependent queues with clear ordering, where executing a block ends-up scheduling more blocks. 211 In terms of semantics, these Dispatch Queues seem to be very similar to Intel\textregistered ~TBB \lstinline{execute()} and predecessor semantics.} 212 219 GCD also has secondary queues, called \newterm{Dispatch Queues}, with clear ordering, where executing a block ends up scheduling more blocks. 220 In terms of semantics, these Dispatch Queues seem to be very similar to Intel\textregistered ~TBB \lstinline{execute()} and predecessor semantics. 221 222 The similarity of API and semantics between GCD and Intel\textregistered ~TBB suggest the underlying scheduling algorithms are similar. 213 223 214 224 \paragraph{LibFibre} 215 LibFibre~\cite{DBLP:journals/pomacs/KarstenB20} is a light -weight user-level threading framework developed at the University of Waterloo.216 Similarly to Go, it uses a variation of work stealing with a global queue that ishigher priority than stealing.225 LibFibre~\cite{DBLP:journals/pomacs/KarstenB20} is a lightweight user-level threading framework developed at the University of Waterloo. 226 Similarly to Go, it uses a variation of work stealing with a global queue that has a higher priority than stealing. 217 227 Unlike Go, it does not have the high-priority next ``chair'' and does not use randomized work-stealing. -
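As a rough illustration of the kind of dequeue order these user-level runtimes use, the following C sketch encodes the TBB-style selection order quoted above; every helper (@next_from_execute@, @successor_of@, \etc) is an illustrative stand-in, not the actual TBB API.
\begin{cfa}
typedef struct Task Task;

extern Task * next_from_execute( Task * last );   // 1. task returned by t.execute()
extern Task * successor_of( Task * last );        // 2. successor of t, if t was its last predecessor
extern Task * pop_local_back( int worker );       // 3. back of the worker's own queue
extern Task * pop_affinity( int worker );         // 4. a task with affinity for this worker
extern Task * pop_shared_front( void );           // 5. front of the shared queue
extern Task * steal_random_front( int worker );   // 6. front of a random victim's queue

static Task * next_task( int worker, Task * last ) {
	Task * t;
	if ( ( t = next_from_execute( last ) ) ) return t;
	if ( ( t = successor_of( last ) ) ) return t;
	if ( ( t = pop_local_back( worker ) ) ) return t;
	if ( ( t = pop_affinity( worker ) ) ) return t;
	if ( ( t = pop_shared_front() ) ) return t;
	return steal_random_front( worker );
}
\end{cfa}
The Go and libfibre descriptions above follow the same shape, differing mainly in which queues exist and in what order they are polled.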
doc/theses/thierry_delisle_PhD/thesis/text/front.tex
rebf8ca5 r23a08aa0 39 39 \vspace*{2.0cm} 40 40 41 Waterloo, Ontario, Canada, 202 1\\42 43 \vspace*{1.0cm} 44 45 \copyright\ Thierry Delisle 202 1\\41 Waterloo, Ontario, Canada, 2022 \\ 42 43 \vspace*{1.0cm} 44 45 \copyright\ Thierry Delisle 2022 \\ 46 46 \end{center} 47 47 \end{titlepage} … … 60 60 \noindent 61 61 The following served on the Examining Committee for this thesis. The decision of the Examining Committee is by majority vote. 62 \todo{External Examiners} 63 \bigskip 64 65 \ noindent66 \begin{tabbing} 67 Internal-External Member: \= \kill % using longest text to define tab length68 External Examiner: \> TBD\\69 \> TBD\\62 \bigskip 63 64 \noindent 65 \begin{tabbing} 66 Internal-External Member: \= \kill % using longest text to define tab length 67 External Examiner: \> Doug Lea \\ 68 \> Professor, Computer Science Department \\ 69 \> State University of New York at Oswego \\ 70 70 \end{tabbing} 71 71 \bigskip … … 96 96 \begin{tabbing} 97 97 Internal-External Member: \= \kill % using longest text to define tab length 98 Internal-External Member: \> TBD\\99 \> TBD\\98 Internal-External Member: \> Patrick Lam \\ 99 \> Associate Professor, Department of Electrical and Computer Engineering \\ 100 100 \> University of Waterloo \\ 101 101 \end{tabbing} … … 124 124 125 125 User-Level threading (M:N) is gaining popularity over kernel-level threading (1:1) in many programming languages. 126 The user threading approach is often a better mechanism to express complex concurrent applications by efficiently running 10,000+ threads on multi -core systems.126 The user threading approach is often a better mechanism to express complex concurrent applications by efficiently running 10,000+ threads on multicore systems. 127 127 Indeed, over-partitioning into small work-units with user threading significantly eases load bal\-ancing, while simultaneously providing advanced synchronization and mutual exclusion capabilities. 128 128 To manage these high levels of concurrency, the underlying runtime must efficiently schedule many user threads across a few kernel threads; 129 which begs ofthe question of how many kernel threads are needed and should the number be dynamically reevaluated.129 which begs the question of how many kernel threads are needed and should the number be dynamically reevaluated. 130 130 Furthermore, scheduling must prevent kernel threads from blocking, otherwise user-thread parallelism drops. 131 When user-threading parallelism does drop, how and when should idle kernel-threadsbe put to sleep to avoid wasting CPU resources.131 When user-threading parallelism does drop, how and when should idle \glspl{kthrd} be put to sleep to avoid wasting CPU resources. 
132 132 Finally, the scheduling system must provide fairness to prevent a user thread from monopolizing a kernel thread; 133 otherwise other user threads can experience short/long term starvation or kernel threads can deadlock waiting for events to occur on busy kernel threads.134 135 This thesis analyses multiple scheduler systems, where each system attempts to fulfill the necessaryrequirements for user-level threading.136 The predominant technique for managing high levels of concurrency is sharding the ready -queue with one queue per kernel-threadand using some form of work stealing/sharing to dynamically rebalance workload shifts.137 Preventing kernel blocking is accomplish by transforming kernel locks and I/O operations into user-level operations that do not block the kernel thread or spin up new kernel threads to manage the blocking.133 otherwise, other user threads can experience short/long term starvation or kernel threads can deadlock waiting for events to occur on busy kernel threads. 134 135 This thesis analyses multiple scheduler systems, where each system attempts to fulfill the requirements for user-level threading. 136 The predominant technique for managing high levels of concurrency is sharding the ready queue with one queue per \gls{kthrd} and using some form of work stealing/sharing to dynamically rebalance workload shifts. 137 Preventing kernel blocking is accomplished by transforming kernel locks and I/O operations into user-level operations that do not block the kernel thread or spin up new kernel threads to manage the blocking. 138 138 Fairness is handled through preemption and/or ad-hoc solutions, which leads to coarse-grained fairness with some pathological cases. 139 139 … … 146 146 The new scheduler also includes support for implicit nonblocking \io, allowing applications to have more user-threads blocking on \io operations than there are \glspl{kthrd}. 147 147 The implementation is based on @io_uring@, a recent addition to the Linux kernel, and achieves the same performance and fairness as systems using @select@, @epoll@, \etc. 148 To complete the scheduler, an idle sleep mechanism is implemented that significantly reduces wasted CPU cycles, which are then available outside ofthe application.148 To complete the scheduler, an idle sleep mechanism is implemented that significantly reduces wasted CPU cycles, which are then available outside the application. 149 149 150 150 \cleardoublepage … … 179 179 \phantomsection % allows hyperref to link to the correct page 180 180 181 % L I S T O F F I G U R E S 182 % ----------------------------- 183 \addcontentsline{toc}{chapter}{List of Figures} 184 \listoffigures 185 \cleardoublepage 186 \phantomsection % allows hyperref to link to the correct page 187 181 188 % L I S T O F T A B L E S 182 189 % --------------------------- … … 186 193 \phantomsection % allows hyperref to link to the correct page 187 194 188 % L I S T O F F I G U R E S189 % -----------------------------190 \addcontentsline{toc}{chapter}{List of Figures}191 \listoffigures192 \cleardoublepage193 \phantomsection % allows hyperref to link to the correct page194 195 195 % GLOSSARIES (Lists of definitions, abbreviations, symbols, etc. 
provided by the glossaries-extra package) 196 196 % ----------------------------- … … 199 199 \phantomsection % allows hyperref to link to the correct page 200 200 201 % TODOs and missing citations202 % -----------------------------203 \listofcits204 \listoftodos205 \cleardoublepage206 \phantomsection % allows hyperref to link to the correct page207 208 209 201 % Change page numbering back to Arabic numerals 210 202 \pagenumbering{arabic} -
doc/theses/thierry_delisle_PhD/thesis/text/intro.tex
rebf8ca5 r23a08aa0 2 2 3 3 \Gls{uthrding} (M:N) is gaining popularity over kernel-level threading (1:1) in many programming languages. 4 The user threading approach is often a better mechanism to express complex concurrent applications by efficiently running 10,000+ threads on multi -core systems.5 Indeed, over-partitioning into small work -units with user threading significantly eases load bal\-ancing, while simultaneously providing advanced synchronization and mutual exclusion capabilities.4 The user threading approach is often a better mechanism to express complex concurrent applications by efficiently running 10,000+ threads on multicore systems. 5 Indeed, over-partitioning into small work units with user threading significantly eases load bal\-ancing, while simultaneously providing advanced synchronization and mutual exclusion capabilities. 6 6 To manage these high levels of concurrency, the underlying runtime must efficiently schedule many user threads across a few kernel threads; 7 which begs ofthe question of how many kernel threads are needed and should the number be dynamically reevaluated.7 which begs the question of how many kernel threads are needed and should the number be dynamically reevaluated. 8 8 Furthermore, scheduling must prevent kernel threads from blocking, otherwise user-thread parallelism drops. 9 9 When user-threading parallelism does drop, how and when should idle kernel-threads be put to sleep to avoid wasting CPU resources. 10 10 Finally, the scheduling system must provide fairness to prevent a user thread from monopolizing a kernel thread; 11 otherwise other user threads can experience short/long term starvation or kernel threads can deadlock waiting for events to occur on busy kernel threads.11 otherwise, other user threads can experience short/long term starvation or kernel threads can deadlock waiting for events to occur on busy kernel threads. 12 12 13 This thesis analy ses multiple scheduler systems, where each system attempts to fulfill the necessaryrequirements for \gls{uthrding}.14 The predominant technique for managing high levels of concurrency is sharding the ready -queue with one queue per kernel-thread and using some form of work stealing/sharing to dynamically rebalance workload shifts.15 Preventing kernel blocking is accomplish by transforming kernel locks and I/O operations into user-level operations that do not block the kernel thread or spin up new kernel threads to manage the blocking.16 Fairness is handled through preemption and/or ad -hoc solutions, which leads to coarse-grained fairness with some pathological cases.13 This thesis analyzes multiple scheduler systems, where each system attempts to fulfill the requirements for \gls{uthrding}. 14 The predominant technique for managing high levels of concurrency is sharding the ready queue with one queue per kernel thread and using some form of work stealing/sharing to dynamically rebalance workload shifts. 15 Preventing kernel blocking is accomplished by transforming kernel locks and I/O operations into user-level operations that do not block the kernel thread or spin up new kernel threads to manage the blocking. 16 Fairness is handled through preemption and/or ad hoc solutions, which leads to coarse-grained fairness with some pathological cases. 
17 17 18 After examining, testing and selecting specific approaches to these scheduling issues, a completely new scheduler was created and tested in the \CFA (C-for-all) user-threading runtime -system.18 After examining, testing and selecting specific approaches to these scheduling issues, a completely new scheduler was created and tested in the \CFA (C-for-all) user-threading runtime system. 19 19 The goal of the new scheduler is to offer increased safety and productivity without sacrificing performance. 20 The quality of the new scheduler is demonstrated by comparing it with other user-threading work-stealing schedulers with the aim of showing equivalent or better performance while offering better fairness.20 The quality of the new scheduler is demonstrated by comparing it with other user-threading work-stealing schedulers, with the aim of showing equivalent or better performance while offering better fairness. 21 21 22 22 Chapter~\ref{intro} defines scheduling and its general goals. 23 23 Chapter~\ref{existing} discusses how scheduler implementations attempt to achieve these goals, but all implementations optimize some workloads better than others. 24 Chapter~\ref{cfaruntime} presents the relevant aspects of the \CFA runtime system that have a significant affect on the new scheduler design and implementation.25 Chapter~\ref{core} analyses different scheduler approaches ,while looking for scheduler mechanisms that provide both performance and fairness.24 Chapter~\ref{cfaruntime} presents the relevant aspects of the \CFA runtime system that have a significant effect on the new scheduler design and implementation. 25 Chapter~\ref{core} analyzes different scheduler approaches while looking for scheduler mechanisms that provide both performance and fairness. 26 26 Chapter~\ref{userio} covers the complex mechanisms that must be used to achieve nonblocking I/O to prevent the blocking of \glspl{kthrd}. 27 27 Chapter~\ref{practice} presents the mechanisms needed to adjust the amount of parallelism, both manually and automatically. … … 29 29 30 30 31 \section{Scheduling} 31 \section{Scheduling}\label{sched} 32 32 Computer systems share multiple resources across many threads of execution, even on single-user computers like laptops or smartphones. 33 On a computer system with multiple processors and work units (routines, coroutines, threads, programs, \etc), there exists the problem of mapping many different kinds of work units onto many different kinds of processors in an efficient manner, called \newterm{scheduling}.33 On a computer system with multiple processors and work units (routines, coroutines, threads, programs, \etc), there exists the problem of mapping many different kinds of work units onto many different kinds of processors efficiently, called \newterm{scheduling}. 34 34 Scheduling systems are normally \newterm{open}, meaning new work arrives from an external source or is randomly spawned from an existing work unit. 35 35 In general, work units without threads, like routines and coroutines, are self-scheduling, while work units with threads, like tasks and programs, are scheduled. 36 36 For scheduled work-units, a scheduler takes a sequence of threads and attempts to run them to completion, subject to shared resource restrictions and utilization. 
37 A general-purpose dynamic-scheduler for an open systemcannot anticipate work requests, so its performance is rarely optimal.38 Even with complete knowledge of arriv eorder and work, creating an optimal solution is a bin packing problem~\cite{wiki:binpak}.37 In an open system, a general-purpose dynamic scheduler cannot anticipate work requests, so its performance is rarely optimal. 38 Even with complete knowledge of arrival order and work, creating an optimal solution is a bin packing problem~\cite{wiki:binpak}. 39 39 However, optimal solutions are often not required: schedulers often produce excellent solutions, without needing optimality, by taking advantage of regularities in work patterns. 40 40 41 Scheduling occurs at discre etpoints when there are transitions in a system.42 For example, a threadcycles through the following transitions during its execution.41 Scheduling occurs at discrete points when there are transitions in a system. 42 For example, a \at cycles through the following transitions during its execution. 43 43 \begin{center} 44 44 \input{executionStates.pstex_t} … … 49 49 entering the system (new $\rightarrow$ ready) 50 50 \item 51 scheduler assigns a threadto a computing resource, \eg CPU (ready $\rightarrow$ running)51 scheduler assigns a \at to a computing resource, \eg CPU (ready $\rightarrow$ running) 52 52 \item 53 53 timer alarm for preemption (running $\rightarrow$ ready) 54 54 \item 55 long 55 long-term delay versus spinning (running $\rightarrow$ blocked) 56 56 \item 57 57 completion of delay, \eg network or I/O completion (blocked $\rightarrow$ ready) … … 59 59 normal completion or error, \eg segment fault (running $\rightarrow$ halted) 60 60 \end{itemize} 61 Key to scheduling is that a threadcannot bypass the ``ready'' state during a transition so the scheduler maintains complete control of the system, \ie no self-scheduling among threads.61 Key to scheduling is that a \at cannot bypass the ``ready'' state during a transition so the scheduler maintains complete control of the system, \ie no self-scheduling among threads. 62 62 63 63 When the workload exceeds the capacity of the processors, \ie work cannot be executed immediately, it is placed on a queue for subsequent service, called a \newterm{ready queue}. … … 71 71 \end{tabular} 72 72 \end{center} 73 Beyond these two schedulers are a host of options, \eg adding a n global shared queue to MQMS or adding multiple private queues with distinccharacteristics.73 Beyond these two schedulers are a host of options, \eg adding a global shared queue to MQMS or adding multiple private queues with distinct characteristics. 74 74 75 75 Once there are multiple resources and ready queues, a scheduler is faced with three major optimization criteria: … … 84 84 85 85 \noindent 86 Essentially, all multi -processor computers have non-uniform memory access (NUMA), with one or more quantized steps to access data at different levels in the memory hierarchy.86 Essentially, all multiprocessor computers have non-uniform memory access (NUMA), with one or more quantized steps to access data at different levels in the memory hierarchy. 87 87 When a system has a large number of independently executing threads, affinity becomes difficult because of \newterm{thread churn}. 
88 That is, threads must be scheduled on different processors to obtain high processor sutilization because the number of threads $\ggg$ processors.88 That is, threads must be scheduled on different processors to obtain high processor utilization because the number of threads $\ggg$ processors. 89 89 90 90 \item 91 91 \newterm{contention}: safe access of shared objects by multiple processors requires mutual exclusion in some form, generally locking.\footnote{ 92 92 Lock-free data-structures do not involve locking but incur similar costs to achieve mutual exclusion.} 93 Mutual exclusion cost and latency increase ssignificantly with the number of processors access\-ing a shared object.93 Mutual exclusion cost and latency increase significantly with the number of processors access\-ing a shared object. 94 94 \end{enumerate} 95 95 … … 116 116 117 117 Since \CFA attempts to improve the safety and productivity of C, the new scheduler presented in this thesis attempts to achieve the same goals. 118 More specifically, safety and productivity for scheduling mean ssupporting a wide range of workloads so that programmers can rely on progress guarantees (safety) and more easily achieve acceptable performance (productivity).118 More specifically, safety and productivity for scheduling mean supporting a wide range of workloads so that programmers can rely on progress guarantees (safety) and more easily achieve acceptable performance (productivity). 119 119 The new scheduler also includes support for implicit nonblocking \io, allowing applications to have more user-threads blocking on \io operations than there are \glspl{kthrd}. 120 To complete the scheduler, an idle sleep mechanism is implemented that significantly reduces wasted CPU cycles, which are then available outside ofthe application.120 To complete the scheduler, an idle sleep mechanism is implemented that significantly reduces wasted CPU cycles, which are then available outside the application. 121 121 122 As a research project, this work builds exclusively on newer versions of the Linux operating -system and gcc/clang compilers.122 As a research project, this work builds exclusively on newer versions of the Linux operating system and gcc/clang compilers. 123 123 The new scheduler implementation uses several optimizations to successfully balance the cost of fairness against performance; 124 124 some of these optimizations rely on interesting hardware optimizations only present on modern CPUs. 125 The \io implementation is based on the @io_uring@ kernel -interface, a recent addition to the Linux kernel, because it purports to handle nonblocking \emph{file} and network \io.125 The \io implementation is based on the @io_uring@ kernel interface, a recent addition to the Linux kernel, because it purports to handle nonblocking \emph{file} and network \io. 126 126 This decision allowed an interesting performance and fairness comparison with other threading systems using @select@, @epoll@, \etc. 127 127 While the current \CFA release supports older versions of Linux ($\ge$~Ubuntu 16.04) and gcc/clang compilers ($\ge$~gcc 6.0), it is not the purpose of this project to find workarounds in these older systems to provide backwards compatibility. 
… … 129 129 130 130 \section{Contributions}\label{s:Contributions} 131 This work provides the following scheduling contributions for advanced \gls{uthrding} runtime -systems:131 This work provides the following scheduling contributions for advanced \gls{uthrding} runtime systems: 132 132 \begin{enumerate}[leftmargin=*] 133 133 \item … … 140 140 A mechanism for adding fairness on top of MQMS algorithm through helping, used both for scalable scheduling algorithm and the user-level \glsxtrshort{io}. 141 141 \item 142 An optimization of the helping -mechanism for load balancing to reduce scheduling costs.142 An optimization of the helping mechanism for load balancing to reduce scheduling costs. 143 143 \item 144 144 An optimization for the alternative relaxed-list for load balancing to reduce scheduling costs in embarrassingly parallel cases. -
doc/theses/thierry_delisle_PhD/thesis/text/io.tex
rebf8ca5 r23a08aa0 1 1 \chapter{User Level \io}\label{userio} 2 As mentioned in Section~\ref{prev:io}, user-level \io requires multiplexing the \io operations of many \ glspl{thrd}onto fewer \glspl{proc} using asynchronous \io operations.3 Different operating systems offer various forms of asynchronous operations and, as mentioned in Chapter~\ref{intro}, this work is exclusively focused on the Linux operating -system.2 As mentioned in Section~\ref{prev:io}, user-level \io requires multiplexing the \io operations of many \ats onto fewer \glspl{proc} using asynchronous \io operations. 3 Different operating systems offer various forms of asynchronous operations and, as mentioned in Chapter~\ref{intro}, this work is exclusively focused on the Linux operating system. 4 4 5 5 \section{Kernel Interface} … … 13 13 In this context, ready means \emph{some} operation can be performed without blocking. 14 14 It does not mean an operation returning \lstinline{EAGAIN} succeeds on the next try. 15 For example, a ready read may only return a subset of requested bytes and the read must be issue sagain for the remaining bytes, at which point it may return \lstinline{EAGAIN}.}16 This mechanism is also crucial in determining when all \ glspl{thrd}are blocked and the application \glspl{kthrd} can now block.15 For example, a ready read may only return a subset of requested bytes and the read must be issued again for the remaining bytes, at which point it may return \lstinline{EAGAIN}.} 16 This mechanism is also crucial in determining when all \ats are blocked and the application \glspl{kthrd} can now block. 17 17 18 18 There are three options to monitor file descriptors in Linux:\footnote{ 19 19 For simplicity, this section omits \lstinline{pselect} and \lstinline{ppoll}. 20 The difference between these system calls and \lstinline{select} and \lstinline{poll}, respectively, is not relevant for this discussion.} ,20 The difference between these system calls and \lstinline{select} and \lstinline{poll}, respectively, is not relevant for this discussion.} 21 21 @select@~\cite{MAN:select}, @poll@~\cite{MAN:poll} and @epoll@~\cite{MAN:epoll}. 22 22 All three of these options offer a system call that blocks a \gls{kthrd} until at least one of many file descriptors becomes ready. … … 30 30 Hence, if one \gls{kthrd} is managing the select calls, other threads can only add/remove to/from the manager's interest set through synchronized calls to update the interest set. 31 31 However, these changes are only reflected when the manager makes its next call to @select@. 32 Note, it is possible for the manager thread to never unblock if its current interest set never changes, \eg the sockets/pipes/ ttys it is waiting on never get data again.32 Note, it is possible for the manager thread to never unblock if its current interest set never changes, \eg the sockets/pipes/TTYs it is waiting on never get data again. 33 33 Often the I/O manager has a timeout, polls, or is sent a signal on changes to mitigate this problem. 
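To make the manager-thread pattern concrete, the following is a minimal sketch of a @select@-based \io manager; the helper routines @handle_ready@ and @apply_pending_updates@ are illustrative assumptions rather than part of any real runtime, and the timeout bounds how long changes to the interest set can go unnoticed.
\begin{cfa}
#include <sys/select.h>

extern fd_set interest;	// interest set, updated by other threads under a lock
extern int max_fd;
extern void handle_ready( fd_set * ready );	// reschedule threads whose FD is ready
extern void apply_pending_updates( fd_set * set, int * max );	// fold in queued add/remove requests

void manager_loop( void ) {
	for ( ;; ) {
		fd_set ready = interest;	// select is destructive, so work on a copy
		struct timeval timeout = { .tv_sec = 0, .tv_usec = 100000 };	// bound on staleness
		int n = select( max_fd + 1, &ready, 0, 0, &timeout );
		if ( n > 0 ) handle_ready( &ready );
		apply_pending_updates( &interest, &max_fd );	// updates are only observed here
	}
}
\end{cfa}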
34 35 \begin{comment}36 From: Tim Brecht <brecht@uwaterloo.ca>37 Subject: Re: FD sets38 Date: Wed, 6 Jul 2022 00:29:41 +000039 40 Large number of open files41 --------------------------42 43 In order to be able to use more than the default number of open file44 descriptors you may need to:45 46 o increase the limit on the total number of open files /proc/sys/fs/file-max47 (on Linux systems)48 49 o increase the size of FD_SETSIZE50 - the way I often do this is to figure out which include file __FD_SETSIZE51 is defined in, copy that file into an appropriate directory in ./include,52 and then modify it so that if you use -DBIGGER_FD_SETSIZE the larger size53 gets used54 55 For example on a RH 9.0 distribution I've copied56 /usr/include/bits/typesizes.h into ./include/i386-linux/bits/typesizes.h57 58 Then I modify typesizes.h to look something like:59 60 #ifdef BIGGER_FD_SETSIZE61 #define __FD_SETSIZE 3276762 #else63 #define __FD_SETSIZE 102464 #endif65 66 Note that the since I'm moving and testing the userver on may different67 machines the Makefiles are set up to use -I ./include/$(HOSTTYPE)68 69 This way if you redefine the FD_SETSIZE it will get used instead of the70 default original file.71 \end{comment}72 34 73 35 \paragraph{\lstinline{poll}} is the next oldest option, and takes as input an array of structures containing the FD numbers rather than their position in an array of bits, allowing a more compact input for interest sets that contain widely spaced FDs. 74 36 For small interest sets with densely packed FDs, the @select@ bit mask can take less storage, and hence, copy less information into the kernel. 75 Furthermore, @poll@ is non-destructive, so the array of structures does not have to be re-initialize on every call.76 Like @select@, @poll@ suffers from the limitation that the interest set cannot be changed by other \gls {kthrd}, while a manager thread is blocked in @poll@.37 Furthermore, @poll@ is non-destructive, so the array of structures does not have to be re-initialized on every call. 38 Like @select@, @poll@ suffers from the limitation that the interest set cannot be changed by other \glspl{kthrd}, while a manager thread is blocked in @poll@. 77 39 78 40 \paragraph{\lstinline{epoll}} follows after @poll@, and places the interest set in the kernel rather than the application, where it is managed by an internal \gls{kthrd}. … … 84 46 However, all three of these I/O systems have limitations. 85 47 The @man@ page for @O_NONBLOCK@ mentions that ``[@O_NONBLOCK@] has no effect for regular files and block devices'', which means none of these three system calls are viable multiplexing strategies for these types of \io operations. 86 Furthermore, @epoll@ has been shown to have problems with pipes and ttys~\cit{Peter's examples in some fashion}. 48 Furthermore, TTYs can also be tricky to use since they can take different forms based on how the command is executed. 49 For example, @epoll@ rejects FDs pointing to regular files or block devices, which includes @stdin@ when using shell redirections~\cite[\S~3.6]{MAN:bash}, but does not reject shell pipelines~\cite[\S~3.2.3]{MAN:bash}, which includes pipelines into @stdin@. 87 50 Finally, none of these are useful solutions for multiplexing \io operations that do not have a corresponding file descriptor and can be awkward for operations using multiple file descriptors. 88 51 … … 90 53 An alternative to @O_NONBLOCK@ is the AIO interface. 91 54 Its interface lets programmers enqueue operations to be performed asynchronously by the kernel. 
92 Completions of these operations can be communicated in various ways: either by spawning a new \gls{kthrd}, sending a Linux signal, or by polling for completion of one or more operation.93 For this work, spawning a new \gls{kthrd} is counter -productive but a related solution is discussed in Section~\ref{io:morethreads}.94 Using interrupt s handlers can also lead to fairly complicated interactions between subsystems and hasnon-trivial cost.55 Completions of these operations can be communicated in various ways: either by spawning a new \gls{kthrd}, sending a Linux signal, or polling for completion of one or more operations. 56 For this work, spawning a new \gls{kthrd} is counterproductive but a related solution is discussed in Section~\ref{io:morethreads}. 57 Using interrupt handlers can also lead to fairly complicated interactions between subsystems and has a non-trivial cost. 95 58 Leaving polling for completion, which is similar to the previous system calls. 96 59 AIO only supports read and write operations to file descriptors, it does not have the same limitation as @O_NONBLOCK@, \ie, the file descriptors can be regular files and blocked devices. 97 60 It also supports batching multiple operations in a single system call. 98 61 99 AIO offers two different approaches to polling: @aio_error@ can be used as a spinning form of polling, returning @EINPROGRESS@ until the operation is completed, and @aio_suspend@ can be used similarly to @select@, @poll@ or @epoll@, to wait until one or more requests have completed.100 For the purpose of\io multiplexing, @aio_suspend@ is the best interface.62 AIO offers two different approaches to polling: @aio_error@ can be used as a spinning form of polling, returning @EINPROGRESS@ until the operation is completed, and @aio_suspend@ can be used similarly to @select@, @poll@ or @epoll@, to wait until one or more requests have been completed. 63 For \io multiplexing, @aio_suspend@ is the best interface. 101 64 However, even if AIO requests can be submitted concurrently, @aio_suspend@ suffers from the same limitation as @select@ and @poll@, \ie, the interest set cannot be dynamically changed while a call to @aio_suspend@ is in progress. 102 AIO also suffers from the limitation of specifying which requests have completed, \ie programmers have to poll each request in the interest set using @aio_error@ to identify the completed requests.65 AIO also suffers from the limitation of specifying which requests have been completed, \ie programmers have to poll each request in the interest set using @aio_error@ to identify the completed requests. 103 66 This limitation means that, like @select@ and @poll@ but not @epoll@, the time needed to examine polling results increases based on the total number of requests monitored, not the number of completed requests. 104 67 Finally, AIO does not seem to be a popular interface, which I believe is due in part to this poor polling interface. … … 124 87 in 125 88 ``some kind of arbitrary \textit{queue up asynchronous system call} model''. 126 This description is actuallyquite close to the interface described in the next section.89 This description is quite close to the interface described in the next section. 127 90 128 91 \subsection{\lstinline{io_uring}} … … 135 98 In addition to supporting reads and writes to any file descriptor like AIO, it supports other operations like @open@, @close@, @fsync@, @accept@, @connect@, @send@, @recv@, @splice@, \etc. 
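For contrast, a minimal sketch of the AIO polling pattern criticized above, reduced to a single outstanding request, might look as follows.
\begin{cfa}
#include <aio.h>
#include <errno.h>
#include <string.h>

ssize_t aio_read_wait( int fd, void * buf, size_t count, off_t offset ) {
	struct aiocb cb;
	memset( &cb, 0, sizeof(cb) );	// unused fields must be zero
	cb.aio_fildes = fd;
	cb.aio_buf    = buf;
	cb.aio_nbytes = count;
	cb.aio_offset = offset;
	if ( aio_read( &cb ) != 0 ) return -1;	// enqueue the asynchronous read
	const struct aiocb * list[1] = { &cb };
	while ( aio_error( &cb ) == EINPROGRESS ) {	// poll this specific request
		aio_suspend( list, 1, 0 );	// block until some enqueued request completes
	}
	return aio_return( &cb );	// collect the final result exactly once
}
\end{cfa}
With many outstanding requests, the loop over @aio_error@ grows with the size of the interest set rather than with the number of completions, which is exactly the cost a completion queue avoids.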
136 99 137 On top of these, @io_uring@ adds many extras like avoiding copies between the kernel and user -space using shared memory, allowing different mechanisms to communicate with device drivers, and supporting chains of requests, \ie, requests that automatically trigger followup requests on completion.100 On top of these, @io_uring@ adds many extras like avoiding copies between the kernel and user space using shared memory, allowing different mechanisms to communicate with device drivers, and supporting chains of requests, \ie, requests that automatically trigger follow-up requests on completion. 138 101 139 102 \subsection{Extra Kernel Threads}\label{io:morethreads} 140 Finally, if the operating system does not offer a satisfactory form of asynchronous \io operations, an ad -hoc solution is to create a pool of \glspl{kthrd} and delegate operations to it to avoid blocking \glspl{proc}, which is a compromise for multiplexing.141 In the worst case, where all \ glspl{thrd}are consistently blocking on \io, it devolves into 1-to-1 threading.142 However, regardless of the frequency of \io operations, it achieves the fundamental goal of not blocking \glspl{proc} when \ glspl{thrd}are ready to run.103 Finally, if the operating system does not offer a satisfactory form of asynchronous \io operations, an ad hoc solution is to create a pool of \glspl{kthrd} and delegate operations to it to avoid blocking \glspl{proc}, which is a compromise for multiplexing. 104 In the worst case, where all \ats are consistently blocking on \io, it devolves into 1-to-1 threading. 105 However, regardless of the frequency of \io operations, it achieves the fundamental goal of not blocking \glspl{proc} when \ats are ready to run. 143 106 This approach is used by languages like Go~\cite{GITHUB:go}, frameworks like libuv~\cite{libuv}, and web servers like Apache~\cite{apache} and NGINX~\cite{nginx}, since it has the advantage that it can easily be used across multiple operating systems. 144 107 This advantage is especially relevant for languages like Go, which offer a homogeneous \glsxtrshort{api} across all platforms. 145 As opposed to C, which has a very limited standard apifor \io, \eg, the C standard library has no networking.108 As opposed to C, which has a very limited standard \glsxtrshort{api} for \io, \eg, the C standard library has no networking. 146 109 147 110 \subsection{Discussion} … … 155 118 \section{Event-Engine} 156 119 An event engine's responsibility is to use the kernel interface to multiplex many \io operations onto few \glspl{kthrd}. 157 In concrete terms, this means \ glspl{thrd} enter the engine through an interface, the event engine then starts an operation and parks the calling \glspl{thrd}, returning control to the \gls{proc}.158 The parked \ glspl{thrd} are then rescheduled by the event engine once the desired operation hascompleted.120 In concrete terms, this means \ats enter the engine through an interface, the event engine then starts an operation and parks the calling \ats, returning control to the \gls{proc}. 121 The parked \ats are then rescheduled by the event engine once the desired operation has been completed. 159 122 160 123 \subsection{\lstinline{io_uring} in depth}\label{iouring} … … 171 134 \centering 172 135 \input{io_uring.pstex_t} 173 \caption[Overview of \lstinline{io_uring}]{Overview of \lstinline{io_uring} \smallskip\newline Two ring buffer are used to communicate with the kernel, one for completions~(right) and one for submissions~(left). 
The submission ring indexes into a pre-allocated array (denoted \emph{S}) instead.}136 \caption[Overview of \lstinline{io_uring}]{Overview of \lstinline{io_uring} \smallskip\newline Two ring buffers are used to communicate with the kernel, one for completions~(right) and one for submissions~(left). The submission ring indexes into a pre-allocated array (denoted \emph{S}) instead.} 174 137 \label{fig:iouring} 175 138 \end{figure} … … 184 147 \item 185 148 The SQE is filled according to the desired operation. 186 This step is straight 187 The only detail worth mentioning is that SQEs have a @user_data@ field that must be filled in orderto match submission and completion entries.149 This step is straightforward. 150 The only detail worth mentioning is that SQEs have a @user_data@ field that must be filled to match submission and completion entries. 188 151 \item 189 152 The SQE is submitted to the submission ring by appending the index of the SQE to the ring following regular ring buffer steps: \lstinline{buffer[head] = item; head++}. … … 207 170 208 171 The @io_uring_enter@ system call is protected by a lock inside the kernel. 209 This protection means that concurrent call to @io_uring_enter@ using the same instance are possible, but there is no performance gained from parallel calls to @io_uring_enter@.172 This protection means that concurrent calls to @io_uring_enter@ using the same instance are possible, but there is no performance gained from parallel calls to @io_uring_enter@. 210 173 It is possible to do the first three submission steps in parallel; 211 174 however, doing so requires careful synchronization. … … 216 179 This restriction means \io request bursts may have to be subdivided and submitted in chunks at a later time. 217 180 218 An important detail to keep in mind is that just like ``The cloud is just someone else's computer''\cite{xkcd:cloud}, asynchronous operations are just operation using someone else's threads.219 Indeed, asynchronous operation can require computation time to complete, which means that if this time is not taken from the thread that triggered the asynchronous operation, it must be taken from some other threads.181 An important detail to keep in mind is that just like ``The cloud is just someone else's computer''\cite{xkcd:cloud}, asynchronous operations are just operations using someone else's threads. 182 Indeed, asynchronous operations can require computation time to complete, which means that if this time is not taken from the thread that triggered the asynchronous operation, it must be taken from some other threads. 220 183 In this case, the @io_uring@ operations that cannot be handled directly in the system call must be delegated to some other \gls{kthrd}. 221 184 To this end, @io_uring@ maintains multiple \glspl{kthrd} inside the kernel that are not exposed to the user. 222 Th ere are three kindof operations that can need the \glspl{kthrd}:185 Three kinds of operations that can need the \glspl{kthrd}: 223 186 224 187 \paragraph{Operations using} @IOSQE_ASYNC@. … … 228 191 This is also a fairly simple case. As mentioned earlier in this chapter, [@O_NONBLOCK@] has no effect for regular files and block devices. 229 192 @io_uring@ must also take this reality into account by delegating operations on regular files and block devices. 
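The submission and completion steps described above can be seen end-to-end in the following sketch, which uses the liburing helper library purely for brevity; the \CFA runtime manipulates the raw rings directly and parks the calling \at instead of blocking in @io_uring_wait_cqe@.
\begin{cfa}
#include <liburing.h>

ssize_t uring_read( struct io_uring * ring, int fd, void * buf, unsigned len ) {
	struct io_uring_sqe * sqe = io_uring_get_sqe( ring );	// 1. allocate an SQE
	if ( sqe == 0 ) return -1;	// ring full, caller must retry later
	io_uring_prep_read( sqe, fd, buf, len, 0 );	// 2. fill in the operation
	io_uring_sqe_set_data( sqe, buf );	// user_data links this SQE to its CQE
	io_uring_submit( ring );	// 3./4. append to the submission ring and enter the kernel
	struct io_uring_cqe * cqe;
	io_uring_wait_cqe( ring, &cqe );	// wait for a completion
	ssize_t res = cqe->res;	// signed 32-bit result
	io_uring_cqe_seen( ring, cqe );	// release the CQE back to the kernel
	return res;
}
\end{cfa}
The delegation of blocking operations to kernel workers, described next, happens entirely behind this interface.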
230 In fact @io_uring@ maintains a pool of \glspl{kthrd} dedicated to these operations, which are referred to as \newterm{bounded workers}.193 In fact, @io_uring@ maintains a pool of \glspl{kthrd} dedicated to these operations, which are referred to as \newterm{bounded workers}. 231 194 232 195 \paragraph{Unbounded operations that must be retried.} … … 235 198 @io_uring@ maintains a separate pool for these operations. 236 199 The \glspl{kthrd} in this pool are referred to as \newterm{unbounded workers}. 237 Unbounded workers are also responsible of handling operations using @IOSQE_ASYNC@.200 Unbounded workers are also responsible for handling operations using @IOSQE_ASYNC@. 238 201 239 202 @io_uring@ implicitly spawns and joins both the bounded and unbounded workers based on its evaluation of the needs of the workload. 240 203 This effectively encapsulates the work that is needed when using @epoll@. 241 Indeed, @io_uring@ does not change Linux's underlying handling of \io opeartions, it simply offers an asynchronous \glsxtrshort{api} on top of the existing system.204 Indeed, @io_uring@ does not change Linux's underlying handling of \io operations, it simply offers an asynchronous \glsxtrshort{api} on top of the existing system. 242 205 243 206 244 207 \subsection{Multiplexing \io: Submission} 245 208 246 The submission side is the most complicated aspect of @io_uring@ and the completion side effectively follows from the design decisions made in the submission side.209 The submission side is the most complicated aspect of @io_uring@ and the completion side effectively follows from the design decisions made on the submission side. 247 210 While there is freedom in designing the submission side, there are some realities of @io_uring@ that must be taken into account. 248 211 It is possible to do the first steps of submission in parallel; … … 255 218 As described in Chapter~\ref{practice}, this does not translate into constant CPU usage.}. 256 219 Note that once an operation completes, there is nothing that ties it to the @io_uring@ instance that handled it. 257 There is nothing preventing a new operation with, \eg the same file descriptors to a different @io_uring@ instance.220 Nothing prevents a new operation, with for example the same file descriptor, from using a different @io_uring@ instance. 258 221 259 222 A complicating aspect of submission is @io_uring@'s support for chains of operations, where the completion of an operation triggers the submission of the next operation on the link. … … 263 226 Support for this feature can be fulfilled simply by supporting arbitrary user code between allocation and submission. 264 227 265 Similar to scheduling, sharding @io_uring@ instances can be done privately, \ie, one instance per \glspl{proc}, in decoupled pools, \ie, a pool of \glspl{proc} use a pool of @io_uring@ instances without one-to-one coupling between any given instance and any given \gls{proc}, or some mix of the two.228 Similar to scheduling, sharding @io_uring@ instances can be done privately, \ie, one instance per \proc, in decoupled pools, \ie, a pool of \procs using a pool of @io_uring@ instances without one-to-one coupling between any given instance and any given \gls{proc}, or some mix of the two. 266 229 These three sharding approaches are analyzed. 267 230 268 231 \subsubsection{Private Instances} 269 232 The private approach creates one ring instance per \gls{proc}, \ie one-to-one coupling. 
270 This alleviates the need for synchronization on the submissions, requiring only that \ glspl{thrd}are not time-sliced during submission steps.271 This requirement is the same as accessing @thread_local@ variables, where a \ gls{thrd}is accessing kernel-thread data, is time-sliced, and continues execution on another kernel thread but is now accessing the wrong data.272 This failure is the serially reusable problem~\cite{SeriallyReusable}.273 Hence, allocated SQEs must be submitted to the same ring on the same \gls{proc}, which effectively forces the application to submit SQEs in allocation order.\footnote{274 To remove this requirement, a \ gls{thrd} needs the ability to ``yield to a specific \gls{proc}'', \ie,park with the guarantee it unparks on a specific \gls{proc}, \ie the \gls{proc} attached to the correct ring.}233 This alleviates the need for synchronization on the submissions, requiring only that \ats are not time-sliced during submission steps. 234 This requirement is the same as accessing @thread_local@ variables, where a \at is accessing kernel-thread data, is time-sliced, and continues execution on another kernel thread but is now accessing the wrong data. 235 This failure is the \newterm{serially reusable problem}~\cite{SeriallyReusable}. 236 Hence, allocated SQEs must be submitted to the same ring on the same \gls{proc}, which effectively forces the application to submit SQEs in order of allocation.\footnote{ 237 To remove this requirement, a \at needs the ability to ``yield to a specific \gls{proc}'', \ie, \park with the guarantee it unparks on a specific \gls{proc}, \ie the \gls{proc} attached to the correct ring.} 275 238 From the subsystem's point of view, the allocation and submission are sequential, greatly simplifying both. 276 239 In this design, allocation and submission form a partitioned ring buffer as shown in Figure~\ref{fig:pring}. 277 Once added to the ring buffer, the attached \gls{proc} has a significant amount of flexibility with regard sto when to perform the system call.278 Possible options are: when the \gls{proc} runs out of \ glspl{thrd} to run, after running a given number of \glspl{thrd}, \etc.240 Once added to the ring buffer, the attached \gls{proc} has a significant amount of flexibility with regard to when to perform the system call. 241 Possible options are: when the \gls{proc} runs out of \ats to run, after running a given number of \ats, \etc. 279 242 280 243 \begin{figure} 281 244 \centering 282 245 \input{pivot_ring.pstex_t} 283 \caption[Partitioned ring buffer]{Partitioned ring buffer \smallskip\newline Allocated sqes are append ingto the first partition.246 \caption[Partitioned ring buffer]{Partitioned ring buffer \smallskip\newline Allocated sqes are appended to the first partition. 284 247 When submitting, the partition is advanced. 285 248 The kernel considers the partition as the head of the ring.} … … 288 251 289 252 This approach has the advantage that it does not require much of the synchronization needed in a shared approach. 
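As an illustration of this flexibility in choosing when to perform the system call, a per-\gls{proc} flush policy can be as simple as the following hedged sketch, where @proc_ring@ and @flush_ring@ are illustrative names rather than the actual \CFA runtime interface.
\begin{cfa}
#include <stdbool.h>

struct proc_ring {
	unsigned to_submit;	// SQEs appended since the last system call
	unsigned batch;	// flush after this many submissions
};
extern void flush_ring( struct proc_ring * );	// performs the io_uring_enter system call

static inline void maybe_flush( struct proc_ring * r, bool proc_idle ) {
	// flush when the processor has nothing left to run, or when the batch is full
	if ( proc_idle || r->to_submit >= r->batch ) {
		flush_ring( r );
		r->to_submit = 0;
	}
}
\end{cfa}
Because the instance is private, neither the counter nor the flush requires any synchronization.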
290 However, this benefit means \ glspl{thrd} submitting \io operations have less flexibility: they cannotpark or yield, and several exceptional cases are handled poorly.291 Instances running out of SQEs cannot run \ glspl{thrd}wanting to do \io operations.292 In this case, the \io \ gls{thrd}needs to be moved to a different \gls{proc}, and the only current way of achieving this is to @yield()@ hoping to be scheduled on a different \gls{proc} with free SQEs, which is not guaranteed.253 However, this benefit means \ats submitting \io operations have less flexibility: they cannot \park or yield, and several exceptional cases are handled poorly. 254 Instances running out of SQEs cannot run \ats wanting to do \io operations. 255 In this case, the \io \at needs to be moved to a different \gls{proc}, and the only current way of achieving this is to @yield()@ hoping to be scheduled on a different \gls{proc} with free SQEs, which is not guaranteed. 293 256 294 257 A more involved version of this approach tries to solve these problems using a pattern called \newterm{helping}. 295 \ Glspl{thrd} that cannot submit \io operations, either because of an allocation failure or migrationto a different \gls{proc} between allocation and submission, create an \io object and add it to a list of pending submissions per \gls{proc} and a list of pending allocations, probably per cluster.296 While there is still the strong coupling between \glspl{proc} and @io_uring@ instances, these data structures allow moving \glspl{thrd}to a specific \gls{proc}, when the current \gls{proc} cannot fulfill the \io request.297 298 Imagine a simple scenario with two \ glspl{thrd} on two \glspl{proc}, where one \gls{thrd} submits an \io operation and then sets a flag, while the other \gls{thrd}spins until the flag is set.299 Assume both \ glspl{thrd} are running on the same \gls{proc}, and the \io \gls{thrd} is preempted between allocation and submission, moved to the second \gls{proc}, and the original \gls{proc} starts running the spinning \gls{thrd}.300 In this case, the helping solution has the \io \ gls{thrd}append an \io object to the submission list of the first \gls{proc}, where the allocation was made.301 No other \gls{proc} can help the \ gls{thrd}since @io_uring@ instances are strongly coupled to \glspl{proc}.302 However, the \io \gls{proc} is unable to help because it is executing the spinning \ gls{thrd}resulting in a deadlock.303 While this example is artificial, in the presence of many \ glspl{thrd}, it is possible for this problem toarise ``in the wild''.258 \ats that cannot submit \io operations, either because of an allocation failure or \glslink{atmig}{migration} to a different \gls{proc} between allocation and submission, create an \io object and add it to a list of pending submissions per \gls{proc} and a list of pending allocations, probably per cluster. 259 While there is still a strong coupling between \glspl{proc} and @io_uring@ instances, these data structures allow moving \ats to a specific \gls{proc}, when the current \gls{proc} cannot fulfill the \io request. 260 261 Imagine a simple scenario with two \ats on two \glspl{proc}, where one \at submits an \io operation and then sets a flag, while the other \at spins until the flag is set. 262 Assume both \ats are running on the same \gls{proc}, and the \io \at is preempted between allocation and submission, moved to the second \gls{proc}, and the original \gls{proc} starts running the spinning \at. 
263 In this case, the helping solution has the \io \at append an \io object to the submission list of the first \gls{proc}, where the allocation was made. 264 No other \gls{proc} can help the \at since @io_uring@ instances are strongly coupled to \glspl{proc}. 265 However, the \io \gls{proc} is unable to help because it is executing the spinning \at resulting in a deadlock. 266 While this example is artificial, in the presence of many \ats, this problem can arise ``in the wild''. 304 267 Furthermore, this pattern is difficult to reliably detect and avoid. 305 Once in this situation, the only escape is to interrupt ed the spinning \gls{thrd}, either directly or via some regular preemption, \eg time slicing.306 Having to interrupt \ glspl{thrd}for this purpose is costly, the latency can be large between interrupts, and the situation may be hard to detect.268 Once in this situation, the only escape is to interrupt the spinning \at, either directly or via some regular preemption, \eg time slicing. 269 Having to interrupt \ats for this purpose is costly, the latency can be large between interrupts, and the situation may be hard to detect. 307 270 Interrupts are needed here entirely because the \gls{proc} is tied to an instance it is not using. 308 Therefore, a more satisfying solution is for the \ gls{thrd}submitting the operation to notice that the instance is unused and simply go ahead and use it.271 Therefore, a more satisfying solution is for the \at submitting the operation to notice that the instance is unused and simply go ahead and use it. 309 272 This approach is presented shortly. 310 273 311 274 \subsubsection{Public Instances} 312 275 The public approach creates decoupled pools of @io_uring@ instances and processors, \ie without one-to-one coupling. 313 \ Glspl{thrd}attempting an \io operation pick one of the available instances and submit the operation to that instance.314 Since there is no coupling between @io_uring@ instances and \glspl{proc} in this approach, \ glspl{thrd}running on more than one \gls{proc} can attempt to submit to the same instance concurrently.276 \ats attempting an \io operation pick one of the available instances and submit the operation to that instance. 277 Since there is no coupling between @io_uring@ instances and \glspl{proc} in this approach, \ats running on more than one \gls{proc} can attempt to submit to the same instance concurrently. 315 278 Because @io_uring@ effectively sets the amount of sharding needed to avoid contention on its internal locks, performance in this approach is based on two aspects: 316 279 \begin{itemize} … … 319 282 \item 320 283 The scheme to route \io requests to specific @io_uring@ instances does not introduce contention. 321 This aspect has anoversized importance because it comes into play before the sharding of instances, and as such, all \glspl{hthrd} can contend on the routing algorithm.284 This aspect has oversized importance because it comes into play before the sharding of instances, and as such, all \glspl{hthrd} can contend on the routing algorithm. 322 285 \end{itemize} 323 286 324 287 Allocation in this scheme is fairly easy. 
325 Free SQEs, \ie, SQEs that are not currently being used to represent a request, can be written to safely and have a field called @user_data@ that the kernel only reads to copy to @cqe@s.326 Allocation also requires no ordering guaranteeas all free SQEs are interchangeable.288 Free SQEs, \ie, SQEs that are not currently being used to represent a request, can be written-to safely and have a field called @user_data@ that the kernel only reads to copy to CQEs. 289 Allocation also does not require ordering guarantees as all free SQEs are interchangeable. 327 290 The only added complexity is that the number of SQEs is fixed, which means allocation can fail. 328 291 329 Allocation failures need to be pushed to a routing algorithm: \ glspl{thrd}attempting \io operations must not be directed to @io_uring@ instances without sufficient SQEs available.330 Furthermore, the routing algorithm should block operations up -front,if none of the instances have available SQEs.331 332 Once an SQE is allocated, \ glspl{thrd} insert the \io request information,and keep track of the SQE index and the instance it belongs to.333 334 Once an SQE is filled in, it is added to the submission ring buffer, an operation that is not thread -safe, and then the kernel must be notified using the @io_uring_enter@ system call.335 The submission ring buffer is the same size as the pre-allocated SQE buffer, therefore pushing to the ring buffer cannot fail because it would mean a \lstinline{sqe}multiple times in the ring buffer, which is undefined behaviour.336 However, as mentioned, the system call itself can fail with the expectation that it can be retried once some submitted operations complete.292 Allocation failures need to be pushed to a routing algorithm: \ats attempting \io operations must not be directed to @io_uring@ instances without sufficient SQEs available. 293 Furthermore, the routing algorithm should block operations upfront if none of the instances have available SQEs. 294 295 Once an SQE is allocated, \ats insert the \io request information and keep track of the SQE index and the instance it belongs to. 296 297 Once an SQE is filled in, it is added to the submission ring buffer, an operation that is not thread safe, and then the kernel must be notified using the @io_uring_enter@ system call. 298 The submission ring buffer is the same size as the pre-allocated SQE buffer, therefore pushing to the ring buffer cannot fail because it would mean an SQE multiple times in the ring buffer, which is undefined behaviour. 299 However, as mentioned, the system call itself can fail with the expectation that it can be retried once some submitted operations are complete. 337 300 338 301 Since multiple SQEs can be submitted to the kernel at once, it is important to strike a balance between batching and latency. 
339 Operations that are ready to be submitted should be batched together in few system calls, but at the same time, operations should not be left pending for long period of times before being submitted.340 Balancing submission can be handled by either designating one of the submitting \ glspl{thrd} as the being responsible for the system call for the current batch of SQEs or by having some other party regularly submitting all ready SQEs, \eg, the poller \gls{thrd}mentioned later in this section.341 342 Ideally, when multiple \ glspl{thrd} attempt to submit operations to the same @io_uring@ instance, all requests should be batched together and one of the \glspl{thrd}is designated to do the system call on behalf of the others, called the \newterm{submitter}.302 Operations that are ready to be submitted should be batched together in few system calls, but at the same time, operations should not be left pending for long periods before being submitted. 303 Balancing submission can be handled by either designating one of the submitting \ats as the \at responsible for the system call for the current batch of SQEs or by having some other party regularly submit all ready SQEs, \eg, the poller \at mentioned later in this section. 304 305 Ideally, when multiple \ats attempt to submit operations to the same @io_uring@ instance, all requests should be batched together and one of the \ats is designated to do the system call on behalf of the others, called the \newterm{submitter}. 343 306 However, in practice, \io requests must be handed promptly so there is a need to guarantee everything missed by the current submitter is seen by the next one. 344 Indeed, as long as there is a ``next'' submitter, \ glspl{thrd}submitting new \io requests can move on, knowing that some future system call includes their request.345 Once the system call is done, the submitter must also free SQEs so that the allocator can reuse dthem.346 347 Finally, the completion side is much simpler since the @io_uring@ system -call enforces a natural synchronization point.348 Polling simply needs to regularly do the system call, go through the produced CQEs and communicate the result back to the originating \ glspl{thrd}.349 Since CQEs only own a signed 32 307 Indeed, as long as there is a ``next'' submitter, \ats submitting new \io requests can move on, knowing that some future system call includes their request. 308 Once the system call is done, the submitter must also free SQEs so that the allocator can reuse them. 309 310 Finally, the completion side is much simpler since the @io_uring@ system call enforces a natural synchronization point. 311 Polling simply needs to regularly do the system call, go through the produced CQEs and communicate the result back to the originating \ats. 312 Since CQEs only own a signed 32-bit result, in addition to the copy of the @user_data@ field, all that is needed to communicate the result is a simple future~\cite{wiki:future}. 350 313 If the submission side does not designate submitters, polling can also submit all SQEs as it is polling events. 351 A simple approach to polling is to allocate a \ gls{thrd} per @io_uring@ instance and simply let the poller \glspl{thrd}poll their respective instances when scheduled.314 A simple approach to polling is to allocate a \at per @io_uring@ instance and simply let the poller \ats poll their respective instances when scheduled. 352 315 353 316 With the pool of SEQ instances approach, the big advantage is that it is fairly flexible. 
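A hedged sketch of the per-instance poller just described, using liburing for brevity and a hypothetical @fulfil@ routine for the future stored in the @user_data@ field:
\begin{cfa}
#include <liburing.h>

struct future_t;	// holds the signed 32-bit result (illustrative)
extern void fulfil( struct future_t *, int result );	// unblocks the originating thread

void poller_main( struct io_uring * ring ) {
	for ( ;; ) {
		io_uring_submit( ring );	// also submits any SQEs missed by the submitters
		struct io_uring_cqe * cqe;
		if ( io_uring_wait_cqe( ring, &cqe ) != 0 ) continue;	// wait for at least one completion
		do {
			fulfil( io_uring_cqe_get_data( cqe ), cqe->res );	// hand the result back
			io_uring_cqe_seen( ring, cqe );	// consume the CQE
		} while ( io_uring_peek_cqe( ring, &cqe ) == 0 );	// drain the rest without blocking
	}
}
\end{cfa}
In the \CFA runtime, the blocking wait is of course replaced by scheduling the poller like any other \at, so the shared pool itself remains the flexible part of the design.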
354 It does not impose restrictions on what \glspl{thrd} submitting \io operations can and cannot do between allocations and submissions.317 It does not impose restrictions on what \ats submitting \io operations can and cannot do between allocations and submissions. 355 318 It also can gracefully handle running out of resources, SQEs or the kernel returning @EBUSY@. 356 The down side to this approach is that many of the steps used for submitting need complex synchronization to work properly. 357 The routing and allocation algorithm needs to keep track of which ring instances have available SQEs, block incoming requests if no instance is available, prevent barging if \glspl{thrd} are already queued up waiting for SQEs and handle SQEs being freed. 319 The downside to this approach is that many of the steps used for submitting need complex synchronization to work properly. 320 The routing and allocation algorithm needs to keep track of which ring instances have available SQEs, block incoming requests if no instance is available, prevent barging if \ats are already queued up waiting for SQEs and handle SQEs being freed. 358 321 The submission side needs to safely append SQEs to the ring buffer, correctly handle chains, make sure no SQE is dropped or left pending forever, notify the allocation side when SQEs can be reused, and handle the kernel returning @EBUSY@. 359 All this synchronization has a significant cost, and compared to the private-instance approach, this synchronization is entirely overhead.322 Compared to the private-instance approach, all this synchronization has a significant cost and is entirely overhead. 360 323 361 324 \subsubsection{Instance borrowing} 362 325 Both of the prior approaches have undesirable aspects that stem from tight or loose coupling between @io_uring@ and \glspl{proc}. 363 326 The first approach suffers from tight coupling causing problems when a \gls{proc} does not benefit from the coupling. 364 The second approach suffers from loose coupling causing operations to have synchronization overhead, which tighter coupling avoids.327 The second approach suffers from loose coupling causing operations to have synchronization overhead, which tighter coupling avoids. 365 328 When \glspl{proc} are continuously issuing \io operations, tight coupling is valuable since it avoids synchronization costs. 366 329 However, in unlikely failure cases or when \glspl{proc} are not using their instances, tight coupling is no longer advantageous. … … 370 333 371 334 In this approach, each cluster, see Figure~\ref{fig:system}, owns a pool of @io_uring@ instances managed by an \newterm{arbiter}. 372 When a \gls{thrd} attempts to issue an \io operation, it ask for an instance from the arbiter and issues requests to that instance.373 This instance is now bound to the \gls{proc} the \gls{thrd} is running on.374 This binding is kept until the arbiter decides to revoke it, taking back the instance and reverting the \gls{proc} to its initial state with respect to \io. 335 When a \at attempts to issue an \io operation, it asks for an instance from the arbiter and issues requests to that instance. 336 This instance is now bound to the \gls{proc} the \at is running on. 337 This binding is kept until the arbiter decides to revoke it, taking back the instance and reverting the \gls{proc} to its initial \io state. 375 338 This tight coupling means that synchronization can be minimal since only one \gls{proc} can use the instance at a time, akin to the private instances approach. 
376 339 However, it differs in that revocation by the arbiter means this approach does not suffer from the deadlock scenario described above. … … 380 343 \item The current \gls{proc} does not hold an instance. 381 344 \item The current instance does not have sufficient SQEs to satisfy the request. 382 \item The current \gls{proc} has a wrong instance, this happens if the submitting \ gls{thrd}context-switched between allocation and submission, called \newterm{external submissions}.345 \item The current \gls{proc} has a wrong instance, this happens if the submitting \at context-switched between allocation and submission, called \newterm{external submissions}. 383 346 \end{enumerate} 384 347 However, even when the arbiter is not directly needed, \glspl{proc} need to make sure that their instance ownership is not being revoked, which is accomplished by a lock-\emph{less} handshake.\footnote{ 385 Note the handshake is not lock 348 Note the handshake is not lock-\emph{free} since it lacks the proper progress guarantee.} 386 349 A \gls{proc} raises a local flag before using its borrowed instance and checks if the instance is marked as revoked or if the arbiter has raised its flag. 387 350 If not, it proceeds, otherwise it delegates the operation to the arbiter. … … 389 352 390 353 Correspondingly, before revoking an instance, the arbiter marks the instance and then waits for the \gls{proc} using it to lower its local flag. 391 Only then does it reclaim the instance and potentially assign it to an 354 Only then does it reclaim the instance and potentially assign it to another \gls{proc}. 392 355 393 356 The arbiter maintains four lists around which it makes its decisions: … … 406 369 407 370 \paragraph{Pending Allocations} are handled by the arbiter when it has available instances and can directly hand over the instance and satisfy the request. 408 Otherwise, it must hold on to the list of threads until SQEs are made available again.371 Otherwise, it must hold on to the list of threads until SQEs are made available again. 409 372 This handling is more complex when an allocation requires multiple SQEs, since the arbiter must make a decision between satisfying requests in FIFO ordering or for fewer SQEs. 410 373 411 374 While an arbiter has the potential to solve many of the problems mentioned above, it also introduces a significant amount of complexity. 412 Tracking which processors are borrowing which instances and which instances have SQEs available ends -up adding a significant synchronization prelude to any I/O operation.375 Tracking which processors are borrowing which instances and which instances have SQEs available ends up adding a significant synchronization prelude to any I/O operation. 413 376 Any submission must start with a handshake that pins the currently borrowed instance, if available. 414 377 An attempt to allocate is then made, but the arbiter can concurrently be attempting to allocate from the same instance from a different \gls{hthrd}. 415 378 Once the allocation is completed, the submission must check that the instance is still burrowed before attempting to flush. 416 379 These synchronization steps turn out to have a similar cost to the multiple shared-instances approach. 
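A hedged sketch of this handshake using C11 atomics, with illustrative field names, is shown below; the store to the local flag must be ordered before the load of the revocation mark, hence sequential consistency on both sides.
\begin{cfa}
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct instance { atomic_bool revoked; /* ... rings ... */ };
struct proc_io  { atomic_bool in_use; struct instance * inst; };

// processor side: pin the borrowed instance for the duration of one operation
static inline bool try_pin( struct proc_io * p ) {
	atomic_store( &p->in_use, true );	// raise the local flag (sequentially consistent)
	if ( p->inst == NULL || atomic_load( &p->inst->revoked ) ) {
		atomic_store( &p->in_use, false );
		return false;	// delegate the operation to the arbiter instead
	}
	return true;	// safe to use the instance until unpin
}
static inline void unpin( struct proc_io * p ) { atomic_store( &p->in_use, false ); }

// arbiter side: mark the instance, then wait for the processor to finish its operation
static inline void revoke( struct proc_io * p ) {
	atomic_store( &p->inst->revoked, true );
	while ( atomic_load( &p->in_use ) ) {}	// not lock-free: waits on the processor
	// the instance can now be reassigned to another processor
}
\end{cfa}
Even when no revocation occurs, this prelude runs on every operation, which is where the extra cost comes from.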
417 Furthermore, if the number of instances does not match the number of processors actively submitting I/O, the system can fall into a state where instances are constantly being revoked and end -up cycling the processors, which leads to significant cache deterioration.380 Furthermore, if the number of instances does not match the number of processors actively submitting I/O, the system can fall into a state where instances are constantly being revoked and end up cycling the processors, which leads to significant cache deterioration. 418 381 For these reasons, this approach, which sounds promising on paper, does not improve on the private instance approach in practice. 419 420 \subsubsection{Private Instances V2}421 422 % Verbs of this design423 424 % Allocation: obtaining an sqe from which to fill in the io request, enforces the io instance to use since it must be the one which provided the sqe. Must interact with the arbiter if the instance does not have enough sqe for the allocation. (Typical allocation will ask for only one sqe, but chained sqe must be allocated from the same context so chains of sqe must be allocated in bulks)425 426 % Submission: simply adds the sqe(s) to some data structure to communicate that they are ready to go. This operation can't fail because there are as many spots in the submit buffer than there are sqes. Must interact with the arbiter only if the thread was moved between the allocation and the submission.427 428 % Flushing: Taking all the sqes that were submitted and making them visible to the kernel, also counting them in order to figure out what to_submit should be. Must be thread-safe with submission. Has to interact with the Arbiter if there are external submissions. Can't simply use a protected queue because adding to the array is not safe if the ring is still available for submitters. Flushing must therefore: check if there are external pending requests if so, ask the arbiter to flush otherwise use the fast flush operation.429 430 % Collect: Once the system call is done, it returns how many sqes were consumed by the system. These must be freed for allocation. Must interact with the arbiter to notify that things are now ready.431 432 % Handle: process all the produced cqe. No need to interact with any of the submission operations or the arbiter.433 434 435 % alloc():436 % proc.io->in_use = true, __ATOMIC_ACQUIRE437 % if cltr.io.flag || !proc.io || proc.io->flag:438 % return alloc_slow(cltr.io, proc.io)439 440 % a = alloc_fast(proc.io)441 % if a:442 % proc.io->in_use = false, __ATOMIC_RELEASE443 % return a444 445 % return alloc_slow(cltr.io)446 447 % alloc_fast()448 % left = proc.io->submit_q.free.tail - proc.io->submit_q.free.head449 % if num_entries - left < want:450 % return None451 452 % a = ready[head]453 % head = head + 1, __ATOMIC_RELEASE454 455 % alloc_slow()456 % cltr.io.flag = true, __ATOMIC_ACQUIRE457 % while(proc.io && proc.io->in_use) pause;458 459 460 461 % submit(a):462 % proc.io->in_use = true, __ATOMIC_ACQUIRE463 % if cltr.io.flag || proc.io != alloc.io || proc.io->flag:464 % return submit_slow(cltr.io)465 466 % submit_fast(proc.io, a)467 % proc.io->in_use = false, __ATOMIC_RELEASE468 469 % polling()470 % loop:471 % yield472 % flush()473 % io_uring_enter474 % collect475 % handle()476 382 477 383 \section{Interface} 478 384 The last important part of the \io subsystem is its interface. 
479 There are multiple approaches thatcan be offered to programmers, each with advantages and disadvantages.480 The new \io subsystem can replace the C runtime API or extend it, and in the lat er case, the interface can go from very similar to vastly different.385 Multiple approaches can be offered to programmers, each with advantages and disadvantages. 386 The new \io subsystem can replace the C runtime API or extend it, and in the latter case, the interface can go from very similar to vastly different. 481 387 The following sections discuss some useful options using @read@ as an example. 482 The standard Linux interface for C is 388 The standard Linux interface for C is: 483 389 \begin{cfa} 484 390 ssize_t read(int fd, void *buf, size_t count); … … 492 398 However, this approach also entails a plethora of subtle technical challenges, which generally boils down to making a perfect replacement. 493 399 If the \CFA interface replaces only \emph{some} of the calls to glibc, then this can easily lead to esoteric concurrency bugs. 494 Since the gcc ecosystem sdoes not offer a scheme for perfect replacement, this approach was rejected as being laudable but infeasible.400 Since the gcc ecosystem does not offer a scheme for perfect replacement, this approach was rejected as being laudable but infeasible. 495 401 496 402 \subsection{Synchronous Extension} … … 503 409 It comes with the caveat that any code attempting to use it must be recompiled, which is a problem considering the amount of existing legacy C binaries. 504 410 However, it has the advantage of implementation simplicity. 505 Finally, there is a certain irony to using a blocking synchronous interface sfor a feature often referred to as ``non-blocking'' \io.411 Finally, there is a certain irony to using a blocking synchronous interface for a feature often referred to as ``non-blocking'' \io. 506 412 507 413 \subsection{Asynchronous Extension} … … 531 437 This offers more flexibility to users wanting to fully utilize all of the @io_uring@ features. 532 438 However, it is not the most user-friendly option. 533 It obviously imposes a strong dependency between user code and @io_uring@ but at the same time restrict ingusers to usages that are compatible with how \CFA internally uses @io_uring@.439 It obviously imposes a strong dependency between user code and @io_uring@ but at the same time restricts users to usages that are compatible with how \CFA internally uses @io_uring@. -
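As a concrete illustration of the asynchronous-extension option above, an interface of roughly the following shape could be offered; the names @async_read@, @await@, and @io_future_t@ are invented for this sketch, and the eager implementation merely stands in for a real submission to the \io subsystem.
\begin{cfa}
#include <stdlib.h>
#include <unistd.h>

// Sketch of a future-returning read; names and layout are illustrative only.
typedef struct { ssize_t result; } io_future_t;

static io_future_t * async_read( int fd, void * buf, size_t count ) {
	io_future_t * f = (io_future_t *)malloc( sizeof(*f) );
	f->result = read( fd, buf, count );   // placeholder: a real runtime submits and returns immediately
	return f;
}
static ssize_t await( io_future_t * f ) { // a real runtime parks the calling user thread here
	ssize_t r = f->result;
	free( f );
	return r;
}

static void example( int fd ) {
	char b1[4096], b2[4096];
	io_future_t * f1 = async_read( fd, b1, sizeof(b1) );  // both operations can be in flight
	io_future_t * f2 = async_read( fd, b2, sizeof(b2) );  // before either result is needed
	ssize_t r1 = await( f1 ), r2 = await( f2 );
	(void)r1; (void)r2;
}
\end{cfa}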
doc/theses/thierry_delisle_PhD/thesis/text/practice.tex
rebf8ca5 r23a08aa0 16 16 } // delete 4 kernel threads 17 17 \end{cfa} 18 Dynamically allocated processors can be deleted a nany time, \ie their lifetime exceeds the block of creation.18 Dynamically allocated processors can be deleted at any time, \ie their lifetime exceeds the block of creation. 19 19 The consequence is that the scheduler and \io subsystems must know when these \procs come in and out of existence and roll them into the appropriate scheduling algorithms. 20 20 21 21 \section{Manual Resizing} 22 22 Manual resizing is expected to be a rare operation. 23 Programmers normally create/delete processors on a cluster sat startup/teardown.23 Programmers normally create/delete processors on a cluster at startup/teardown. 24 24 Therefore, dynamically changing the number of \procs is an appropriate moment to allocate or free resources to match the new state. 25 25 As such, all internal scheduling arrays that are sized based on the number of \procs need to be @realloc@ed. 26 This requirement also means any references into these arrays, \eg pointers or indexes, may need to be updated if elements are moved for compaction or forany other reason.26 This requirement also means any references into these arrays, \eg pointers or indexes, may need to be updated if elements are moved for compaction or any other reason. 27 27 28 28 There are no performance requirements, within reason, for resizing since it is expected to be rare. 29 29 However, this operation has strict correctness requirements since updating and idle sleep can easily lead to deadlocks. 30 It should also avoid as much as possible any effect on performance when the number of \procs remain constant.30 It should also avoid as much as possible any effect on performance when the number of \procs remains constant. 31 31 This later requirement prohibits naive solutions, like simply adding a global lock to the ready-queue arrays. 32 32 33 33 \subsection{Read-Copy-Update} 34 34 One solution is to use the Read-Copy-Update pattern~\cite{wiki:rcu}. 35 In this pattern, resizing is done by creating a copy of the internal data structures, \eg see Figure~\ref{fig:base-ts2}, updating the copy with the desired changes, and then attempt an Indiana Jones Switch to replace the original with the copy.35 In this pattern, resizing is done by creating a copy of the internal data structures, \eg see Figure~\ref{fig:base-ts2}, updating the copy with the desired changes, and then attempting an Indiana Jones Switch to replace the original with the copy. 36 36 This approach has the advantage that it may not need any synchronization to do the switch. 37 37 However, there is a race where \procs still use the original data structure after the copy is switched. 38 This race not only requires adding a memory-reclamation scheme, it also requires that operations made on the stale original version are eventually moved to the copy.38 This race not only requires adding a memory-reclamation scheme, but it also requires that operations made on the stale original version are eventually moved to the copy. 39 39 40 40 Specifically, the original data structure must be kept until all \procs have witnessed the change. … … 42 42 If all operations need synchronization, then the overall cost of this technique is likely to be similar to an uncontended lock approach. 43 43 In addition to the classic challenge of memory reclamation, transferring the original data to the copy before reclaiming it poses additional challenges. 
44 Especially merging sub queues while having a minimal impact on fairness and locality.45 46 For example, given a linked -list, having a node enqueued onto the original and new list is not necessarily a problem depending on the chosen list structure.44 Especially merging sub-queues while having a minimal impact on fairness and locality. 45 46 For example, given a linked list, having a node enqueued onto the original and new list is not necessarily a problem depending on the chosen list structure. 47 47 If the list supports arbitrary insertions, then inconsistencies in the tail pointer do not break the list; 48 48 however, ordering may not be preserved. … … 55 55 A simpler approach is to use a \newterm{Readers-Writer Lock}~\cite{wiki:rwlock}, where the resizing requires acquiring the lock as a writer while simply enqueueing/dequeuing \ats requires acquiring the lock as a reader. 56 56 Using a Readers-Writer lock solves the problem of dynamically resizing and leaves the challenge of finding or building a lock with sufficient good read-side performance. 57 Since this approach is not a very complex challenge and an ad -hoc solution is perfectly acceptable, building a Readers-Writer lock was the path taken.57 Since this approach is not a very complex challenge and an ad hoc solution is perfectly acceptable, building a Readers-Writer lock was the path taken. 58 58 59 59 To maximize reader scalability, readers should not contend with each other when attempting to acquire and release a critical section. 60 To achieve this goal requires each reader tohave its own memory to mark as locked and unlocked.61 The read 62 The write acquires the global lock, guaranteeing mutual exclusion among writers, and then acquires each of the local reader locks.63 Acquiring all the local read 60 Achieving this goal requires that each reader have its own memory to mark as locked and unlocked. 61 The read-acquire possibly waits for a writer to finish the critical section and then acquires a reader's local spinlock. 62 The writer acquires the global lock, guaranteeing mutual exclusion among writers, and then acquires each of the local reader locks. 63 Acquiring all the local read-locks guarantees mutual exclusion among the readers and the writer, while the wait on the read side prevents readers from continuously starving the writer. 64 64 Figure~\ref{f:SpecializedReadersWriterLock} shows the outline for this specialized readers-writer lock. 65 65 The lock in nonblocking, so both readers and writers spin while the lock is held. 66 This very wide sharding strategy means that readers have very good locality , since they only ever need to access two memory location.66 This very wide sharding strategy means that readers have very good locality since they only ever need to access two memory locations. 67 67 68 68 \begin{figure} … … 98 98 \section{Idle-Sleep}\label{idlesleep} 99 99 While manual resizing of \procs is expected to be rare, the number of \ats can vary significantly over an application's lifetime, which means there are times when there are too few or too many \procs. 100 For this work, it is the program er's responsibility to manually create \procs, so if there are too few \procs, the application must address this issue.100 For this work, it is the programmer's responsibility to manually create \procs, so if there are too few \procs, the application must address this issue. 101 101 This leaves too many \procs when there are not enough \ats for all the \procs to be useful. 
102 102 These idle \procs cannot be removed because their lifetime is controlled by the application, and only the application knows when the number of \ats may increase or decrease. … … 108 108 Because idle sleep is spurious, this data structure has strict performance requirements, in addition to strict correctness requirements. 109 109 Next, some mechanism is needed to block \glspl{kthrd}, \eg @pthread_cond_wait@ on a pthread semaphore. 110 The complexity here is to support \at parking and unparking, user-level locking, timers, \io operations, and all other \CFA features with minimal complexity.110 The complexity here is to support \at \glslink{atblock}{parking} and \glslink{atsched}{unparking}, user-level locking, timers, \io operations, and all other \CFA features with minimal complexity. 111 111 Finally, the scheduler needs a heuristic to determine when to block and unblock an appropriate number of \procs. 112 112 However, this third challenge is outside the scope of this thesis because developing a general heuristic is complex enough to justify its own work. 113 113 Therefore, the \CFA scheduler simply follows the ``Race-to-Idle''~\cite{Albers12} approach where a sleeping \proc is woken any time a \at becomes ready and \procs go to idle sleep anytime they run out of work. 114 114 115 An interesting sub -part of this heuristic is what to do with bursts of \ats that become ready.116 Since waking up a sleeping \proc can have notable latency, it is possible multiple \atsbecome ready while a single \proc is waking up.117 This fact sbegs the question, if many \procs are available, how many should be woken?118 If the ready \ats will run longer than the wake-up latency, waking one \proc per \at will offer maximum paralleli sation.119 If the ready \ats will run for a shortvery short time, waking many \procs may be wasteful.115 An interesting subpart of this heuristic is what to do with bursts of \ats that become ready. 116 Since waking up a sleeping \proc can have notable latency, multiple \ats may become ready while a single \proc is waking up. 117 This fact begs the question, if many \procs are available, how many should be woken? 118 If the ready \ats will run longer than the wake-up latency, waking one \proc per \at will offer maximum parallelization. 119 If the ready \ats will run for a very short time, waking many \procs may be wasteful. 120 120 As mentioned, a heuristic to handle these complex cases is outside the scope of this thesis, the behaviour of the scheduler in this particular case is left unspecified. 121 121 122 122 \section{Sleeping} 123 As usual, the corner -stone of any feature related to the kernel is the choice of system call.123 As usual, the cornerstone of any feature related to the kernel is the choice of system call. 124 124 In terms of blocking a \gls{kthrd} until some event occurs, the Linux kernel has many available options. 125 125 126 126 \subsection{\lstinline{pthread_mutex}/\lstinline{pthread_cond}} 127 The classic option is to use some combination of the pthread mutual exclusion and synchronization locks, allowing a safe park/unpark of a \gls{kthrd} to/from a @pthread_cond@.127 The classic option is to use some combination of the pthread mutual exclusion and synchronization locks, allowing a safe \park/\unpark of a \gls{kthrd} to/from a @pthread_cond@. 128 128 While this approach works for \glspl{kthrd} waiting among themselves, \io operations do not provide a mechanism to signal @pthread_cond@s. 
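A minimal, generic sketch of this classic approach, using hypothetical names (@kt_park@, @kt_unpark@) rather than the \CFA runtime's code, is:
\begin{cfa}
#include <pthread.h>
#include <stdbool.h>

// Generic park/unpark over a pthread mutex and condition;
// the flag prevents a lost wake-up when kt_unpark runs before kt_park.
struct parker {
	pthread_mutex_t lock;
	pthread_cond_t  cond;
	bool            notified;
};

static void kt_park( struct parker * p ) {
	pthread_mutex_lock( &p->lock );
	while ( ! p->notified ) pthread_cond_wait( &p->cond, &p->lock );
	p->notified = false;
	pthread_mutex_unlock( &p->lock );
}

static void kt_unpark( struct parker * p ) {
	pthread_mutex_lock( &p->lock );
	p->notified = true;
	pthread_cond_signal( &p->cond );
	pthread_mutex_unlock( &p->lock );
}
\end{cfa}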
129 For \io results to wake a \proc waiting on a @pthread_cond@ means a different \gls pl{kthrd} must be woken up first, which then signals the \proc.129 For \io results to wake a \proc waiting on a @pthread_cond@ means a different \gls{kthrd} must be woken up first, which then signals the \proc. 130 130 131 131 \subsection{\lstinline{io_uring} and Epoll} … … 137 137 138 138 \subsection{Event FDs} 139 Another interesting approach is to use an event file descriptor\cite{ eventfd}.139 Another interesting approach is to use an event file descriptor\cite{MAN:eventfd}. 140 140 This Linux feature is a file descriptor that behaves like \io, \ie, uses @read@ and @write@, but also behaves like a semaphore. 141 Indeed, all reads and writes must use aword-sized values, \ie 64 or 32 bits.142 Writes \emph{add} their values to a buffer using arithmetic addition versus buffer append, and reads zero 141 Indeed, all reads and writes must use word-sized values, \ie 64 or 32 bits. 142 Writes \emph{add} their values to a buffer using arithmetic addition versus buffer append, and reads zero-out the buffer and return the buffer values so far.\footnote{ 143 143 This behaviour is without the \lstinline{EFD_SEMAPHORE} flag, which changes the behaviour of \lstinline{read} but is not needed for this work.} 144 144 If a read is made while the buffer is already 0, the read blocks until a non-0 value is added. … … 148 148 149 149 \section{Tracking Sleepers} 150 Tracking which \procs are in idle sleep requires a data structure holding all the sleeping \procs, but more importantly it requires a concurrent \emph{handshake} so that no \at is stranded on a ready-queue with no active \proc.150 Tracking which \procs are in idle sleep requires a data structure holding all the sleeping \procs, but more importantly, it requires a concurrent \emph{handshake} so that no \at is stranded on a ready queue with no active \proc. 151 151 The classic challenge occurs when a \at is made ready while a \proc is going to sleep: there is a race where the new \at may not see the sleeping \proc and the sleeping \proc may not see the ready \at. 152 152 Since \ats can be made ready by timers, \io operations, or other events outside a cluster, this race can occur even if the \proc going to sleep is the only \proc awake. … … 154 154 155 155 The handshake closing the race is done with both the notifier and the idle \proc executing two ordered steps. 156 The notifier first make sure the newly ready \at is visible to \procs searching for \ats, and then attemptto notify an idle \proc.156 The notifier first makes sure the newly ready \at is visible to \procs searching for \ats, and then attempts to notify an idle \proc. 157 157 On the other side, \procs make themselves visible as idle \procs and then search for any \ats they may have missed. 158 158 Unlike regular work-stealing, this search must be exhaustive to make sure that pre-existing \at is missed. 159 159 These steps from both sides guarantee that if the search misses a newly ready \at, then the notifier is guaranteed to see at least one idle \proc. 160 Convers ly, if the notifier does not see any idle \proc, then a \proc is guaranteed to find the new \at in its exhaustive search.160 Conversely, if the notifier does not see any idle \proc, then a \proc is guaranteed to find the new \at in its exhaustive search. 161 161 162 162 Furthermore, the ``Race-to-Idle'' approach means that there may be contention on the data structure tracking sleepers. 
163 Contention can be tolerated for \procs attempting to sleep or wake -up because these \procs are not doing useful work, and therefore, not contributing to overall performance.163 Contention can be tolerated for \procs attempting to sleep or wake up because these \procs are not doing useful work, and therefore, not contributing to overall performance. 164 164 However, notifying, checking if a \proc must be woken-up, and doing so if needed, can significantly affect overall performance and must be low cost. 165 165 166 166 \subsection{Sleepers List} 167 167 Each cluster maintains a list of idle \procs, organized as a stack. 168 This ordering allows \procs at the tail to stay in idle sleep for extended period of times while those at the head of the list wake-up for bursts of activity.168 This ordering allows \procs at the tail to stay in idle sleep for extended periods while those at the head of the list wake up for bursts of activity. 169 169 Because of unbalanced performance requirements, the algorithm tracking sleepers is designed to have idle \procs handle as much of the work as possible. 170 170 The idle \procs maintain the stack of sleepers among themselves and notifying a sleeping \proc takes as little work as possible. … … 173 173 174 174 This approach also simplifies notification. 175 Indeed, \procs not only need to be notif y when a new \at is readied, but also mustbe notified during manual resizing, so the \gls{kthrd} can be joined.175 Indeed, \procs not only need to be notified when a new \at is readied, but must also be notified during manual resizing, so the \gls{kthrd} can be joined. 176 176 These requirements mean whichever entity removes idle \procs from the sleeper list must be able to do so in any order. 177 177 Using a simple lock over this data structure makes the removal much simpler than using a lock-free data structure. 178 The single lock also means the notification process simply needs to wake -up the desired idle \proc, using @pthread_cond_signal@, @write@ on an @fd@, \etc, and the \proc handles the rest.178 The single lock also means the notification process simply needs to wake up the desired idle \proc, using @pthread_cond_signal@, @write@ on an @fd@, \etc, and the \proc handles the rest. 179 179 180 180 \subsection{Reducing Latency} 181 As mentioned in this section, \procs going to sleep for extremely short periods of timeis likely in certain scenarios.182 Therefore, the latency of doing a system call to read from and writ ing to an event @fd@ can negatively affect overall performance in a notable way.181 As mentioned in this section, \procs going to sleep for extremely short periods is likely in certain scenarios. 182 Therefore, the latency of doing a system call to read from and write to an event @fd@ can negatively affect overall performance notably. 183 183 Hence, it is important to reduce latency and contention of the notification as much as possible. 184 184 Figure~\ref{fig:idle1} shows the basic idle-sleep data structure. … … 205 205 The woken \proc then updates the atomic pointer, while it is updating the head of the list, as it removes itself from the list. 206 206 Notifiers that obtained a @NULL@ in the exchange simply move on knowing that another notifier is already waking a \proc. 
207 This behaviour is equivalent to having multiple notifier write to the @fd@ since reads consume all previous writes.208 Note that with and without this atomic pointer, bursts of notification can lead to an unspecified number of \procs being woken up, depending on how the arrival notification compares with tthe latency of \procs waking up.207 This behaviour is equivalent to having multiple notifiers write to the @fd@ since reads consume all previous writes. 208 Note that with and without this atomic pointer, bursts of notification can lead to an unspecified number of \procs being woken up, depending on how the arrival notification compares with the latency of \procs waking up. 209 209 As mentioned in section~\ref{idlesleep}, there is no optimal approach to handle these bursts. 210 210 It is therefore difficult to justify the cost of any extra synchronization here. … … 218 218 219 219 The next optimization is to avoid the latency of the event @fd@, which can be done by adding what is effectively a binary benaphore\cite{schillings1996engineering} in front of the event @fd@. 220 The benaphore over the event @fd@ logically provides a three state flag to avoid unnecessary system calls, where the states are expressed explicitin Figure~\ref{fig:idle:state}.221 A \proc begins its idle sleep by adding itself to the idle list before searching for a n\at.220 The benaphore over the event @fd@ logically provides a three-state flag to avoid unnecessary system calls, where the states are expressed explicitly in Figure~\ref{fig:idle:state}. 221 A \proc begins its idle sleep by adding itself to the idle list before searching for a \at. 222 222 In the process of adding itself to the idle list, it sets the state flag to @SEARCH@. 223 223 If no \ats can be found during the search, the \proc then confirms it is going to sleep by atomically swapping the state to @SLEEP@. 224 224 If the previous state is still @SEARCH@, then the \proc does read the event @fd@. 225 Meanwhile, notifiers atomically exchange the state to @AWAKE@ state.225 Meanwhile, notifiers atomically exchange the state to the @AWAKE@ state. 226 226 If the previous state is @SLEEP@, then the notifier must write to the event @fd@. 227 227 However, if the notify arrives almost immediately after the \proc marks itself idle, then both reads and writes on the event @fd@ can be omitted, which reduces latency notably. 228 These extensions lead sto the final data structure shown in Figure~\ref{fig:idle}.228 These extensions lead to the final data structure shown in Figure~\ref{fig:idle}. 229 229 230 230 \begin{figure} 231 231 \centering 232 232 \input{idle_state.pstex_t} 233 \caption[Improved Idle-Sleep Latency]{Improved Idle-Sleep Latency \smallskip\newline A three 233 \caption[Improved Idle-Sleep Latency]{Improved Idle-Sleep Latency \smallskip\newline A three-state flag is added to the event \lstinline{fd}.} 234 234 \label{fig:idle:state} 235 235 \end{figure} -
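The three-state protocol over the event @fd@ can be sketched as follows; the helper names (@idle_sleep@, @idle_notify@) and the reduced structure are assumptions of this sketch, which omits the idle list and error handling and shows only the flag/event-@fd@ interaction.
\begin{cfa}
#include <stdint.h>
#include <unistd.h>
#include <sys/eventfd.h>

// Illustrative three-state benaphore in front of an event fd.
enum idle_state { SEARCH, SLEEP, AWAKE };

struct idle_proc {
	volatile enum idle_state state;
	int evfd;                        // from eventfd(0, 0)
};

// proc side: called after registering on the idle list and failing to find work
static void idle_sleep( struct idle_proc * p ) {
	enum idle_state expected = SEARCH;
	// confirm the intent to sleep; a concurrent notifier may have flipped the state already
	if ( __atomic_compare_exchange_n( &p->state, &expected, SLEEP, false,
	                                  __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST ) ) {
		uint64_t val;
		read( p->evfd, &val, sizeof(val) );  // block only if no notification has arrived
	}
	p->state = SEARCH;                       // simplified reset; omits removal from the idle list
}

// notifier side: called after making a new task visible
static void idle_notify( struct idle_proc * p ) {
	enum idle_state prev = __atomic_exchange_n( &p->state, AWAKE, __ATOMIC_SEQ_CST );
	if ( prev == SLEEP ) {                   // only pay for the system call if the proc actually slept
		uint64_t one = 1;
		write( p->evfd, &one, sizeof(one) );
	}
}
\end{cfa}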
doc/theses/thierry_delisle_PhD/thesis/text/runtime.tex
rebf8ca5 r23a08aa0 4 4 \section{C Threading} 5 5 6 \Celeven introduced threading features, such the @_Thread_local@ storage class, and libraries @stdatomic.h@ and @threads.h@.6 \Celeven introduced threading features, such as the @_Thread_local@ storage class, and libraries @stdatomic.h@ and @threads.h@. 7 7 Interestingly, almost a decade after the \Celeven standard, the most recent versions of gcc, clang, and msvc do not support the \Celeven include @threads.h@, indicating no interest in the C11 concurrency approach (possibly because of the recent effort to add concurrency to \CC). 8 8 While the \Celeven standard does not state a threading model, the historical association with pthreads suggests implementations would adopt kernel-level threading (1:1)~\cite{ThreadModel}, as for \CC. … … 13 13 \section{M:N Threading}\label{prev:model} 14 14 15 Threading in \CFA is based on \Gls{uthrding}, where \ glspl{thrd} are the representation of a unit of work. As such, \CFA programmers should expect these units to be fairly inexpensive, \ie programmers should be able to create a large number of \glspl{thrd} and switch among \glspl{thrd} liberally without many concerns for performance.15 Threading in \CFA is based on \Gls{uthrding}, where \ats are the representation of a unit of work. As such, \CFA programmers should expect these units to be fairly inexpensive, \ie programmers should be able to create a large number of \ats and switch among \ats liberally without many performance concerns. 16 16 17 The \CFA M:N threading model sis implemented using many user-level threads mapped onto fewer \glspl{kthrd}.17 The \CFA M:N threading model is implemented using many user-level threads mapped onto fewer \glspl{kthrd}. 18 18 The user-level threads have the same semantic meaning as a \glspl{kthrd} in the 1:1 model: they represent an independent thread of execution with its own stack. 19 The difference is that user-level threads do not have a corresponding object in the kernel; they are handled by the runtime in user space and scheduled onto \glspl{kthrd}, referred to as \glspl{proc} in this document. \Glspl{proc} run a \ gls{thrd} until it context switches out, it then chooses a different \gls{thrd}to run.19 The difference is that user-level threads do not have a corresponding object in the kernel; they are handled by the runtime in user space and scheduled onto \glspl{kthrd}, referred to as \glspl{proc} in this document. \Glspl{proc} run a \at until it context switches out, it then chooses a different \at to run. 20 20 21 21 \section{Clusters} 22 22 \CFA allows the option to group user-level threading, in the form of clusters. 23 Both \ glspl{thrd}and \glspl{proc} belong to a specific cluster.24 \Glspl{ thrd} are only scheduled onto \glspl{proc} in the same cluster and scheduling is done independently of other clusters.23 Both \ats and \glspl{proc} belong to a specific cluster. 24 \Glspl{at} are only scheduled onto \glspl{proc} in the same cluster and scheduling is done independently of other clusters. 25 25 Figure~\ref{fig:system} shows an overview of the \CFA runtime, which allows programmers to tightly control parallelism. 26 26 It also opens the door to handling effects like NUMA, by pinning clusters to a specific NUMA node\footnote{This capability is not currently implemented in \CFA, but the only hurdle left is creating a generic interface for CPU masks.}. 
… … 30 30 \input{system.pstex_t} 31 31 \end{center} 32 \caption[Overview of the \CFA runtime]{Overview of the \CFA runtime \newline \Glspl{ thrd} are scheduled inside a particular cluster and run on the \glspl{proc} that belong to the cluster. The discrete-event manager, which handles preemption and timeout, is a \gls{proc} that lives outside any cluster and does not run \glspl{thrd}.}32 \caption[Overview of the \CFA runtime]{Overview of the \CFA runtime \newline \Glspl{at} are scheduled inside a particular cluster and run on the \glspl{proc} that belong to the cluster. The discrete-event manager, which handles preemption and timeout, is a \gls{proc} that lives outside any cluster and does not run \ats.} 33 33 \label{fig:system} 34 34 \end{figure} … … 38 38 39 39 \section{\glsxtrshort{io}}\label{prev:io} 40 Prior to this work, the \CFA runtime did not add any particular support for \glsxtrshort{io} operations. While all \glsxtrshort{io} operations available in C are available in \CFA, \glsxtrshort{io} operations are designed for the POSIX threading model~\cite{pthreads}. Using these 1:1 threading operations in an M:N threading model means \glsxtrshort{io} operations block \glspl{proc} instead of \ glspl{thrd}. While this can work in certain cases, it limits the number of concurrent operations to the number of \glspl{proc} rather than \glspl{thrd}. It also means deadlock can occur because all \glspl{proc} are blocked even if at least one \gls{thrd}is ready to run. A simple example of this type of deadlock would be as follows:40 Prior to this work, the \CFA runtime did not add any particular support for \glsxtrshort{io} operations. While all \glsxtrshort{io} operations available in C are available in \CFA, \glsxtrshort{io} operations are designed for the POSIX threading model~\cite{pthreads}. Using these 1:1 threading operations in an M:N threading model means \glsxtrshort{io} operations block \glspl{proc} instead of \ats. While this can work in certain cases, it limits the number of concurrent operations to the number of \glspl{proc} rather than \ats. It also means deadlock can occur because all \glspl{proc} are blocked even if at least one \at is ready to run. A simple example of this type of deadlock would be as follows: 41 41 42 42 \begin{quote} 43 Given a simple network program with 2 \ glspl{thrd} and a single \gls{proc}, one \gls{thrd} sends network requests to a server and the other \gls{thrd}waits for a response from the server.44 If the second \ gls{thrd}races ahead, it may wait for responses to requests that have not been sent yet.45 In theory, this should not be a problem, even if the second \ gls{thrd} waits, because the first \gls{thrd}is still ready to run and should be able to get CPU time to send the request.46 With M:N threading, while the first \ gls{thrd} is ready, the lone \gls{proc} \emph{cannot} run the first \gls{thrd} if it is blocked in the \glsxtrshort{io} operation of the second \gls{thrd}.47 If this happen , the system is in a synchronization deadlock\footnote{In this example, the deadlock could be resolved if the server sends unprompted messages to the client.43 Given a simple network program with 2 \ats and a single \gls{proc}, one \at sends network requests to a server and the other \at waits for a response from the server. 44 If the second \at races ahead, it may wait for responses to requests that have not been sent yet. 
45 In theory, this should not be a problem, even if the second \at waits, because the first \at is still ready to run and should be able to get CPU time to send the request. 46 With M:N threading, while the first \at is ready, the lone \gls{proc} \emph{cannot} run the first \at if it is blocked in the \glsxtrshort{io} operation of the second \at. 47 If this happens, the system is in a synchronization deadlock\footnote{In this example, the deadlock could be resolved if the server sends unprompted messages to the client. 48 48 However, this solution is neither general nor appropriate even in this simple case.}. 49 49 \end{quote} 50 50 51 Therefore, one of the objective of this work is to introduce \emph{User-Level \glsxtrshort{io}}, which like \glslink{uthrding}{User-Level \emph{Threading}}, blocks \glspl{thrd} rather than \glspl{proc} when doing \glsxtrshort{io} operations.52 This feature entails multiplexing the \glsxtrshort{io} operations of many \ glspl{thrd}onto fewer \glspl{proc}.51 Therefore, one of the objectives of this work is to introduce \emph{User-Level \glsxtrshort{io}}, which like \glslink{uthrding}{User-Level \emph{Threading}}, blocks \ats rather than \glspl{proc} when doing \glsxtrshort{io} operations. 52 This feature entails multiplexing the \glsxtrshort{io} operations of many \ats onto fewer \glspl{proc}. 53 53 The multiplexing requires a single \gls{proc} to execute multiple \glsxtrshort{io} operations in parallel. 54 54 This requirement cannot be done with operations that block \glspl{proc}, \ie \glspl{kthrd}, since the first operation would prevent starting new operations for its blocking duration. … … 60 60 All functions defined by this volume of POSIX.1-2017 shall be thread-safe, except that the following functions need not be thread-safe. ... (list of 70+ excluded functions) 61 61 \end{quote} 62 Only UNIX @man@ pages identify whether or not a library function is threadsafe, and hence, may block on a pthreads lock or system call; hence interoperability with UNIX library functions is a challenge for an M:N threading model.62 Only UNIX @man@ pages identify whether a library function is thread-safe, and hence, may block on a pthreads lock or system call; hence interoperability with UNIX library functions is a challenge for an M:N threading model. 63 63 64 64 Languages like Go and Java, which have strict interoperability with C\cite{wiki:jni,go:cgo}, can control operations in C by ``sandboxing'' them, \eg a blocking function may be delegated to a \gls{kthrd}. Sandboxing may help towards guaranteeing that the kind of deadlock mentioned above does not occur. … … 72 72 Therefore, it is possible calls to an unknown library function can block a \gls{kthrd} leading to deadlocks in \CFA's M:N threading model, which would not occur in a traditional 1:1 threading model. 73 73 Currently, all M:N thread systems interacting with UNIX without sandboxing suffer from this problem but manage to work very well in the majority of applications. 74 Therefore, a complete solution to this problem is outside the scope of this thesis.\footnote{\CFA does provide a pthreads emulation, so any library function using embedded pthreads locks areredirected to \CFA user-level locks. This capability further reduces the chances of blocking a \gls{kthrd}.}74 Therefore, a complete solution to this problem is outside the scope of this thesis.\footnote{\CFA does provide a pthreads emulation, so any library function using embedded pthreads locks is redirected to \CFA user-level locks. 
This capability further reduces the chances of blocking a \gls{kthrd}.} -
doc/theses/thierry_delisle_PhD/thesis/thesis.tex
rebf8ca5 r23a08aa0 84 84 \usepackage{subcaption} 85 85 \usepackage{comment} % Removes large sections of the document. 86 \usepackage{array} 86 87 87 88 % Hyperlinks make it very easy to navigate an electronic document. … … 210 211 \newcommand\proc{\gls{proc}\xspace}% 211 212 \newcommand\procs{\glspl{proc}\xspace}% 213 \newcommand\park{\glslink{atblock}{park}\xspace}% 214 \newcommand\unpark{\glslink{atsched}{unpark}\xspace}% 212 215 213 216 %====================================================================== -
driver/Makefile.am
rebf8ca5 r23a08aa0 19 19 20 20 # applies to both programs 21 AM_CXXFLAGS = @HOST_FLAGS@ -Wall -Wextra -Werror=return-type -O2 -g -std=c++14 -I${abs_top_srcdir}/src -I${abs_top_srcdir}/src/include 21 AM_CXXFLAGS = @HOST_FLAGS@ -Wall -Wextra -Werror=return-type -O2 -g -std=c++17 -I${abs_top_srcdir}/src -I${abs_top_srcdir}/src/include 22 22 23 23 # don't install cfa directly -
driver/cfa.cc
rebf8ca5 r23a08aa0 53 53 return arg.substr( 0, pre.size() ) == pre; 54 54 } // prefix 55 56 static inline bool ends_with(const string & str, const string & sfix) {57 if (sfix.size() > str.size()) return false;58 return std::equal(str.rbegin(), str.rbegin() + sfix.size(), sfix.rbegin(), sfix.rend());59 }60 55 61 56 // check if string has suffix -
libcfa/prelude/Makefile.am
rebf8ca5 r23a08aa0 50 50 51 51 prelude.cfa : prelude-gen.cc 52 ${AM_V_GEN}${CXX} ${AM_CXXFLAGS} ${CXXFLAGS} ${AM_CFLAGS} ${<} -o prelude-gen -Wall -Wextra -O2 -g -std=c++14 52 ${AM_V_GEN}${CXX} ${AM_CXXFLAGS} ${CXXFLAGS} ${AM_CFLAGS} ${<} -o prelude-gen -Wall -Wextra -O2 -g -std=c++17 53 53 @./prelude-gen > ${@} 54 54 @rm ./prelude-gen … … 76 76 77 77 if ENABLE_DISTCC 78 distribution: @LOCAL_CFACC@ @LOCAL_CC1@ @CFACPP@ gcc-builtins.cf builtins.cf extras.cf prelude.cfa bootloader.c $(srcdir)/../../tools/build/push2dist.sh 78 distribution: @LOCAL_CFACC@ @LOCAL_CC1@ @CFACPP@ defines.hfa gcc-builtins.cf builtins.cf extras.cf prelude.cfa bootloader.c $(srcdir)/../../tools/build/push2dist.sh 79 79 ${AM_V_GEN}$(srcdir)/../../tools/build/push2dist.sh @CFADIR_HASH@ @DIST_BWLIMIT@ 80 80 @echo "Dummy file to track distribution to remote hosts" > ${@} -
libcfa/prelude/defines.hfa.in
rebf8ca5 r23a08aa0 141 141 142 142 /* Defined if io_uring support is present when compiling libcfathread and 143 supports the flag IORING_REGISTER_IOWQ_MAX_WORKERS. */ 144 #undef CFA_HAVE_IORING_REGISTER_IOWQ_MAX_WORKERS 145 146 /* Defined if io_uring support is present when compiling libcfathread and 143 147 supports the flag IORING_SETUP_ATTACH_WQ. */ 144 148 #undef CFA_HAVE_IORING_SETUP_ATTACH_WQ -
libcfa/src/Makefile.am
rebf8ca5 r23a08aa0 186 186 if ENABLE_DISTCC 187 187 188 ../prelude/distribution: @LOCAL_CFACC@ @LOCAL_CC1@ @CFACPP@ ../prelude/ gcc-builtins.cf ../prelude/builtins.cf ../prelude/extras.cf ../prelude/prelude.cfa ../prelude/bootloader.c $(srcdir)/../../tools/build/push2dist.sh188 ../prelude/distribution: @LOCAL_CFACC@ @LOCAL_CC1@ @CFACPP@ ../prelude/defines.hfa ../prelude/gcc-builtins.cf ../prelude/builtins.cf ../prelude/extras.cf ../prelude/prelude.cfa ../prelude/bootloader.c $(srcdir)/../../tools/build/push2dist.sh 189 189 @+make -C ../prelude distribution 190 190 -
libcfa/src/bits/defs.hfa
rebf8ca5 r23a08aa0 24 24 #define likely(x) __builtin_expect(!!(x), 1) 25 25 #define unlikely(x) __builtin_expect(!!(x), 0) 26 #define thread_local _Thread_local27 26 28 27 typedef void (*fptr_t)(); … … 37 36 #endif 38 37 38 39 #if defined(__has_attribute) 40 #if !__has_attribute(__noclone__) 41 #define ATTRIBUTE_NOCLONE 42 #endif 43 #endif 44 #ifndef ATTRIBUTE_NOCLONE 45 #define ATTRIBUTE_NOCLONE __attribute__((__noclone__)) 46 #endif 47 39 48 #define libcfa_public __attribute__((visibility("default"))) 49 #define libcfa_nopreempt __attribute__((section("cfatext_nopreempt"))) __attribute__((__noinline__)) ATTRIBUTE_NOCLONE 50 51 struct __cfa_nopreempt_region { 52 void * start; 53 void * stop; 54 }; 40 55 41 56 #ifdef __cforall -
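A note on the technique used above: functions are placed in a named ELF section and the section is bounded by the linker-generated __start_<section>/__stop_<section> symbols. A self-contained illustration, with a hypothetical section name (mysec_nopreempt), might look as follows; it is only a sketch of the mechanism GNU ld provides, not CFA code.

// Minimal illustration of bounding a set of functions with a named ELF section.
// GNU ld automatically provides __start_<name> and __stop_<name> for sections
// whose names are valid C identifiers.
#include <stdbool.h>
#include <stdio.h>

#define NOPREEMPT __attribute__((section("mysec_nopreempt"), noinline))

NOPREEMPT static int sensitive( int x ) { return x + 1; }

extern const char __start_mysec_nopreempt[];
extern const char __stop_mysec_nopreempt[];

// Range test over the protected section, similar in spirit to __cfaabi_in() above.
static bool in_nopreempt( const void * ip ) {
	const char * p = (const char *)ip;
	return p >= __start_mysec_nopreempt && p < __stop_mysec_nopreempt;
}

int main( void ) {
	printf( "%d %d\n", sensitive( 41 ), in_nopreempt( (const void *)&sensitive ) );
	return 0;
}

The __start_/__stop_ symbols are only emitted for section names that are valid C identifiers, which is one reason such section names avoid dots.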
libcfa/src/bits/locks.hfa
rebf8ca5 r23a08aa0 13 13 // Created On : Tue Oct 31 15:14:38 2017 14 14 // Last Modified By : Peter A. Buhr 15 // Last Modified On : Sat Aug 27 15:06:39 2022 16 // Update Count : 15 15 // Last Modified On : Mon Sep 19 18:39:45 2022 16 // Update Count : 16 17 17 //
libcfa/src/concurrency/io/call.cfa.in
rebf8ca5 r23a08aa0 202 202 struct io_context$ * ctx = cfa_io_allocate( &sqe, &idx, 1 ); 203 203 204 memset(sqe, 0, sizeof(*sqe)); 204 205 sqe->opcode = IORING_OP_{op}; 206 sqe->flags = sflags; 205 207 sqe->user_data = (uintptr_t)&future; 206 sqe->flags = sflags; 207 sqe->ioprio = 0; 208 sqe->fd = 0; 209 sqe->off = 0; 210 sqe->addr = 0; 211 sqe->len = 0; 212 sqe->fsync_flags = 0; 213 sqe->__pad2[0] = 0; 214 sqe->__pad2[1] = 0; 215 sqe->__pad2[2] = 0;{body} 208 {body} 216 209 217 210 asm volatile("": : :"memory"); -
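The hunk above replaces field-by-field zeroing with a single memset followed by setting only the fields the request needs. For reference, the same pattern in plain C against the kernel's io_uring_sqe layout looks roughly like this; prep_read is a made-up helper name (analogous to liburing's io_uring_prep_read), not part of the runtime.

#include <string.h>
#include <stdint.h>
#include <linux/io_uring.h>

// Zero the whole SQE once, then set only the fields the operation needs.
static void prep_read( struct io_uring_sqe * sqe, int fd, void * buf,
                       unsigned nbytes, uint64_t offset, uint64_t user_data ) {
	memset( sqe, 0, sizeof(*sqe) );            // all flags and padding start cleared
	sqe->opcode    = IORING_OP_READ;
	sqe->fd        = fd;
	sqe->off       = offset;
	sqe->addr      = (uint64_t)(uintptr_t)buf;
	sqe->len       = nbytes;
	sqe->user_data = user_data;                // echoed back in the matching CQE
}

Zeroing the whole structure once also keeps the preparation correct as newer kernels add SQE fields.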
libcfa/src/concurrency/io/setup.cfa
rebf8ca5 r23a08aa0 228 228 229 229 #if !defined(CFA_WITH_IO_URING_IDLE) 230 { 230 231 // Step 4 : eventfd 231 232 __cfadbg_print_safe(io_core, "Kernel I/O : registering %d for completion with ring %d\n", procfd, fd); … … 237 238 238 239 __cfadbg_print_safe(io_core, "Kernel I/O : registered %d for completion with ring %d\n", procfd, fd); 239 #endif 240 240 } 241 #endif 242 243 // TODO: implement a proper version of this. 244 // I have not found a better maximum that works in general but users should be able to configure it 245 // the same way they configure other I/O options 241 246 // #if defined(CFA_HAVE_IORING_REGISTER_IOWQ_MAX_WORKERS) 247 // { 242 248 // // Step 5 : max worker count 243 249 // __cfadbg_print_safe(io_core, "Kernel I/O : lmiting max workers for ring %d\n", fd); … … 252 258 253 259 // __cfadbg_print_safe(io_core, "Kernel I/O : lmited max workers for ring %d\n", fd); 260 // } 254 261 // #endif 255 262 -
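For reference, the two registration steps mentioned above map onto the io_uring_register system call roughly as sketched below; the wrapper names are invented, error handling is omitted, and IORING_REGISTER_IOWQ_MAX_WORKERS is only available on kernels new enough to define it, hence the feature guard in the build.

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

// Register an eventfd so completions on the ring signal it.
static int ring_register_eventfd( int ring_fd, int event_fd ) {
	return syscall( __NR_io_uring_register, ring_fd,
	                IORING_REGISTER_EVENTFD, &event_fd, 1 );
}

// Cap the number of io-wq workers the ring may spawn (kernel 5.15+).
static int ring_limit_workers( int ring_fd, unsigned bounded, unsigned unbounded ) {
	unsigned counts[2] = { bounded, unbounded };  // [0] = bounded, [1] = unbounded workers
	return syscall( __NR_io_uring_register, ring_fd,
	                IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2 );
}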
libcfa/src/concurrency/kernel/cluster.hfa
rebf8ca5 r23a08aa0 63 63 } 64 64 } 65 return (max + 2 * max) / 2;65 return 8 * max; 66 66 } 67 67 -
libcfa/src/concurrency/kernel/fwd.hfa
rebf8ca5 r23a08aa0 35 35 extern "C" { 36 36 extern "Cforall" { 37 extern __attribute__((aligned(64))) thread_localstruct KernelThreadData {37 extern __attribute__((aligned(64))) __thread struct KernelThreadData { 38 38 struct thread$ * volatile this_thread; 39 39 struct processor * volatile this_processor; … … 179 179 // Similar to a binary semaphore with a 'one shot' semantic 180 180 // is expected to be discarded after each party call their side 181 enum(struct thread$ *) { oneshot_ARMED = 0p, oneshot_FULFILLED = 1p }; 181 182 struct oneshot { 182 183 // Internal state : 183 // 0p : is initial state (wait will block)184 // 1p : fulfilled (wait won't block)184 // armed : initial state, wait will block 185 // fulfilled : wait won't block 185 186 // any thread : a thread is currently waiting 186 187 struct thread$ * volatile ptr; … … 189 190 static inline { 190 191 void ?{}(oneshot & this) { 191 this.ptr = 0p;192 this.ptr = oneshot_ARMED; 192 193 } 193 194 … … 199 200 for() { 200 201 struct thread$ * expected = this.ptr; 201 if(expected == 1p) return false;202 if(expected == oneshot_FULFILLED) return false; 202 203 if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) { 203 204 park(); 204 /* paranoid */ verify( this.ptr == 1p);205 /* paranoid */ verify( this.ptr == oneshot_FULFILLED ); 205 206 return true; 206 207 } … … 211 212 // return true if a thread was unparked 212 213 thread$ * post(oneshot & this, bool do_unpark = true) { 213 struct thread$ * got = __atomic_exchange_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);214 if( got == 0p || got == 1p) return 0p;214 struct thread$ * got = __atomic_exchange_n( &this.ptr, oneshot_FULFILLED, __ATOMIC_SEQ_CST); 215 if( got == oneshot_ARMED || got == oneshot_FULFILLED ) return 0p; 215 216 if(do_unpark) unpark( got ); 216 217 return got; … … 223 224 // thread on "any of" [a given set of] futures. 224 225 // does not support multiple threads waiting on the same future 226 enum(struct oneshot *) { future_ARMED = 0p, future_FULFILLED = 1p, future_PROGRESS = 2p, future_ABANDONED = 3p }; 225 227 struct future_t { 226 228 // Internal state : 227 // 0p : is initial state (wait will block)228 // 1p : fulfilled (wait won't block)229 // 2p : in progress ()230 // 3p : abandoned, server should delete229 // armed : initial state, wait will block 230 // fulfilled : result is ready, wait won't block 231 // progress : someone else is in the process of fulfilling this 232 // abandoned : client no longer cares, server should delete 231 233 // any oneshot : a context has been setup to wait, a thread could wait on it 232 234 struct oneshot * volatile ptr; … … 235 237 static inline { 236 238 void ?{}(future_t & this) { 237 this.ptr = 0p;239 this.ptr = future_ARMED; 238 240 } 239 241 … … 242 244 void reset(future_t & this) { 243 245 // needs to be in 0p or 1p 244 __atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);246 __atomic_exchange_n( &this.ptr, future_ARMED, __ATOMIC_SEQ_CST); 245 247 } 246 248 247 249 // check if the future is available 248 250 bool available( future_t & this ) { 249 while( this.ptr == 2p) Pause();250 return this.ptr == 1p;251 while( this.ptr == future_PROGRESS ) Pause(); 252 return this.ptr == future_FULFILLED; 251 253 } 252 254 … … 254 256 // intented to be use by wait, wait_any, waitfor, etc. 
rather than used directly 255 257 bool setup( future_t & this, oneshot & wait_ctx ) { 256 /* paranoid */ verify( wait_ctx.ptr == 0p || wait_ctx.ptr == 1p);258 /* paranoid */ verify( wait_ctx.ptr == oneshot_ARMED || wait_ctx.ptr == oneshot_FULFILLED ); 257 259 // The future needs to set the wait context 258 260 for() { 259 261 struct oneshot * expected = this.ptr; 260 262 // Is the future already fulfilled? 261 if(expected == 1p) return false; // Yes, just return false (didn't block)263 if(expected == future_FULFILLED) return false; // Yes, just return false (didn't block) 262 264 263 265 // The future is not fulfilled, try to setup the wait context … … 277 279 278 280 // attempt to remove the context so it doesn't get consumed. 279 if(__atomic_compare_exchange_n( &this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {281 if(__atomic_compare_exchange_n( &this.ptr, &expected, future_ARMED, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) { 280 282 // we still have the original context, then no one else saw it 281 283 return false; 282 284 } 283 285 284 // expected == 0p: future was never actually setup, just return285 if( expected == 0p) return false;286 287 // expected == 1p: the future is ready and the context was fully consumed286 // expected == ARMED: future was never actually setup, just return 287 if( expected == future_ARMED ) return false; 288 289 // expected == FULFILLED: the future is ready and the context was fully consumed 288 290 // the server won't use the pointer again 289 291 // It is safe to delete (which could happen after the return) 290 if( expected == 1p) return true;291 292 // expected == 2p: the future is ready but the context hasn't fully been consumed292 if( expected == future_FULFILLED ) return true; 293 294 // expected == PROGRESS: the future is ready but the context hasn't fully been consumed 293 295 // spin until it is safe to move on 294 if( expected == 2p) {295 while( this.ptr != 1p) Pause();296 /* paranoid */ verify( this.ptr == 1p);296 if( expected == future_PROGRESS ) { 297 while( this.ptr != future_FULFILLED ) Pause(); 298 /* paranoid */ verify( this.ptr == future_FULFILLED ); 297 299 return true; 298 300 } … … 305 307 // Mark the future as abandoned, meaning it will be deleted by the server 306 308 bool abandon( future_t & this ) { 307 /* paranoid */ verify( this.ptr != 3p);309 /* paranoid */ verify( this.ptr != future_ABANDONED ); 308 310 309 311 // Mark the future as abandonned 310 struct oneshot * got = __atomic_exchange_n( &this.ptr, 3p, __ATOMIC_SEQ_CST);312 struct oneshot * got = __atomic_exchange_n( &this.ptr, future_ABANDONED, __ATOMIC_SEQ_CST); 311 313 312 314 // If the future isn't already fulfilled, let the server delete it 313 if( got == 0p) return false;314 315 // got == 2p: the future is ready but the context hasn't fully been consumed315 if( got == future_ARMED ) return false; 316 317 // got == PROGRESS: the future is ready but the context hasn't fully been consumed 316 318 // spin until it is safe to move on 317 if( got == 2p) {318 while( this.ptr != 1p) Pause();319 got = 1p;319 if( got == future_PROGRESS ) { 320 while( this.ptr != future_FULFILLED ) Pause(); 321 got = future_FULFILLED; 320 322 } 321 323 322 324 // The future is completed delete it now 323 /* paranoid */ verify( this.ptr != 1p);325 /* paranoid */ verify( this.ptr != future_FULFILLED ); 324 326 free( &this ); 325 327 return true; … … 336 338 #pragma GCC diagnostic ignored "-Wfree-nonheap-object" 337 339 #endif 338 if( expected == 3p) { free( &this ); return 0p; 
}340 if( expected == future_ABANDONED ) { free( &this ); return 0p; } 339 341 #if defined(__GNUC__) && __GNUC__ >= 7 340 342 #pragma GCC diagnostic pop 341 343 #endif 342 344 343 /* paranoid */ verify( expected != 1p); // Future is already fulfilled, should not happen344 /* paranoid */ verify( expected != 2p); // Future is bein fulfilled by someone else, this is even less supported then the previous case.345 /* paranoid */ verify( expected != future_FULFILLED ); // Future is already fulfilled, should not happen 346 /* paranoid */ verify( expected != future_PROGRESS ); // Future is bein fulfilled by someone else, this is even less supported then the previous case. 345 347 346 348 // If there is a wait context, we need to consume it and mark it as consumed after 347 349 // If there is no context then we can skip the in progress phase 348 struct oneshot * want = expected == 0p ? 1p : 2p;350 struct oneshot * want = expected == future_ARMED ? future_FULFILLED : future_PROGRESS; 349 351 if(__atomic_compare_exchange_n(&this.ptr, &expected, want, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) { 350 if( expected == 0p) { return 0p; }352 if( expected == future_ARMED ) { return 0p; } 351 353 thread$ * ret = post( *expected, do_unpark ); 352 __atomic_store_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);354 __atomic_store_n( &this.ptr, future_FULFILLED, __ATOMIC_SEQ_CST); 353 355 return ret; 354 356 } … … 366 368 367 369 // Wait for the future to tru 368 while( this.ptr == 2p) Pause();370 while( this.ptr == future_PROGRESS ) Pause(); 369 371 // Make sure the state makes sense 370 372 // Should be fulfilled, could be in progress but it's out of date if so … … 372 374 // and the oneshot should not be needed any more 373 375 __attribute__((unused)) struct oneshot * was = this.ptr; 374 /* paranoid */ verifyf( was == 1p, "Expected this.ptr to be 1p, was %p\n", was );376 /* paranoid */ verifyf( was == future_FULFILLED, "Expected this.ptr to be 1p, was %p\n", was ); 375 377 376 378 // Mark the future as fulfilled, to be consistent -
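The named states introduced above replace the raw 0p/1p/2p/3p sentinels with a typed enumeration. In plain C, the same sentinel-pointer idiom would be spelled roughly as follows; the names are illustrative, not the runtime's.

#include <stdbool.h>

// A few small, never-valid addresses name the special states;
// any other value is a pointer to a waiting context.
struct waiter;                                  // opaque: a real waiting thread/context

#define FUT_ARMED     ((struct waiter *)0)      // wait will block
#define FUT_FULFILLED ((struct waiter *)1)      // result ready, wait returns immediately
#define FUT_PROGRESS  ((struct waiter *)2)      // fulfilment in progress by another thread
#define FUT_ABANDONED ((struct waiter *)3)      // client gave up, server frees the future

struct future {
	struct waiter * volatile ptr;
};

// Is the result available? Spin through the transient PROGRESS state.
static bool future_available( struct future * f ) {
	struct waiter * s;
	while ( (s = __atomic_load_n( &f->ptr, __ATOMIC_SEQ_CST )) == FUT_PROGRESS ) {}
	return s == FUT_FULFILLED;
}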
libcfa/src/concurrency/kernel/private.hfa
rebf8ca5 r23a08aa0 88 88 #elif defined(CFA_HAVE_LINUX_RSEQ_H) 89 89 extern "Cforall" { 90 extern __attribute__((aligned(64))) thread_localvolatile struct rseq __cfaabi_rseq;90 extern __attribute__((aligned(64))) __thread volatile struct rseq __cfaabi_rseq; 91 91 } 92 92 #else … … 161 161 // Blocking acquire 162 162 static inline void __atomic_acquire(volatile bool * ll) { 163 /* paranoid */ verify( ! __preemption_enabled() ); 164 /* paranoid */ verify(ll); 165 163 166 while( __builtin_expect(__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST), false) ) { 164 167 while(__atomic_load_n(ll, (int)__ATOMIC_RELAXED)) … … 166 169 } 167 170 /* paranoid */ verify(*ll); 171 /* paranoid */ verify( ! __preemption_enabled() ); 168 172 } 169 173 170 174 // Non-Blocking acquire 171 175 static inline bool __atomic_try_acquire(volatile bool * ll) { 176 /* paranoid */ verify( ! __preemption_enabled() ); 177 /* paranoid */ verify(ll); 178 172 179 return !__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST); 173 180 } … … 175 182 // Release 176 183 static inline void __atomic_unlock(volatile bool * ll) { 184 /* paranoid */ verify( ! __preemption_enabled() ); 185 /* paranoid */ verify(ll); 177 186 /* paranoid */ verify(*ll); 178 187 __atomic_store_n(ll, (bool)false, __ATOMIC_RELEASE); -
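For reference, the three helpers above form a test-and-test-and-set spinlock over a single bool; a stand-alone version of the same pattern, without the paranoid checks and with conventional acquire/release ordering, is:

#include <stdbool.h>

static void spin_lock( volatile bool * ll ) {
	while ( __atomic_exchange_n( ll, true, __ATOMIC_ACQUIRE ) ) {   // test-and-set
		while ( __atomic_load_n( ll, __ATOMIC_RELAXED ) ) {}        // spin on a plain load first
	}
}

static bool spin_trylock( volatile bool * ll ) {
	return ! __atomic_exchange_n( ll, true, __ATOMIC_ACQUIRE );
}

static void spin_unlock( volatile bool * ll ) {
	__atomic_store_n( ll, false, __ATOMIC_RELEASE );
}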
libcfa/src/concurrency/kernel/startup.cfa
rebf8ca5 r23a08aa0 133 133 //----------------------------------------------------------------------------- 134 134 // Global state 135 thread_localstruct KernelThreadData __cfaabi_tls __attribute__ ((tls_model ( "initial-exec" ))) @= {135 __thread struct KernelThreadData __cfaabi_tls __attribute__ ((tls_model ( "initial-exec" ))) @= { 136 136 NULL, // cannot use 0p 137 137 NULL, … … 153 153 #elif defined(CFA_HAVE_LINUX_RSEQ_H) 154 154 extern "Cforall" { 155 __attribute__((aligned(64))) thread_localvolatile struct rseq __cfaabi_rseq @= {155 __attribute__((aligned(64))) __thread volatile struct rseq __cfaabi_rseq @= { 156 156 .cpu_id : RSEQ_CPU_ID_UNINITIALIZED, 157 157 }; -
libcfa/src/concurrency/preemption.cfa
rebf8ca5 r23a08aa0 238 238 //---------- 239 239 // special case for preemption since used often 240 __attribute__((optimize("no-reorder-blocks"))) bool __preemption_enabled() libcfa_ public {240 __attribute__((optimize("no-reorder-blocks"))) bool __preemption_enabled() libcfa_nopreempt libcfa_public { 241 241 // create a assembler label before 242 242 // marked as clobber all to avoid movement … … 272 272 } 273 273 274 extern "C" { 275 __attribute__((visibility("hidden"))) extern void * const __start_cfatext_nopreempt; 276 __attribute__((visibility("hidden"))) extern void * const __stop_cfatext_nopreempt; 277 278 extern const __cfa_nopreempt_region __libcfa_nopreempt; 279 __attribute__((visibility("protected"))) const __cfa_nopreempt_region __libcfathrd_nopreempt @= { 280 (void * const)&__start_cfatext_nopreempt, 281 (void * const)&__stop_cfatext_nopreempt 282 }; 283 } 284 285 static inline bool __cfaabi_in( void * const ip, const struct __cfa_nopreempt_region & const region ) { 286 return ip >= region.start && ip <= region.stop; 287 } 288 274 289 275 290 //---------- 276 291 // Get data from the TLS block 277 292 // struct asm_region __cfaasm_get; 278 uintptr_t __cfatls_get( unsigned long int offset ) __attribute__((__noinline__, visibility("default"))); //no inline to avoid problems293 uintptr_t __cfatls_get( unsigned long int offset ) libcfa_nopreempt libcfa_public; //no inline to avoid problems 279 294 uintptr_t __cfatls_get( unsigned long int offset ) { 280 295 // create a assembler label before … … 295 310 extern "C" { 296 311 // Disable interrupts by incrementing the counter 297 __attribute__((__noinline__, visibility("default"))) void disable_interrupts()libcfa_public {312 void disable_interrupts() libcfa_nopreempt libcfa_public { 298 313 // create a assembler label before 299 314 // marked as clobber all to avoid movement … … 326 341 // Enable interrupts by decrementing the counter 327 342 // If counter reaches 0, execute any pending __cfactx_switch 328 void enable_interrupts( bool poll ) libcfa_ public {343 void enable_interrupts( bool poll ) libcfa_nopreempt libcfa_public { 329 344 // Cache the processor now since interrupts can start happening after the atomic store 330 345 processor * proc = __cfaabi_tls.this_processor; … … 358 373 } 359 374 } 375 376 // Check whether or not there is pending preemption 377 // force_yield( __POLL_PREEMPTION ) if appropriate 378 // return true if the thread was in an interruptable state 379 // i.e. on a real processor and not in the kernel 380 // (can return true even if no preemption was pending) 381 bool poll_interrupts() libcfa_public { 382 // Cache the processor now since interrupts can start happening after the atomic store 383 processor * proc = publicTLS_get( this_processor ); 384 if ( ! proc ) return false; 385 if ( ! 
__preemption_enabled() ) return false; 386 387 with( __cfaabi_tls.preemption_state ){ 388 // Signal the compiler that a fence is needed but only for signal handlers 389 __atomic_signal_fence(__ATOMIC_RELEASE); 390 if( proc->pending_preemption ) { 391 proc->pending_preemption = false; 392 force_yield( __POLL_PREEMPTION ); 393 } 394 } 395 396 return true; 397 } 360 398 } 361 399 … … 463 501 464 502 //----------------------------------------------------------------------------- 465 // Some assembly required466 #if defined( __i386 )467 #ifdef __PIC__468 #define RELOC_PRELUDE( label ) \469 "calll .Lcfaasm_prelude_" #label "$pb\n\t" \470 ".Lcfaasm_prelude_" #label "$pb:\n\t" \471 "popl %%eax\n\t" \472 ".Lcfaasm_prelude_" #label "_end:\n\t" \473 "addl $_GLOBAL_OFFSET_TABLE_+(.Lcfaasm_prelude_" #label "_end-.Lcfaasm_prelude_" #label "$pb), %%eax\n\t"474 #define RELOC_PREFIX ""475 #define RELOC_SUFFIX "@GOT(%%eax)"476 #else477 #define RELOC_PREFIX "$"478 #define RELOC_SUFFIX ""479 #endif480 #define __cfaasm_label( label ) struct asm_region label = \481 ({ \482 struct asm_region region; \483 asm( \484 RELOC_PRELUDE( label ) \485 "movl " RELOC_PREFIX "__cfaasm_" #label "_before" RELOC_SUFFIX ", %[vb]\n\t" \486 "movl " RELOC_PREFIX "__cfaasm_" #label "_after" RELOC_SUFFIX ", %[va]\n\t" \487 : [vb]"=r"(region.before), [va]"=r"(region.after) \488 ); \489 region; \490 });491 #elif defined( __x86_64 )492 #ifdef __PIC__493 #define RELOC_PREFIX ""494 #define RELOC_SUFFIX "@GOTPCREL(%%rip)"495 #else496 #define RELOC_PREFIX "$"497 #define RELOC_SUFFIX ""498 #endif499 #define __cfaasm_label( label ) struct asm_region label = \500 ({ \501 struct asm_region region; \502 asm( \503 "movq " RELOC_PREFIX "__cfaasm_" #label "_before" RELOC_SUFFIX ", %[vb]\n\t" \504 "movq " RELOC_PREFIX "__cfaasm_" #label "_after" RELOC_SUFFIX ", %[va]\n\t" \505 : [vb]"=r"(region.before), [va]"=r"(region.after) \506 ); \507 region; \508 });509 #elif defined( __aarch64__ )510 #ifdef __PIC__511 // Note that this works only for gcc512 #define __cfaasm_label( label ) struct asm_region label = \513 ({ \514 struct asm_region region; \515 asm( \516 "adrp %[vb], _GLOBAL_OFFSET_TABLE_" "\n\t" \517 "ldr %[vb], [%[vb], #:gotpage_lo15:__cfaasm_" #label "_before]" "\n\t" \518 "adrp %[va], _GLOBAL_OFFSET_TABLE_" "\n\t" \519 "ldr %[va], [%[va], #:gotpage_lo15:__cfaasm_" #label "_after]" "\n\t" \520 : [vb]"=r"(region.before), [va]"=r"(region.after) \521 ); \522 region; \523 });524 #else525 #error this is not the right thing to do526 /*527 #define __cfaasm_label( label ) struct asm_region label = \528 ({ \529 struct asm_region region; \530 asm( \531 "adrp %[vb], __cfaasm_" #label "_before" "\n\t" \532 "add %[vb], %[vb], :lo12:__cfaasm_" #label "_before" "\n\t" \533 "adrp %[va], :got:__cfaasm_" #label "_after" "\n\t" \534 "add %[va], %[va], :lo12:__cfaasm_" #label "_after" "\n\t" \535 : [vb]"=r"(region.before), [va]"=r"(region.after) \536 ); \537 region; \538 });539 */540 #endif541 #else542 #error unknown hardware architecture543 #endif544 545 503 // KERNEL ONLY 546 504 // Check if a __cfactx_switch signal handler shoud defer … … 548 506 // If false : preemption is unsafe and marked as pending 549 507 static inline bool preemption_ready( void * ip ) { 550 // Get all the region for which it is not safe to preempt551 __cfaasm_label( get );552 __cfaasm_label( check );553 __cfaasm_label( dsable );554 // __cfaasm_label( debug );555 556 508 // Check if preemption is safe 557 509 bool ready = true; 558 if( __cfaasm_in( ip, get ) ) { ready = false; goto EXIT; 
}; 559 if( __cfaasm_in( ip, check ) ) { ready = false; goto EXIT; }; 560 if( __cfaasm_in( ip, dsable ) ) { ready = false; goto EXIT; }; 561 // if( __cfaasm_in( ip, debug ) ) { ready = false; goto EXIT; }; 510 if( __cfaabi_in( ip, __libcfa_nopreempt ) ) { ready = false; goto EXIT; }; 511 if( __cfaabi_in( ip, __libcfathrd_nopreempt ) ) { ready = false; goto EXIT; }; 512 562 513 if( !__cfaabi_tls.preemption_state.enabled) { ready = false; goto EXIT; }; 563 514 if( __cfaabi_tls.preemption_state.in_progress ) { ready = false; goto EXIT; }; … … 643 594 // Kernel Signal Handlers 644 595 //============================================================================================= 645 __cfaabi_dbg_debug_do( static thread_localvoid * last_interrupt = 0; )596 __cfaabi_dbg_debug_do( static __thread void * last_interrupt = 0; ) 646 597 647 598 // Context switch signal handler -
libcfa/src/startup.cfa
rebf8ca5 r23a08aa0 41 41 } // __cfaabi_appready_shutdown 42 42 43 void disable_interrupts() __attribute__(( weak )) libcfa_public {} 44 void enable_interrupts() __attribute__(( weak )) libcfa_public {} 43 void disable_interrupts() __attribute__(( weak )) libcfa_nopreempt libcfa_public {} 44 void enable_interrupts() __attribute__(( weak )) libcfa_nopreempt libcfa_public {} 45 bool poll_interrupts() __attribute__(( weak )) libcfa_nopreempt libcfa_public { return false; } 46 47 __attribute__((visibility("hidden"))) extern void * const __start_cfatext_nopreempt; 48 __attribute__((visibility("hidden"))) extern void * const __stop_cfatext_nopreempt; 49 50 __attribute__((visibility("protected"))) const __cfa_nopreempt_region __libcfa_nopreempt @= { 51 (void * const)&__start_cfatext_nopreempt, 52 (void * const)&__stop_cfatext_nopreempt 53 }; 45 54 46 55 -
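The __start_cfatext_nopreempt / __stop_cfatext_nopreempt declarations lean on a GNU linker convention: for any section whose name is a valid C identifier, the linker synthesizes __start_<name> and __stop_<name> symbols bounding it. A self-contained illustration of that convention (the section and function names here are made up, not the ones libcfa uses):

    #include <cstdio>

    // Place a function into a custom, identifier-named text section.
    __attribute__((section("mysection"), used))
    static void probe() {}

    // Provided automatically by GNU ld for sections with C-identifier names.
    extern const char __start_mysection[];
    extern const char __stop_mysection[];

    int main() {
        std::printf( "mysection spans %p..%p\n",
                     (const void *)__start_mysection,
                     (const void *)__stop_mysection );
    }

Functions compiled into the cfatext_nopreempt section (presumably via the libcfa_nopreempt attribute macro used above) therefore fall between the two symbols, which is exactly what the preemption handler checks.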
src/AST/Decl.hpp
rebf8ca5 r23a08aa0 217 217 218 218 /// convenience accessor to match Type::isComplete() 219 bool isComplete() { return sized; }219 bool isComplete() const { return sized; } 220 220 221 221 const Decl * accept( Visitor & v ) const override { return v.visit( this ); } -
src/AST/DeclReplacer.cpp
rebf8ca5 r23a08aa0 9 9 // Author : Aaron B. Moss 10 10 // Created On : Wed May 8 13:00:00 2019 11 // Last Modified By : A aron B. Moss12 // Last Modified On : Wed May 8 13:00:00 201913 // Update Count : 111 // Last Modified By : Andrew Beach 12 // Last Modified On : Thr Sep 15 11:55:00 2022 13 // Update Count : 2 14 14 // 15 15 16 16 #include "DeclReplacer.hpp" 17 17 18 #include "Expr.hpp" 19 #include "Pass.hpp" 18 20 #include "Type.hpp" 19 20 #include "Pass.hpp"21 21 22 22 namespace ast { 23 23 24 24 namespace DeclReplacer { 25 namespace {26 struct DeclReplacer {27 private:28 const DeclMap & declMap;29 const TypeMap & typeMap;30 bool debug;31 25 32 public: 33 DeclReplacer(const DeclMap & declMap, const TypeMap & typeMap, bool debug) 34 : declMap( declMap ), typeMap( typeMap ), debug( debug ) 35 {} 26 namespace { 27 struct DeclReplacer { 28 private: 29 const DeclMap & declMap; 30 const TypeMap & typeMap; 31 bool debug; 36 32 37 const ast::VariableExpr * previsit( const ast::VariableExpr * ); 38 const ast::TypeInstType * previsit( const ast::TypeInstType * ); 39 }; 33 public: 34 DeclReplacer( const DeclMap & declMap, const TypeMap & typeMap, bool debug ) 35 : declMap( declMap ), typeMap( typeMap ), debug( debug ) 36 {} 40 37 41 struct VarExprReplacer { 42 private: 43 const ExprMap & exprMap; 44 45 public: 46 VarExprReplacer(const ExprMap & exprMap): exprMap (exprMap) {} 38 const ast::VariableExpr * previsit( const ast::VariableExpr * ); 39 const ast::TypeInstType * previsit( const ast::TypeInstType * ); 40 }; 47 41 48 const Expr * postvisit (const VariableExpr *); 49 }; 42 struct VarExprReplacer { 43 private: 44 const ExprMap & exprMap; 45 46 public: 47 VarExprReplacer( const ExprMap & exprMap ) : exprMap( exprMap ) {} 48 49 const Expr * postvisit( const VariableExpr * ); 50 }; 51 } // namespace 52 53 const ast::Node * replace( const ast::Node * node, const DeclMap & declMap, const TypeMap & typeMap, bool debug ) { 54 if(!node) return nullptr; 55 Pass<DeclReplacer> replacer = { declMap, typeMap, debug }; 56 return node->accept( replacer ); 57 } 58 59 const ast::Node * replace( const ast::Node * node, const DeclMap & declMap, bool debug ) { 60 TypeMap typeMap; 61 return replace( node, declMap, typeMap, debug ); 62 } 63 64 const ast::Node * replace( const ast::Node * node, const TypeMap & typeMap, bool debug ) { 65 DeclMap declMap; 66 return replace( node, declMap, typeMap, debug ); 67 } 68 69 const ast::Node * replace( const ast::Node * node, const ExprMap & exprMap ) { 70 Pass<VarExprReplacer> replacer = {exprMap}; 71 return node->accept( replacer ); 72 } 73 74 namespace { 75 // replace variable with new node from decl map 76 const ast::VariableExpr * DeclReplacer::previsit( const VariableExpr * varExpr ) { 77 // xxx - assertions and parameters aren't accounted for in this... (i.e. 
they aren't inserted into the map when it's made, only DeclStmts are) 78 if ( !declMap.count( varExpr->var ) ) return varExpr; 79 80 auto replacement = declMap.at( varExpr->var ); 81 if ( debug ) { 82 std::cerr << "replacing variable reference: " 83 << (void*)varExpr->var.get() << " " << varExpr->var 84 << " with " << (void*)replacement << " " << replacement 85 << std::endl; 86 } 87 auto nexpr = mutate(varExpr); 88 nexpr->var = replacement; 89 return nexpr; 50 90 } 51 91 52 const ast::Node * replace( const ast::Node * node, const DeclMap & declMap, const TypeMap & typeMap, bool debug ) { 53 if(!node) return nullptr; 54 Pass<DeclReplacer> replacer = { declMap, typeMap, debug }; 55 return node->accept( replacer ); 92 const TypeInstType * DeclReplacer::previsit( const TypeInstType * inst ) { 93 if ( !typeMap.count( inst->base ) ) return inst; 94 95 auto replacement = typeMap.at( inst->base ); 96 if ( debug ) { 97 std::cerr << "replacing type reference: " 98 << (void*)inst->base.get() << " " << inst->base 99 << " with " << (void*)replacement << " " << replacement 100 << std::endl; 101 } 102 auto ninst = mutate(inst); 103 ninst->base = replacement; 104 return ninst; 56 105 } 57 106 58 const ast::Node * replace( const ast::Node * node, const DeclMap & declMap, bool debug) {59 TypeMap typeMap;60 return replace( node, declMap, typeMap, debug);107 const Expr * VarExprReplacer::postvisit( const VariableExpr * expr ) { 108 if ( !exprMap.count( expr->var ) ) return expr; 109 return exprMap.at( expr->var ); 61 110 } 111 } // namespace 62 112 63 const ast::Node * replace( const ast::Node * node, const TypeMap & typeMap, bool debug ) { 64 DeclMap declMap; 65 return replace( node, declMap, typeMap, debug ); 66 } 113 } // namespace DeclReplacer 67 114 68 const ast::Node * replace( const ast::Node * node, const ExprMap & exprMap) { 69 Pass<VarExprReplacer> replacer = {exprMap}; 70 return node->accept( replacer ); 71 } 72 73 namespace { 74 // replace variable with new node from decl map 75 const ast::VariableExpr * DeclReplacer::previsit( const VariableExpr * varExpr ) { 76 // xxx - assertions and parameters aren't accounted for in this... (i.e. they aren't inserted into the map when it's made, only DeclStmts are) 77 if ( !declMap.count( varExpr->var ) ) return varExpr; 78 79 auto replacement = declMap.at( varExpr->var ); 80 if ( debug ) { 81 std::cerr << "replacing variable reference: " 82 << (void*)varExpr->var.get() << " " << varExpr->var 83 << " with " << (void*)replacement << " " << replacement 84 << std::endl; 85 } 86 auto nexpr = mutate(varExpr); 87 nexpr->var = replacement; 88 return nexpr; 89 } 90 91 const TypeInstType * DeclReplacer::previsit( const TypeInstType * inst ) { 92 if ( !typeMap.count( inst->base ) ) return inst; 93 94 auto replacement = typeMap.at( inst->base ); 95 if ( debug ) { 96 std::cerr << "replacing type reference: " 97 << (void*)inst->base.get() << " " << inst->base 98 << " with " << (void*)replacement << " " << replacement 99 << std::endl; 100 } 101 auto ninst = mutate(inst); 102 ninst->base = replacement; 103 return ninst; 104 } 105 106 const Expr * VarExprReplacer::postvisit( const VariableExpr * expr ) { 107 if (!exprMap.count(expr->var)) return expr; 108 109 return exprMap.at(expr->var); 110 } 111 112 } 113 } 114 115 } 115 } // namespace ast 116 116 117 117 // Local Variables: // -
src/AST/Pass.hpp
rebf8ca5 r23a08aa0 327 327 struct PureVisitor {}; 328 328 329 struct WithCodeLocation { 330 const CodeLocation * location = nullptr; 331 }; 332 329 333 /// Keep track of the polymorphic const TypeSubstitution * typeSubs for the current expression. 330 334 struct WithConstTypeSubstitution { -
src/AST/Pass.impl.hpp
rebf8ca5 r23a08aa0 25 25 #define VISIT_START( node ) \ 26 26 using namespace ast; \ 27 /* back-up the last known code location */ \ 28 __attribute__((unused)) auto loc_guard = ast::__pass::make_location_guard( core, node, 0 ); \ 27 29 /* back-up the visit children */ \ 28 30 __attribute__((unused)) ast::__pass::visit_children_guard guard1( ast::__pass::visit_children(core, 0) ); \ -
src/AST/Pass.proto.hpp
rebf8ca5 r23a08aa0 326 326 } 327 327 328 template< typename core_t, typename node_t > 329 static auto make_location_guard( core_t & core, node_t * node, int ) 330 -> decltype( node->location, ValueGuardPtr<const CodeLocation *>( &core.location ) ) { 331 ValueGuardPtr<const CodeLocation *> guard( &core.location ); 332 core.location = &node->location; 333 return guard; 334 } 335 336 template< typename core_t, typename node_t > 337 static auto make_location_guard( core_t &, node_t *, long ) -> int { 338 return 0; 339 } 340 328 341 // Another feature of the templated visitor is that it calls beginScope()/endScope() for compound statement. 329 342 // All passes which have such functions are assumed desire this behaviour -
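make_location_guard follows the same overload-ranking idiom as the other optional pass hooks in this header: the int overload is only viable when the trailing decltype (which names node->location and core.location) is well-formed, and the long overload is the catch-all that loses the tie-break whenever the first one applies. A stripped-down sketch of the idiom with hypothetical names, for readers unfamiliar with it:

    #include <iostream>

    struct WithFeature    { int data = 0; };
    struct WithoutFeature { };

    // Preferred overload: SFINAE keeps it only when core.data exists,
    // and the exact `int` parameter beats the fallback below.
    template< typename core_t >
    auto use_feature( core_t & core, int ) -> decltype( core.data, void() ) {
        core.data += 1;
        std::cout << "feature enabled\n";
    }

    // Fallback: viable for any core, but the int -> long conversion makes it
    // a worse match, so it is chosen only when the overload above drops out.
    template< typename core_t >
    void use_feature( core_t &, long ) {
        std::cout << "feature skipped\n";
    }

    int main() {
        WithFeature a; WithoutFeature b;
        use_feature( a, 0 );   // feature enabled
        use_feature( b, 0 );   // feature skipped
    }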
src/AST/Print.cpp
rebf8ca5 r23a08aa0 33 33 { 34 34 return array<C,sizeof...(T)>{ 35 forward<T>(values)...35 std::forward<T>(values)... 36 36 }; 37 37 } … … 86 86 87 87 static constexpr auto StorageClasses = make_array<const char*>( 88 "extern", "static", "auto", "register", "_ Thread_local"88 "extern", "static", "auto", "register", "__thread", "_Thread_local" 89 89 ); 90 90 … … 215 215 ++indent; 216 216 ptrToEnum->base->accept( *this ); 217 --indent; 217 --indent; 218 218 } 219 219 … … 1623 1623 // if the wrong size is specified 1624 1624 constexpr array<const char*, 3> Printer::Names::FuncSpecifiers; 1625 constexpr array<const char*, 5> Printer::Names::StorageClasses;1625 constexpr array<const char*, 6> Printer::Names::StorageClasses; 1626 1626 constexpr array<const char*, 6> Printer::Names::Qualifiers; 1627 1627 } -
src/AST/StorageClasses.hpp
rebf8ca5 r23a08aa0 24 24 /// Bitflags for storage classes 25 25 enum { 26 Extern = 1 << 0, 27 Static = 1 << 1, 28 Auto = 1 << 2, 29 Register = 1 << 3, 30 ThreadLocal = 1 << 4, 31 NumClasses = 5 26 Extern = 1 << 0, 27 Static = 1 << 1, 28 Auto = 1 << 2, 29 Register = 1 << 3, 30 ThreadLocalGcc = 1 << 4, 31 ThreadLocalC11 = 1 << 5, 32 NumClasses = 6 32 33 }; 33 34 … … 37 38 unsigned int val; 38 39 struct { 39 bool is_extern : 1; 40 bool is_static : 1; 41 bool is_auto : 1; 42 bool is_register : 1; 43 bool is_threadlocal : 1; 40 bool is_extern : 1; 41 bool is_static : 1; 42 bool is_auto : 1; 43 bool is_register : 1; 44 bool is_threadlocalGcc : 1; 45 bool is_threadlocalC11 : 1; 44 46 }; 45 47 … … 48 50 49 51 constexpr class_flags( unsigned int val = 0 ) : val(val) {} 52 53 bool is_threadlocal_any() { return this->is_threadlocalC11 || this->is_threadlocalGcc; } 50 54 }; 51 55 -
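Splitting the old ThreadLocal flag into ThreadLocalGcc and ThreadLocalC11 keeps the two spellings distinguishable while is_threadlocal_any() still answers the combined question. The class overlays an integer mask and one-bit fields in a union, so flags can be set individually or manipulated as a word; a reduced sketch of that layout and its use (the real class also pulls in the BFCommon helpers, and, like the original, the mask/bit-field punning relies on GCC's union semantics):

    #include <cassert>

    union StorageFlags {                       // simplified stand-in, not the real type
        unsigned int val;                      // all flags as one mask
        struct {
            bool is_extern         : 1;
            bool is_static         : 1;
            bool is_auto           : 1;
            bool is_register       : 1;
            bool is_threadlocalGcc : 1;        // set for __thread
            bool is_threadlocalC11 : 1;        // set for _Thread_local
        };

        StorageFlags( unsigned int v = 0 ) : val( v ) {}
        bool is_threadlocal_any() const { return is_threadlocalGcc || is_threadlocalC11; }
    };

    int main() {
        StorageFlags sc( 1u << 4 );            // the ThreadLocalGcc bit from the enum above
        assert( sc.is_threadlocal_any() && !sc.is_threadlocalC11 );
    }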
src/AST/Type.cpp
rebf8ca5 r23a08aa0 143 143 TraitInstType::TraitInstType( 144 144 const TraitDecl * b, CV::Qualifiers q, std::vector<ptr<Attribute>>&& as ) 145 : BaseInstType( b->name, q, move(as) ), base( b ) {}145 : BaseInstType( b->name, q, std::move(as) ), base( b ) {} 146 146 147 147 // --- TypeInstType … … 149 149 TypeInstType::TypeInstType( const TypeDecl * b, 150 150 CV::Qualifiers q, std::vector<ptr<Attribute>> && as ) 151 : BaseInstType( b->name, q, move(as) ), base( b ), kind( b->kind ) {}151 : BaseInstType( b->name, q, std::move(as) ), base( b ), kind( b->kind ) {} 152 152 153 153 void TypeInstType::set_base( const TypeDecl * b ) { … … 161 161 162 162 TupleType::TupleType( std::vector<ptr<Type>> && ts, CV::Qualifiers q ) 163 : Type( q ), types( move(ts) ), members() {163 : Type( q ), types( std::move(ts) ), members() { 164 164 // This constructor is awkward. `TupleType` needs to contain objects so that members can be 165 165 // named, but members without initializer nodes end up getting constructors, which breaks -
src/AST/Type.hpp
rebf8ca5 r23a08aa0 83 83 template< enum Node::ref_type ref_t > 84 84 void reset_qualifiers( ptr_base< Type, ref_t > & p, CV::Qualifiers q = {} ) { 85 if ( p->qualifiers .val != q.val) p.get_and_mutate()->qualifiers = q;85 if ( p->qualifiers != q ) p.get_and_mutate()->qualifiers = q; 86 86 } 87 87 … … 89 89 template< enum Node::ref_type ref_t > 90 90 void add_qualifiers( ptr_base< Type, ref_t > & p, CV::Qualifiers q ) { 91 if ( ( p->qualifiers .val & q.val ) != q.val) p.get_and_mutate()->qualifiers |= q;91 if ( ( p->qualifiers & q ) != q ) p.get_and_mutate()->qualifiers |= q; 92 92 } 93 93 … … 95 95 template< enum Node::ref_type ref_t > 96 96 void remove_qualifiers( ptr_base< Type, ref_t > & p, CV::Qualifiers q ) { 97 if ( ( p->qualifiers .val & q.val) != 0 ) p.get_and_mutate()->qualifiers -= q;97 if ( ( p->qualifiers & q ) != 0 ) p.get_and_mutate()->qualifiers -= q; 98 98 } 99 99 … … 412 412 std::string typeString() const { return std::string("_") + std::to_string(formal_usage) + "_" + std::to_string(expr_id) + "_" + base->name; } 413 413 bool operator==(const TypeEnvKey & other) const { return base == other.base && formal_usage == other.formal_usage && expr_id == other.expr_id; } 414 415 414 }; 416 415 -
src/CodeGen/CodeGenerator.cc
rebf8ca5 r23a08aa0 493 493 assert( false ); 494 494 } // switch 495 } else if( varExpr->get_var()->get_linkage() == LinkageSpec::BuiltinCFA && varExpr->get_var()->get_name() == "intptr" ) { 496 // THIS is a hack to make it a constant until a proper constexpr solution is created 497 output << "((void*)"; 498 std::list< Expression* >::iterator arg = applicationExpr->get_args().begin(); 499 (*arg++)->accept( *visitor ); 500 output << ")"; 495 501 } else { 496 502 varExpr->accept( *visitor ); -
src/Common/utility.h
rebf8ca5 r23a08aa0 322 322 323 323 ValueGuardPtr(T * inRef) : old( inRef ? *inRef : T() ), ref(inRef) {} 324 ValueGuardPtr(const ValueGuardPtr& other) = delete; 325 ValueGuardPtr(ValueGuardPtr&& other) : old(other.old), ref(other.ref) { other.ref = nullptr; } 324 326 ~ValueGuardPtr() { if( ref ) *ref = old; } 325 327 }; -
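The new move constructor (plus the deleted copy) lets a ValueGuardPtr be handed back from a factory, as make_location_guard above does, without the temporary's destructor restoring the saved value too early; only the final owner performs the restore. A reduced sketch of the save-and-restore guard with the same semantics (names simplified):

    #include <cassert>

    template< typename T >
    struct ValueGuard {
        T old;        // copy of the guarded value at construction
        T * ref;      // the live value; null once this guard has been moved from

        ValueGuard( T * p ) : old( p ? *p : T() ), ref( p ) {}
        ValueGuard( const ValueGuard & ) = delete;
        ValueGuard( ValueGuard && other ) : old( other.old ), ref( other.ref ) {
            other.ref = nullptr;              // the moved-from guard must not restore
        }
        ~ValueGuard() { if ( ref ) *ref = old; }
    };

    int main() {
        int depth = 1;
        {
            ValueGuard<int> g( &depth );
            depth = 42;                       // temporarily overridden
        }                                     // guard restores the saved value here
        assert( depth == 1 );
    }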
src/CompilationState.cc
rebf8ca5 r23a08aa0 31 31 genproto = false, 32 32 deterministic_output = false, 33 useNewAST = CFA_USE_NEW_AST,33 useNewAST = true, 34 34 nomainp = false, 35 35 parsep = false, -
src/Concurrency/Keywords.cc
rebf8ca5 r23a08aa0 508 508 ObjectDecl * vtable_object = Virtual::makeVtableForward( 509 509 "_default_vtable_object_declaration", 510 vtable_decl->makeInst( move( poly_args ) ) );510 vtable_decl->makeInst( std::move( poly_args ) ) ); 511 511 declsToAddBefore.push_back( vtable_object ); 512 512 declsToAddBefore.push_back( … … 681 681 void lock (monitor_t & this) { 682 682 lock(get_monitor(this)); 683 } 683 } 684 684 */ 685 685 FunctionDecl * lock_decl = new FunctionDecl( … … 700 700 CompoundStmt * lock_statement = new CompoundStmt(); 701 701 lock_statement->push_back( 702 new ExprStmt( 702 new ExprStmt( 703 703 new UntypedExpr ( 704 704 new NameExpr( "lock" ), … … 716 716 void unlock (monitor_t & this) { 717 717 unlock(get_monitor(this)); 718 } 718 } 719 719 */ 720 720 FunctionDecl * unlock_decl = new FunctionDecl( … … 736 736 737 737 unlock_statement->push_back( 738 new ExprStmt( 738 new ExprStmt( 739 739 new UntypedExpr( 740 740 new NameExpr( "unlock" ), … … 746 746 ); 747 747 unlock_decl->set_statements( unlock_statement ); 748 748 749 749 // pushes routines to declsToAddAfter to add at a later time 750 750 declsToAddAfter.push_back( lock_decl ); … … 1054 1054 assert( !thread_guard_decl ); 1055 1055 thread_guard_decl = decl; 1056 } 1056 } 1057 1057 else if ( decl->name == "__mutex_stmt_lock_guard" && decl->body ) { 1058 1058 assert( !lock_guard_decl ); … … 1206 1206 new NameExpr( "__get_mutexstmt_lock_type" ), 1207 1207 { args.front()->clone() } 1208 ) 1208 ) 1209 1209 ) 1210 1210 ), … … 1225 1225 1226 1226 StructInstType * lock_guard_struct = new StructInstType( noQualifiers, lock_guard_decl ); 1227 TypeExpr * lock_type_expr = new TypeExpr( 1227 TypeExpr * lock_type_expr = new TypeExpr( 1228 1228 new TypeofType( noQualifiers, new UntypedExpr( 1229 1229 new NameExpr( "__get_mutexstmt_lock_type" ), 1230 1230 { args.front()->clone() } 1231 ) 1232 ) 1231 ) 1232 ) 1233 1233 ); 1234 1234 -
src/Concurrency/Waitfor.cc
rebf8ca5 r23a08aa0 402 402 403 403 clause.target.function = nullptr; 404 clause.target.arguments. empty();404 clause.target.arguments.clear(); 405 405 clause.condition = nullptr; 406 406 } -
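The arguments.empty() to arguments.clear() change is a real bug fix rather than a rename: empty() is a const query that reports whether the container holds elements and removes nothing, so the old call had no effect. A two-line reminder of the difference:

    #include <cassert>
    #include <list>

    int main() {
        std::list<int> args = { 1, 2, 3 };
        bool was_empty = args.empty();   // false: just a query, list unchanged
        args.clear();                    // actually removes the elements
        assert( !was_empty && args.empty() );
    }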
src/Concurrency/WaitforNew.cpp
rebf8ca5 r23a08aa0 101 101 namespace { 102 102 103 class GenerateWaitForCore :103 class GenerateWaitForCore final : 104 104 public ast::WithSymbolTable, public ast::WithConstTranslationUnit { 105 105 const ast::FunctionDecl * decl_waitfor = nullptr; -
src/ControlStruct/ExceptTranslateNew.cpp
rebf8ca5 r23a08aa0 32 32 } 33 33 34 class TranslateThrowsCore : public ast::WithGuards {34 class TranslateThrowsCore final : public ast::WithGuards { 35 35 const ast::ObjectDecl * terminateHandlerExcept; 36 36 enum Context { NoHandler, TerHandler, ResHandler } currentContext; … … 136 136 137 137 138 class TryMutatorCore {138 class TryMutatorCore final { 139 139 // The built in types used in translation. 140 140 const ast::StructDecl * except_decl; -
src/ControlStruct/LabelFixer.cc
rebf8ca5 r23a08aa0 119 119 120 120 // Builds a table that maps a label to its defining statement. 121 std::map<Label, Statement * > * LabelFixer::resolveJumps() throw ( SemanticErrorException ){121 std::map<Label, Statement * > * LabelFixer::resolveJumps() { 122 122 std::map< Label, Statement * > *ret = new std::map< Label, Statement * >(); 123 123 for ( std::map< Label, Entry * >::iterator i = labelTable.begin(); i != labelTable.end(); ++i ) { -
src/ControlStruct/LabelFixer.h
rebf8ca5 r23a08aa0 33 33 LabelFixer( LabelGenerator *gen = 0 ); 34 34 35 std::map < Label, Statement * > *resolveJumps() throw ( SemanticErrorException );35 std::map < Label, Statement * > *resolveJumps(); 36 36 37 37 // Declarations -
src/ControlStruct/MLEMutator.cc
rebf8ca5 r23a08aa0 141 141 142 142 143 Statement *MultiLevelExitMutator::postmutate( BranchStmt *branchStmt ) 144 throw ( SemanticErrorException ) { 143 Statement *MultiLevelExitMutator::postmutate( BranchStmt *branchStmt ) { 145 144 std::string originalTarget = branchStmt->originalTarget; 146 145 -
src/ControlStruct/MLEMutator.h
rebf8ca5 r23a08aa0 41 41 42 42 void premutate( CompoundStmt *cmpndStmt ); 43 Statement * postmutate( BranchStmt *branchStmt ) throw ( SemanticErrorException );43 Statement * postmutate( BranchStmt *branchStmt ); 44 44 void premutate( WhileDoStmt *whileDoStmt ); 45 45 Statement * postmutate( WhileDoStmt *whileDoStmt ); -
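Dropping the throw ( SemanticErrorException ) specifications here and in LabelFixer (and on the SynTree statement constructors further down) goes hand in hand with the -std=c++17 bump in src/Makefile.am: dynamic exception specifications were deprecated in C++11 and removed outright in C++17, leaving noexcept as the only exception specification. For reference:

    struct E {};

    // void f() throw ( E );   // ill-formed under -std=c++17
    void g() noexcept;         // still valid: the one remaining specification
    void h();                  // unannotated functions may throw, as before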
src/GenPoly/GenPoly.cc
rebf8ca5 r23a08aa0 9 9 // Author : Richard C. Bilson 10 10 // Created On : Mon May 18 07:44:20 2015 11 // Last Modified By : Peter A. Buhr12 // Last Modified On : Wed Jun 29 21:45:53 201613 // Update Count : 1 411 // Last Modified By : Andrew Beach 12 // Last Modified On : Wed Sep 14 9:24:00 2022 13 // Update Count : 15 14 14 // 15 15 … … 83 83 } 84 84 85 bool hasDynParams( const std::vector<ast::ptr<ast::Expr>> & params, const TyVarMap &tyVars, const ast::TypeSubstitution *typeSubs ) { 86 for ( ast::ptr<ast::Expr> const & param : params ) { 87 auto paramType = param.as<ast::TypeExpr>(); 88 assertf( paramType, "Aggregate parameters should be type expressions." ); 89 if ( isDynType( paramType->type, tyVars, typeSubs ) ) { 90 return true; 91 } 92 } 93 return false; 94 } 95 85 96 /// Checks a parameter list for inclusion of polymorphic parameters; will substitute according to env if present 86 97 bool includesPolyParams( std::list< Expression* >& params, const TypeSubstitution *env ) { … … 198 209 } 199 210 return 0; 211 } 212 213 const ast::BaseInstType *isDynType( const ast::Type *type, const TyVarMap &tyVars, const ast::TypeSubstitution *typeSubs ) { 214 type = replaceTypeInst( type, typeSubs ); 215 216 if ( auto inst = dynamic_cast<ast::TypeInstType const *>( type ) ) { 217 auto var = tyVars.find( inst->name ); 218 if ( var != tyVars.end() && var->second.isComplete ) { 219 return inst; 220 } 221 } else if ( auto inst = dynamic_cast<ast::StructInstType const *>( type ) ) { 222 if ( hasDynParams( inst->params, tyVars, typeSubs ) ) { 223 return inst; 224 } 225 } else if ( auto inst = dynamic_cast<ast::UnionInstType const *>( type ) ) { 226 if ( hasDynParams( inst->params, tyVars, typeSubs ) ) { 227 return inst; 228 } 229 } 230 return nullptr; 200 231 } 201 232 … … 378 409 inline D* as( B* p ) { return reinterpret_cast<D*>(p); } 379 410 411 template<typename D, typename B> 412 inline D const * as( B const * p ) { 413 return reinterpret_cast<D const *>( p ); 414 } 415 380 416 /// Flattens a declaration list 381 417 template<typename Output> … … 391 427 for ( Type* ty : src ) { 392 428 ResolvExpr::flatten( ty, out ); 429 } 430 } 431 432 void flattenList( vector<ast::ptr<ast::Type>> const & src, 433 vector<ast::ptr<ast::Type>> & out ) { 434 for ( auto const & type : src ) { 435 ResolvExpr::flatten( type, out ); 393 436 } 394 437 } … … 409 452 // if ( is<VoidType>( aparam->get_type() ) || is<VoidType>( bparam->get_type() ) ) continue; 410 453 if ( ! typesPolyCompatible( aparam->get_type(), bparam->get_type() ) ) return false; 454 } 455 456 return true; 457 } 458 459 bool paramListsPolyCompatible( 460 std::vector<ast::ptr<ast::Expr>> const & lparams, 461 std::vector<ast::ptr<ast::Expr>> const & rparams ) { 462 if ( lparams.size() != rparams.size() ) { 463 return false; 464 } 465 466 for ( auto lparam = lparams.begin(), rparam = rparams.begin() ; 467 lparam != lparams.end() ; ++lparam, ++rparam ) { 468 ast::TypeExpr const * lexpr = lparam->as<ast::TypeExpr>(); 469 assertf( lexpr, "Aggregate parameters should be type expressions" ); 470 ast::TypeExpr const * rexpr = rparam->as<ast::TypeExpr>(); 471 assertf( rexpr, "Aggregate parameters should be type expressions" ); 472 473 // xxx - might need to let VoidType be a wildcard here too; could have some voids 474 // stuffed in for dtype-statics. 
475 // if ( is<VoidType>( lexpr->type() ) || is<VoidType>( bparam->get_type() ) ) continue; 476 if ( !typesPolyCompatible( lexpr->type, rexpr->type ) ) { 477 return false; 478 } 411 479 } 412 480 … … 505 573 } 506 574 575 bool typesPolyCompatible( ast::Type const * lhs, ast::Type const * rhs ) { 576 type_index const lid = typeid(*lhs); 577 578 // Polymorphic types always match: 579 if ( type_index(typeid(ast::TypeInstType)) == lid ) return true; 580 581 type_index const rid = typeid(*rhs); 582 if ( type_index(typeid(ast::TypeInstType)) == rid ) return true; 583 584 // All other types only match if they are the same type: 585 if ( lid != rid ) return false; 586 587 // So remaining types can be examined case by case. 588 // Recurse through type structure (conditions borrowed from Unify.cc). 589 590 if ( type_index(typeid(ast::BasicType)) == lid ) { 591 return as<ast::BasicType>(lhs)->kind == as<ast::BasicType>(rhs)->kind; 592 } else if ( type_index(typeid(ast::PointerType)) == lid ) { 593 ast::PointerType const * l = as<ast::PointerType>(lhs); 594 ast::PointerType const * r = as<ast::PointerType>(rhs); 595 596 // void pointers should match any other pointer type. 597 return is<ast::VoidType>( l->base.get() ) 598 || is<ast::VoidType>( r->base.get() ) 599 || typesPolyCompatible( l->base.get(), r->base.get() ); 600 } else if ( type_index(typeid(ast::ReferenceType)) == lid ) { 601 ast::ReferenceType const * l = as<ast::ReferenceType>(lhs); 602 ast::ReferenceType const * r = as<ast::ReferenceType>(rhs); 603 604 // void references should match any other reference type. 605 return is<ast::VoidType>( l->base.get() ) 606 || is<ast::VoidType>( r->base.get() ) 607 || typesPolyCompatible( l->base.get(), r->base.get() ); 608 } else if ( type_index(typeid(ast::ArrayType)) == lid ) { 609 ast::ArrayType const * l = as<ast::ArrayType>(lhs); 610 ast::ArrayType const * r = as<ast::ArrayType>(rhs); 611 612 if ( l->isVarLen ) { 613 if ( !r->isVarLen ) return false; 614 } else { 615 if ( r->isVarLen ) return false; 616 617 auto lc = l->dimension.as<ast::ConstantExpr>(); 618 auto rc = r->dimension.as<ast::ConstantExpr>(); 619 if ( lc && rc && lc->intValue() != rc->intValue() ) { 620 return false; 621 } 622 } 623 624 return typesPolyCompatible( l->base.get(), r->base.get() ); 625 } else if ( type_index(typeid(ast::FunctionType)) == lid ) { 626 ast::FunctionType const * l = as<ast::FunctionType>(lhs); 627 ast::FunctionType const * r = as<ast::FunctionType>(rhs); 628 629 std::vector<ast::ptr<ast::Type>> lparams, rparams; 630 flattenList( l->params, lparams ); 631 flattenList( r->params, rparams ); 632 if ( lparams.size() != rparams.size() ) return false; 633 for ( unsigned i = 0; i < lparams.size(); ++i ) { 634 if ( !typesPolyCompatible( lparams[i], rparams[i] ) ) return false; 635 } 636 637 std::vector<ast::ptr<ast::Type>> lrets, rrets; 638 flattenList( l->returns, lrets ); 639 flattenList( r->returns, rrets ); 640 if ( lrets.size() != rrets.size() ) return false; 641 for ( unsigned i = 0; i < lrets.size(); ++i ) { 642 if ( !typesPolyCompatible( lrets[i], rrets[i] ) ) return false; 643 } 644 return true; 645 } else if ( type_index(typeid(ast::StructInstType)) == lid ) { 646 ast::StructInstType const * l = as<ast::StructInstType>(lhs); 647 ast::StructInstType const * r = as<ast::StructInstType>(rhs); 648 649 if ( l->name != r->name ) return false; 650 return paramListsPolyCompatible( l->params, r->params ); 651 } else if ( type_index(typeid(ast::UnionInstType)) == lid ) { 652 ast::UnionInstType const * l = 
as<ast::UnionInstType>(lhs); 653 ast::UnionInstType const * r = as<ast::UnionInstType>(rhs); 654 655 if ( l->name != r->name ) return false; 656 return paramListsPolyCompatible( l->params, r->params ); 657 } else if ( type_index(typeid(ast::EnumInstType)) == lid ) { 658 ast::EnumInstType const * l = as<ast::EnumInstType>(lhs); 659 ast::EnumInstType const * r = as<ast::EnumInstType>(rhs); 660 661 return l->name == r->name; 662 } else if ( type_index(typeid(ast::TraitInstType)) == lid ) { 663 ast::TraitInstType const * l = as<ast::TraitInstType>(lhs); 664 ast::TraitInstType const * r = as<ast::TraitInstType>(rhs); 665 666 return l->name == r->name; 667 } else if ( type_index(typeid(ast::TupleType)) == lid ) { 668 ast::TupleType const * l = as<ast::TupleType>(lhs); 669 ast::TupleType const * r = as<ast::TupleType>(rhs); 670 671 std::vector<ast::ptr<ast::Type>> ltypes, rtypes; 672 flattenList( l->types, ( ltypes ) ); 673 flattenList( r->types, ( rtypes ) ); 674 if ( ltypes.size() != rtypes.size() ) return false; 675 676 for ( unsigned i = 0 ; i < ltypes.size() ; ++i ) { 677 if ( !typesPolyCompatible( ltypes[i], rtypes[i] ) ) return false; 678 } 679 return true; 680 // The remaining types (VoidType, VarArgsType, ZeroType & OneType) 681 // have no variation so will always be equal. 682 } else { 683 return true; 684 } 685 } 686 507 687 namespace { 508 688 // temporary hack to avoid re-implementing anything related to TyVarMap -
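The new typesPolyCompatible overload keys its structural comparison on std::type_index(typeid(*lhs)) and only then downcasts with the local as<> helper, mirroring the SynTree version it sits beside. For readers unfamiliar with the idiom, a self-contained sketch of dispatch-on-dynamic-type followed by a per-case structural check (toy classes, nothing to do with the AST):

    #include <iostream>
    #include <typeindex>
    #include <typeinfo>

    struct Shape  { virtual ~Shape() = default; };
    struct Circle : Shape { double r; Circle( double r ) : r(r) {} };
    struct Square : Shape { double s; Square( double s ) : s(s) {} };

    // Two shapes only compare further if their dynamic types agree.
    bool same_shape( const Shape & a, const Shape & b ) {
        if ( std::type_index( typeid(a) ) != std::type_index( typeid(b) ) ) return false;
        if ( typeid(a) == typeid(Circle) )
            return static_cast<const Circle &>(a).r == static_cast<const Circle &>(b).r;
        return static_cast<const Square &>(a).s == static_cast<const Square &>(b).s;
    }

    int main() {
        Circle c1( 1 ), c2( 1 ); Square s( 1 );
        std::cout << same_shape( c1, c2 ) << same_shape( c1, s ) << '\n';   // prints 10
    }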
src/GenPoly/GenPoly.h
rebf8ca5 r23a08aa0 9 9 // Author : Richard C. Bilson 10 10 // Created On : Mon May 18 07:44:20 2015 11 // Last Modified By : Peter A. Buhr12 // Last Modified On : Sat Jul 22 09:22:57 201713 // Update Count : 711 // Last Modified By : Andrew Beach 12 // Last Modified On : Fri Aug 19 16:03:00 2022 13 // Update Count : 8 14 14 // 15 15 … … 27 27 namespace GenPoly { 28 28 29 // TODO Via some tricks this works for ast::TypeDecl::Data as well. 29 30 typedef ErasableScopedMap< std::string, TypeDecl::Data > TyVarMap; 31 30 32 /// Replaces a TypeInstType by its referrent in the environment, if applicable 31 33 Type* replaceTypeInst( Type* type, const TypeSubstitution* env ); … … 41 43 /// returns dynamic-layout type if is dynamic-layout type in tyVars, NULL otherwise; will look up substitution in env if provided 42 44 ReferenceToType *isDynType( Type *type, const TyVarMap &tyVars, const TypeSubstitution *env = 0 ); 45 const ast::BaseInstType *isDynType( const ast::Type *type, const TyVarMap &tyVars, const ast::TypeSubstitution *typeSubs = 0 ); 43 46 44 47 /// true iff function has dynamic-layout return type under the given type variable map … … 83 86 /// true iff types are structurally identical, where TypeInstType's match any type. 84 87 bool typesPolyCompatible( Type *aty, Type *bty ); 88 bool typesPolyCompatible( ast::Type const * lhs, ast::Type const * rhs ); 85 89 86 90 /// true if arg requires boxing given exprTyVars -
src/GenPoly/InstantiateGeneric.h
rebf8ca5 r23a08aa0 19 19 20 20 class Declaration; 21 namespace ast { 22 class TranslationUnit; 23 } 21 24 22 25 namespace GenPoly { 23 /// Replaces all generic types that have static layout with concrete instantiations. 24 /// Types with concrete values for otype parameters will be template-expanded, while 25 /// dtype and ftype parameters will be replaced by the appropriate void type. 26 void instantiateGeneric( std::list< Declaration* > &translationUnit ); 26 /// Replaces all generic types that have static layout with concrete 27 /// instantiations. Types with concrete values for otype parameters will be 28 /// template-expanded, while dtype and ftype parameters will be replaced by 29 /// the appropriate void type. 30 void instantiateGeneric( std::list< Declaration* > &translationUnit ); 31 void instantiateGeneric( ast::TranslationUnit & translationUnit ); 27 32 } // namespace GenPoly 28 33 -
src/GenPoly/Lvalue2.cc
rebf8ca5 r23a08aa0 23 23 } 24 24 25 26 25 } -
src/GenPoly/ScrubTyVars.cc
rebf8ca5 r23a08aa0 9 9 // Author : Richard C. Bilson 10 10 // Created On : Mon May 18 07:44:20 2015 11 // Last Modified By : Peter A. Buhr12 // Last Modified On : Thu Mar 16 15:44:27 201713 // Update Count : 311 // Last Modified By : Andrew Beach 12 // Last Modified On : Fri Aug 19 16:10:00 2022 13 // Update Count : 4 14 14 // 15 15 16 16 #include <utility> // for pair 17 17 18 #include "AST/Pass.hpp" 18 19 #include "GenPoly.h" // for mangleType, TyVarMap, alignof... 19 20 #include "GenPoly/ErasableScopedMap.h" // for ErasableScopedMap<>::const_it... 20 21 #include "ScrubTyVars.h" 22 #include "SymTab/Mangler.h" // for mangle, typeMode 21 23 #include "SynTree/Declaration.h" // for TypeDecl, TypeDecl::Data, Typ... 22 24 #include "SynTree/Expression.h" // for Expression (ptr only), NameExpr … … 112 114 return pointer; 113 115 } 116 117 namespace { 118 119 enum class ScrubMode { 120 FromMap, 121 DynamicFromMap, 122 All, 123 }; 124 125 struct ScrubTypeVars : 126 public ast::WithGuards, 127 public ast::WithShortCircuiting, 128 public ast::WithVisitorRef<ScrubTypeVars> { 129 130 ScrubTypeVars( ScrubMode m, TyVarMap const * tv ) : 131 mode ( m ), typeVars( tv ) {} 132 133 void previsit( ast::TypeInstType const * ) { visit_children = false; } 134 void previsit( ast::StructInstType const * ) { visit_children = false; } 135 void previsit( ast::UnionInstType const * ) { visit_children = false; } 136 void previsit( ast::SizeofExpr const * expr ) { primeBaseScrub( expr->type ); } 137 void previsit( ast::AlignofExpr const * expr ) { primeBaseScrub( expr->type ); } 138 void previsit( ast::PointerType const * type ) { primeBaseScrub( type->base ); } 139 140 ast::Type const * postvisit( ast::TypeInstType const * type ); 141 ast::Type const * postvisit( ast::StructInstType const * type ); 142 ast::Type const * postvisit( ast::UnionInstType const * type ); 143 ast::Expr const * postvisit( ast::SizeofExpr const * expr ); 144 ast::Expr const * postvisit( ast::AlignofExpr const * expr ); 145 ast::Type const * postvisit( ast::PointerType const * type ); 146 147 private: 148 ScrubMode const mode; 149 /// Type varriables to scrub. 150 TyVarMap const * const typeVars; 151 /// Value cached by primeBaseScrub. 152 ast::Type const * dynType = nullptr; 153 154 /// Returns the type if it should be scrubbed, nullptr otherwise. 155 ast::Type const * shouldScrub( ast::Type const * type ) { 156 switch ( mode ) { 157 case ScrubMode::FromMap: 158 return isPolyType( type, *typeVars ); 159 case ScrubMode::DynamicFromMap: 160 return isDynType( type, *typeVars ); 161 case ScrubMode::All: 162 return isPolyType( type ); 163 default: 164 assertf( false, "Invalid ScrubMode in shouldScrub." ); 165 throw; 166 } 167 } 168 169 void primeBaseScrub( ast::Type const * type ) { 170 // Need to determine whether type needs to be scrubbed to 171 // determine whether automatic recursion is necessary. 172 if ( ast::Type const * t = shouldScrub( type ) ) { 173 visit_children = false; 174 GuardValue( dynType ) = t; 175 } 176 } 177 178 ast::Type const * postvisitAggregateType( 179 ast::BaseInstType const * type ) { 180 if ( !shouldScrub( type ) ) return type; 181 return new ast::PointerType( new ast::VoidType( type->qualifiers ) ); 182 } 183 }; 184 185 ast::Type const * ScrubTypeVars::postvisit( ast::TypeInstType const * type ) { 186 // This implies that mode == ScrubMode::All. 
187 if ( !typeVars ) { 188 if ( ast::TypeDecl::Ftype == type->kind ) { 189 return new ast::PointerType( 190 new ast::FunctionType( ast::FixedArgs ) ); 191 } else { 192 return new ast::PointerType( 193 new ast::VoidType( type->qualifiers ) ); 194 } 195 } 196 197 auto typeVar = typeVars->find( type->name ); 198 if ( typeVar == typeVars->end() ) { 199 return type; 200 } 201 202 switch ( typeVar->second.kind ) { 203 case ast::TypeDecl::Dtype: 204 case ast::TypeDecl::Ttype: 205 return new ast::PointerType( 206 new ast::VoidType( type->qualifiers ) ); 207 case ast::TypeDecl::Ftype: 208 return new ast::PointerType( 209 new ast::FunctionType( ast::VariableArgs ) ); 210 default: 211 assertf( false, 212 "Unhandled type variable kind: %d", typeVar->second.kind ); 213 throw; // Just in case the assert is removed, stop here. 214 } 215 } 216 217 ast::Type const * ScrubTypeVars::postvisit( ast::StructInstType const * type ) { 218 return postvisitAggregateType( type ); 219 } 220 221 ast::Type const * ScrubTypeVars::postvisit( ast::UnionInstType const * type ) { 222 return postvisitAggregateType( type ); 223 } 224 225 ast::Expr const * ScrubTypeVars::postvisit( ast::SizeofExpr const * expr ) { 226 // sizeof( T ) becomes the _sizeof_T parameter. 227 if ( dynType ) { 228 return new ast::NameExpr( expr->location, 229 sizeofName( Mangle::mangle( dynType, Mangle::typeMode() ) ) ); 230 } else { 231 return expr; 232 } 233 } 234 235 ast::Expr const * ScrubTypeVars::postvisit( ast::AlignofExpr const * expr ) { 236 // alignof( T ) becomes the _alignof_T parameter. 237 if ( dynType ) { 238 return new ast::NameExpr( expr->location, 239 alignofName( Mangle::mangle( dynType, Mangle::typeMode() ) ) ); 240 } else { 241 return expr; 242 } 243 } 244 245 ast::Type const * ScrubTypeVars::postvisit( ast::PointerType const * type ) { 246 if ( dynType ) { 247 ast::Type * ret = ast::mutate( dynType->accept( *visitor ) ); 248 ret->qualifiers |= type->qualifiers; 249 return ret; 250 } else { 251 return type; 252 } 253 } 254 255 const ast::Node * scrubTypeVarsBase( 256 const ast::Node * target, 257 ScrubMode mode, const TyVarMap * typeVars ) { 258 if ( ScrubMode::All == mode ) { 259 assert( nullptr == typeVars ); 260 } else { 261 assert( nullptr != typeVars ); 262 } 263 ast::Pass<ScrubTypeVars> visitor( mode, typeVars ); 264 return target->accept( visitor ); 265 } 266 267 } // namespace 268 269 template<> 270 ast::Node const * scrubAllTypeVars<ast::Node>( const ast::Node * target ) { 271 return scrubTypeVarsBase( target, ScrubMode::All, nullptr ); 272 } 273 114 274 } // namespace GenPoly 115 275 -
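The effect of the new-AST ScrubTypeVars pass is easiest to picture at the generated-C level: uses of a polymorphic T collapse to void pointers, and sizeof( T ) / alignof( T ) become references to the mangled _sizeof_... / _alignof_... parameters that the box pass threads through the call. A hand-written approximation of that shape, using illustrative parameter names rather than the compiler's mangled ones:

    #include <string.h>

    // Roughly what a polymorphic `forall(T) void copy( T * dst, T * src )`
    // lowers to once its type variable has been scrubbed:
    void copy_poly( size_t _sizeof_T, void * dst, const void * src ) {
        memcpy( dst, src, _sizeof_T );   // sizeof( T ) became a parameter
    }

    int main() {
        int a = 7, b = 0;
        copy_poly( sizeof(int), &b, &a );   // the caller supplies T's layout
        return b == 7 ? 0 : 1;
    }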
src/GenPoly/ScrubTyVars.h
rebf8ca5 r23a08aa0 9 9 // Author : Richard C. Bilson 10 10 // Created On : Mon May 18 07:44:20 2015 11 // Last Modified By : Peter A. Buhr12 // Last Modified On : Sat Jul 22 09:21:47 201713 // Update Count : 211 // Last Modified By : Andrew Beach 12 // Last Modified On : Fri Aug 19 14:14:00 2022 13 // Update Count : 3 14 14 // 15 15 … … 18 18 #include <cassert> // for assert 19 19 20 #include "AST/Fwd.hpp" // for Node 20 21 #include "Common/PassVisitor.h" 21 22 #include "GenPoly.h" // for TyVarMap, isPolyType, isDynType … … 108 109 } 109 110 111 /// For all polymorphic types, replaces generic types, with the appropriate 112 /// void type, and sizeof/alignof expressions with the proper variable. 113 template<typename node_t> 114 node_t const * scrubAllTypeVars( node_t const * target ) { 115 return strict_dynamic_cast<node_t const *>( scrubAllTypeVars<ast::Node>( target ) ); 116 } 117 118 template<> 119 ast::Node const * scrubAllTypeVars<ast::Node>( const ast::Node * target ); 120 110 121 } // namespace GenPoly 111 122 -
src/GenPoly/SpecializeNew.cpp
rebf8ca5 r23a08aa0 240 240 } 241 241 242 namespace { 243 struct TypeInstFixer : public ast::WithShortCircuiting { 244 std::map<const ast::TypeDecl *, std::pair<int, int>> typeMap; 245 246 void previsit(const ast::TypeDecl *) { visit_children = false; } 247 const ast::TypeInstType * postvisit(const ast::TypeInstType * typeInst) { 248 if (typeMap.count(typeInst->base)) { 249 ast::TypeInstType * newInst = mutate(typeInst); 250 auto const & pair = typeMap[typeInst->base]; 251 newInst->expr_id = pair.first; 252 newInst->formal_usage = pair.second; 253 return newInst; 254 } 255 return typeInst; 256 } 257 }; 258 } 242 struct TypeInstFixer final : public ast::WithShortCircuiting { 243 std::map<const ast::TypeDecl *, std::pair<int, int>> typeMap; 244 245 void previsit(const ast::TypeDecl *) { visit_children = false; } 246 const ast::TypeInstType * postvisit(const ast::TypeInstType * typeInst) { 247 if (typeMap.count(typeInst->base)) { 248 ast::TypeInstType * newInst = mutate(typeInst); 249 auto const & pair = typeMap[typeInst->base]; 250 newInst->expr_id = pair.first; 251 newInst->formal_usage = pair.second; 252 return newInst; 253 } 254 return typeInst; 255 } 256 }; 259 257 260 258 const ast::Expr * SpecializeCore::createThunkFunction( -
src/GenPoly/module.mk
rebf8ca5 r23a08aa0 27 27 GenPoly/FindFunction.cc \ 28 28 GenPoly/FindFunction.h \ 29 GenPoly/InstantiateGenericNew.cpp \ 29 30 GenPoly/InstantiateGeneric.cc \ 30 31 GenPoly/InstantiateGeneric.h \ -
src/InitTweak/InitTweak.cc
rebf8ca5 r23a08aa0 1241 1241 static const char * const tlsd_section = ".tdata" ASM_COMMENT; 1242 1242 void addDataSectionAttribute( ObjectDecl * objDecl ) { 1243 const bool is_tls = objDecl->get_storageClasses().is_threadlocal ;1243 const bool is_tls = objDecl->get_storageClasses().is_threadlocal_any(); 1244 1244 const char * section = is_tls ? tlsd_section : data_section; 1245 1245 objDecl->attributes.push_back(new Attribute("section", { … … 1249 1249 1250 1250 void addDataSectionAttribute( ast::ObjectDecl * objDecl ) { 1251 const bool is_tls = objDecl->storage.is_threadlocal ;1251 const bool is_tls = objDecl->storage.is_threadlocal_any(); 1252 1252 const char * section = is_tls ? tlsd_section : data_section; 1253 1253 objDecl->attributes.push_back(new ast::Attribute("section", { -
src/Makefile.am
rebf8ca5 r23a08aa0 71 71 EXTRA_DIST = include/cassert include/optional BasicTypes-gen.cc 72 72 73 AM_CXXFLAGS = @HOST_FLAGS@ -Wno-deprecated -Wall -Wextra -Werror=return-type -DDEBUG_ALL -I./Parser -I$(srcdir)/Parser -I$(srcdir)/include -DYY_NO_INPUT -O3 -g -std=c++1 4$(TCMALLOCFLAG)73 AM_CXXFLAGS = @HOST_FLAGS@ -Wno-deprecated -Wall -Wextra -Werror=return-type -DDEBUG_ALL -I./Parser -I$(srcdir)/Parser -I$(srcdir)/include -DYY_NO_INPUT -O3 -g -std=c++17 $(TCMALLOCFLAG) 74 74 AM_LDFLAGS = @HOST_FLAGS@ -Xlinker -export-dynamic 75 75 ARFLAGS = cr -
src/Parser/DeclarationNode.cc
rebf8ca5 r23a08aa0 262 262 newnode->type->enumeration.anon = name == nullptr; 263 263 if ( base && base->type) { 264 newnode->type->base = base->type; 264 newnode->type->base = base->type; 265 265 } // if 266 266 … … 505 505 } // for 506 506 // src is the new item being added and has a single bit 507 } else if ( ! src->storageClasses.is_threadlocal ) { // conflict ?507 } else if ( ! src->storageClasses.is_threadlocal_any() ) { // conflict ? 508 508 appendError( error, string( "conflicting " ) + Type::StorageClassesNames[storageClasses.ffs()] + 509 509 " & " + Type::StorageClassesNames[src->storageClasses.ffs()] ); -
src/Parser/lex.ll
rebf8ca5 r23a08aa0 10 10 * Created On : Sat Sep 22 08:58:10 2001 11 11 * Last Modified By : Peter A. Buhr 12 * Last Modified On : Sun Jun 20 18:41:09 202113 * Update Count : 7 5912 * Last Modified On : Tue Aug 30 18:39:54 2022 13 * Update Count : 760 14 14 */ 15 15 … … 314 314 switch { KEYWORD_RETURN(SWITCH); } 315 315 thread { KEYWORD_RETURN(THREAD); } // C11 316 _Thread_local { KEYWORD_RETURN(THREADLOCAL); } // C11 316 __thread { KEYWORD_RETURN(THREADLOCALGCC); } // GCC 317 _Thread_local { KEYWORD_RETURN(THREADLOCALC11); } // C11 317 318 throw { KEYWORD_RETURN(THROW); } // CFA 318 319 throwResume { KEYWORD_RETURN(THROWRESUME); } // CFA -
src/Parser/parser.yy
rebf8ca5 r23a08aa0 58 58 59 59 // lex uses __null in a boolean context, it's fine. 60 //#pragma GCC diagnostic ignored "-Wparentheses-equality" 60 #pragma GCC diagnostic ignored "-Wpragmas" 61 #pragma GCC diagnostic ignored "-Wparentheses-equality" 62 #pragma GCC diagnostic warning "-Wpragmas" 61 63 62 64 extern DeclarationNode * parseTree; … … 293 295 %token TYPEDEF 294 296 %token EXTERN STATIC AUTO REGISTER 295 %token THREADLOCAL //C11297 %token THREADLOCALGCC THREADLOCALC11 // GCC, C11 296 298 %token INLINE FORTRAN // C99, extension ISO/IEC 9899:1999 Section J.5.9(1) 297 299 %token NORETURN // C11 … … 1345 1347 { 1346 1348 if ( $2 == OperKinds::LThan || $2 == OperKinds::LEThan ) { SemanticError( yylloc, MISSING_ANON_FIELD ); $$ = nullptr; } 1347 else { SemanticError( yylloc, MISSING_HIGH ); $$ = nullptr; } 1349 else { SemanticError( yylloc, MISSING_HIGH ); $$ = nullptr; } 1348 1350 } 1349 1351 | comma_expression updowneq comma_expression '~' comma_expression // CFA, anonymous loop-index … … 1357 1359 { 1358 1360 if ( $2 == OperKinds::LThan || $2 == OperKinds::LEThan ) { SemanticError( yylloc, MISSING_ANON_FIELD ); $$ = nullptr; } 1359 else { SemanticError( yylloc, MISSING_HIGH ); $$ = nullptr; } 1361 else { SemanticError( yylloc, MISSING_HIGH ); $$ = nullptr; } 1360 1362 } 1361 1363 | comma_expression updowneq comma_expression '~' '@' // CFA, error … … 2082 2084 | REGISTER 2083 2085 { $$ = DeclarationNode::newStorageClass( Type::Register ); } 2084 | THREADLOCAL // C11 2085 { $$ = DeclarationNode::newStorageClass( Type::Threadlocal ); } 2086 | THREADLOCALGCC // GCC 2087 { $$ = DeclarationNode::newStorageClass( Type::ThreadlocalGcc ); } 2088 | THREADLOCALC11 // C11 2089 { $$ = DeclarationNode::newStorageClass( Type::ThreadlocalC11 ); } 2086 2090 // Put function specifiers here to simplify parsing rules, but separate them semantically. 2087 2091 | INLINE // C99 -
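With the lexer and grammar now telling the two spellings apart, __thread and _Thread_local land in the separate ThreadlocalGcc / ThreadlocalC11 storage-class bits (see the StorageClasses changes above) instead of one shared flag, presumably so later passes and the code generator can reproduce whichever spelling the source used. The surface forms in question, for reference:

    // GNU spelling: accepted by gcc and g++, now tokenized as THREADLOCALGCC.
    __thread int per_thread_counter;

    // C11 spelling, now tokenized as THREADLOCALC11:
    //     _Thread_local int per_thread_counter;
    // (left in a comment so this snippet stays valid C++, where the keyword
    // is spelled `thread_local` instead)

    int main() { return per_thread_counter; }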
src/ResolvExpr/CandidateFinder.cpp
rebf8ca5 r23a08aa0 269 269 unsigned nextArg, unsigned tupleStart = 0, Cost cost = Cost::zero, 270 270 unsigned nextExpl = 0, unsigned explAlt = 0 ) 271 : parent(parent), expr( expr ), cost( cost ), env( move( env ) ), need(move( need ) ),272 have( move( have ) ), open(move( open ) ), nextArg( nextArg ), tupleStart( tupleStart ),271 : parent(parent), expr( expr ), cost( cost ), env( std::move( env ) ), need( std::move( need ) ), 272 have( std::move( have ) ), open( std::move( open ) ), nextArg( nextArg ), tupleStart( tupleStart ), 273 273 nextExpl( nextExpl ), explAlt( explAlt ) {} 274 274 … … 276 276 const ArgPack & o, ast::TypeEnvironment && env, ast::AssertionSet && need, 277 277 ast::AssertionSet && have, ast::OpenVarSet && open, unsigned nextArg, Cost added ) 278 : parent( o.parent ), expr( o.expr ), cost( o.cost + added ), env( move( env ) ),279 need( move( need ) ), have( move( have ) ), open(move( open ) ), nextArg( nextArg ),278 : parent( o.parent ), expr( o.expr ), cost( o.cost + added ), env( std::move( env ) ), 279 need( std::move( need ) ), have( std::move( have ) ), open( std::move( open ) ), nextArg( nextArg ), 280 280 tupleStart( o.tupleStart ), nextExpl( 0 ), explAlt( 0 ) {} 281 281 … … 301 301 // reset pack to appropriate tuple 302 302 std::vector< ast::ptr< ast::Expr > > exprv( exprs.begin(), exprs.end() ); 303 expr = new ast::TupleExpr{ expr->location, move( exprv ) };303 expr = new ast::TupleExpr{ expr->location, std::move( exprv ) }; 304 304 tupleStart = pack->tupleStart - 1; 305 305 parent = pack->parent; … … 404 404 newResult.open, symtab ) 405 405 ) { 406 finalResults.emplace_back( move( newResult ) );406 finalResults.emplace_back( std::move( newResult ) ); 407 407 } 408 408 … … 423 423 if ( expl.exprs.empty() ) { 424 424 results.emplace_back( 425 results[i], move( env ), copy( results[i].need ),426 copy( results[i].have ), move( open ), nextArg + 1, expl.cost );425 results[i], std::move( env ), copy( results[i].need ), 426 copy( results[i].have ), std::move( open ), nextArg + 1, expl.cost ); 427 427 428 428 continue; … … 431 431 // add new result 432 432 results.emplace_back( 433 i, expl.exprs.front(), move( env ), copy( results[i].need ),434 copy( results[i].have ), move( open ), nextArg + 1, nTuples,433 i, expl.exprs.front(), std::move( env ), copy( results[i].need ), 434 copy( results[i].have ), std::move( open ), nextArg + 1, nTuples, 435 435 expl.cost, expl.exprs.size() == 1 ? 0 : 1, j ); 436 436 } … … 444 444 // splice final results onto results 445 445 for ( std::size_t i = 0; i < finalResults.size(); ++i ) { 446 results.emplace_back( move( finalResults[i] ) );446 results.emplace_back( std::move( finalResults[i] ) ); 447 447 } 448 448 return ! 
finalResults.empty(); … … 478 478 479 479 results.emplace_back( 480 i, expr, move( env ), move( need ), move( have ),move( open ), nextArg,480 i, expr, std::move( env ), std::move( need ), std::move( have ), std::move( open ), nextArg, 481 481 nTuples, Cost::zero, nextExpl, results[i].explAlt ); 482 482 } … … 494 494 if ( unify( paramType, cnst->result, env, need, have, open, symtab ) ) { 495 495 results.emplace_back( 496 i, new ast::DefaultArgExpr{ cnst->location, cnst }, move( env ),497 move( need ), move( have ),move( open ), nextArg, nTuples );496 i, new ast::DefaultArgExpr{ cnst->location, cnst }, std::move( env ), 497 std::move( need ), std::move( have ), std::move( open ), nextArg, nTuples ); 498 498 } 499 499 } … … 516 516 if ( expl.exprs.empty() ) { 517 517 results.emplace_back( 518 results[i], move( env ), move( need ), move( have ),move( open ),518 results[i], std::move( env ), std::move( need ), std::move( have ), std::move( open ), 519 519 nextArg + 1, expl.cost ); 520 520 … … 538 538 // add new result 539 539 results.emplace_back( 540 i, expr, move( env ), move( need ), move( have ),move( open ),540 i, expr, std::move( env ), std::move( need ), std::move( have ), std::move( open ), 541 541 nextArg + 1, nTuples, expl.cost, expl.exprs.size() == 1 ? 0 : 1, j ); 542 542 } … … 576 576 restructureCast( idx, toType->getComponent( i ), isGenerated ) ); 577 577 } 578 return new ast::TupleExpr{ arg->location, move( components ) };578 return new ast::TupleExpr{ arg->location, std::move( components ) }; 579 579 } else { 580 580 // handle normally … … 672 672 } 673 673 std::vector< ast::ptr< ast::Expr > > vargs( args.begin(), args.end() ); 674 appExpr->args = move( vargs );674 appExpr->args = std::move( vargs ); 675 675 // build and validate new candidate 676 676 auto newCand = … … 783 783 if ( expl.exprs.empty() ) { 784 784 results.emplace_back( 785 results[i], move( env ), copy( results[i].need ),786 copy( results[i].have ), move( open ), nextArg + 1,785 results[i], std::move( env ), copy( results[i].need ), 786 copy( results[i].have ), std::move( open ), nextArg + 1, 787 787 expl.cost ); 788 788 … … 792 792 // add new result 793 793 results.emplace_back( 794 i, expl.exprs.front(), move( env ), copy( results[i].need ),795 copy( results[i].have ), move( open ), nextArg + 1, 0, expl.cost,794 i, expl.exprs.front(), std::move( env ), copy( results[i].need ), 795 copy( results[i].have ), std::move( open ), nextArg + 1, 0, expl.cost, 796 796 expl.exprs.size() == 1 ? 
0 : 1, j ); 797 797 } … … 843 843 // as a member expression 844 844 addAnonConversions( newCand ); 845 candidates.emplace_back( move( newCand ) );845 candidates.emplace_back( std::move( newCand ) ); 846 846 } 847 847 } … … 901 901 const ast::EnumDecl * enumDecl = enumInst->base; 902 902 if ( const ast::Type* enumType = enumDecl->base ) { 903 // instance of enum (T) is a instance of type (T) 903 // instance of enum (T) is a instance of type (T) 904 904 funcFinder.otypeKeys.insert(Mangle::mangle(enumType, Mangle::NoGenericParams | Mangle::Type)); 905 905 } else { … … 907 907 funcFinder.otypeKeys.insert(Mangle::mangle(enumDecl, Mangle::NoGenericParams | Mangle::Type)); 908 908 } 909 } 909 } 910 910 else funcFinder.otypeKeys.insert(Mangle::mangle(argType, Mangle::NoGenericParams | Mangle::Type)); 911 911 } … … 986 986 funcE.emplace_back( *func, symtab ); 987 987 } 988 argExpansions.emplace_front( move( funcE ) );988 argExpansions.emplace_front( std::move( funcE ) ); 989 989 990 990 for ( const CandidateRef & op : opFinder ) { … … 1030 1030 if ( cvtCost != Cost::infinity ) { 1031 1031 withFunc->cvtCost = cvtCost; 1032 candidates.emplace_back( move( withFunc ) );1033 } 1034 } 1035 found = move( candidates );1032 candidates.emplace_back( std::move( withFunc ) ); 1033 } 1034 } 1035 found = std::move( candidates ); 1036 1036 1037 1037 // use a new list so that candidates are not examined by addAnonConversions twice … … 1131 1131 CandidateRef newCand = std::make_shared<Candidate>( 1132 1132 restructureCast( cand->expr, toType, castExpr->isGenerated ), 1133 copy( cand->env ), move( open ),move( need ), cand->cost,1133 copy( cand->env ), std::move( open ), std::move( need ), cand->cost, 1134 1134 cand->cost + thisCost ); 1135 1135 inferParameters( newCand, matches ); … … 1285 1285 // as a name expression 1286 1286 addAnonConversions( newCand ); 1287 candidates.emplace_back( move( newCand ) );1287 candidates.emplace_back( std::move( newCand ) ); 1288 1288 } 1289 1289 } … … 1394 1394 new ast::LogicalExpr{ 1395 1395 logicalExpr->location, r1->expr, r2->expr, logicalExpr->isAnd }, 1396 move( env ), move( open ),move( need ), r1->cost + r2->cost );1396 std::move( env ), std::move( open ), std::move( need ), r1->cost + r2->cost ); 1397 1397 } 1398 1398 } … … 1452 1452 // output candidate 1453 1453 CandidateRef newCand = std::make_shared<Candidate>( 1454 newExpr, move( env ), move( open ),move( need ), cost );1454 newExpr, std::move( env ), std::move( open ), std::move( need ), cost ); 1455 1455 inferParameters( newCand, candidates ); 1456 1456 } … … 1519 1519 // add candidate 1520 1520 CandidateRef newCand = std::make_shared<Candidate>( 1521 newExpr, move( env ), move( open ),move( need ),1521 newExpr, std::move( env ), std::move( open ), std::move( need ), 1522 1522 r1->cost + r2->cost ); 1523 1523 inferParameters( newCand, candidates ); … … 1548 1548 1549 1549 addCandidate( 1550 new ast::TupleExpr{ tupleExpr->location, move( exprs ) },1551 move( env ), move( open ),move( need ), sumCost( subs ) );1550 new ast::TupleExpr{ tupleExpr->location, std::move( exprs ) }, 1551 std::move( env ), std::move( open ), std::move( need ), sumCost( subs ) ); 1552 1552 } 1553 1553 } … … 1635 1635 initExpr->location, restructureCast( cand->expr, toType ), 1636 1636 initAlt.designation }, 1637 move(env), move( open ),move( need ), cand->cost, thisCost );1637 std::move(env), std::move( open ), std::move( need ), cand->cost, thisCost ); 1638 1638 inferParameters( newCand, matches ); 1639 1639 } … … 1768 1768 
cand->env.applyFree( newResult ); 1769 1769 cand->expr = ast::mutate_field( 1770 cand->expr.get(), &ast::Expr::result, move( newResult ) );1770 cand->expr.get(), &ast::Expr::result, std::move( newResult ) ); 1771 1771 1772 1772 out.emplace_back( cand ); … … 1854 1854 1855 1855 auto oldsize = candidates.size(); 1856 candidates = move( pruned );1856 candidates = std::move( pruned ); 1857 1857 1858 1858 PRINT( -
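The blanket switch from unqualified move(...) to std::move(...) in this file (and the matching std::forward fix in Print.cpp above) removes a dependence on unqualified lookup: once another function named move is visible, through a using-directive or argument-dependent lookup, the unqualified call can silently bind to it instead of the standard one. A small self-contained illustration of the hazard; the util::move helper here is invented for the example:

    #include <iostream>
    #include <string>
    #include <utility>

    namespace util {
        struct Tracked { std::string name; };
        // An unrelated helper that happens to be called `move`.
        void move( Tracked & t ) { t.name += " (touched by util::move)"; }
    }

    int main() {
        util::Tracked t{ "node" };
        // Unqualified: argument-dependent lookup finds util::move, so no move
        // happens at all; the unrelated helper runs instead.
        move( t );
        // Qualified: always the standard library's move.
        util::Tracked u = std::move( t );
        std::cout << u.name << '\n';         // node (touched by util::move)
    }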
src/SynTree/Statement.cc
rebf8ca5 r23a08aa0 105 105 }; 106 106 107 BranchStmt::BranchStmt( Label target, Type type ) throw ( SemanticErrorException ):107 BranchStmt::BranchStmt( Label target, Type type ) : 108 108 Statement(), originalTarget( target ), target( target ), computedTarget( nullptr ), type( type ) { 109 109 //actually this is a syntactic error signaled by the parser … … 113 113 } 114 114 115 BranchStmt::BranchStmt( Expression * computedTarget, Type type ) throw ( SemanticErrorException ):115 BranchStmt::BranchStmt( Expression * computedTarget, Type type ) : 116 116 Statement(), computedTarget( computedTarget ), type( type ) { 117 117 if ( type != BranchStmt::Goto || computedTarget == nullptr ) { … … 211 211 } 212 212 213 CaseStmt::CaseStmt( Expression * condition, const list<Statement *> & statements, bool deflt ) throw ( SemanticErrorException ):213 CaseStmt::CaseStmt( Expression * condition, const list<Statement *> & statements, bool deflt ) : 214 214 Statement(), condition( condition ), stmts( statements ), _isDefault( deflt ) { 215 215 if ( isDefault() && condition != nullptr ) SemanticError( condition, "default case with condition: " ); … … 575 575 } 576 576 577 MutexStmt::MutexStmt( Statement * stmt, const list<Expression *> mutexObjs ) 577 MutexStmt::MutexStmt( Statement * stmt, const list<Expression *> mutexObjs ) 578 578 : Statement(), stmt( stmt ), mutexObjs( mutexObjs ) { } 579 579 -
src/SynTree/Statement.h
rebf8ca5 r23a08aa0 200 200 std::list<Statement *> stmts; 201 201 202 CaseStmt( Expression * conditions, const std::list<Statement *> & stmts, bool isdef = false ) throw (SemanticErrorException);202 CaseStmt( Expression * conditions, const std::list<Statement *> & stmts, bool isdef = false ); 203 203 CaseStmt( const CaseStmt & other ); 204 204 virtual ~CaseStmt(); … … 289 289 Type type; 290 290 291 BranchStmt( Label target, Type ) throw (SemanticErrorException);292 BranchStmt( Expression * computedTarget, Type ) throw (SemanticErrorException);291 BranchStmt( Label target, Type ); 292 BranchStmt( Expression * computedTarget, Type ); 293 293 294 294 Label get_originalTarget() { return originalTarget; } -
src/SynTree/Type.cc
rebf8ca5 r23a08aa0 80 80 // These must remain in the same order as the corresponding bit fields. 81 81 const char * Type::FuncSpecifiersNames[] = { "inline", "_Noreturn", "fortran" }; 82 const char * Type::StorageClassesNames[] = { "extern", "static", "auto", "register", "_ Thread_local" };82 const char * Type::StorageClassesNames[] = { "extern", "static", "auto", "register", "__thread", "_Thread_local" }; 83 83 const char * Type::QualifiersNames[] = { "const", "restrict", "volatile", "mutex", "_Atomic" }; 84 84 -
src/SynTree/Type.h
rebf8ca5 r23a08aa0 84 84 }; // FuncSpecifiers 85 85 86 enum { Extern = 1 << 0, Static = 1 << 1, Auto = 1 << 2, Register = 1 << 3, Threadlocal = 1 << 4, NumStorageClass = 5};86 enum { Extern = 1 << 0, Static = 1 << 1, Auto = 1 << 2, Register = 1 << 3, ThreadlocalGcc = 1 << 4, ThreadlocalC11 = 1 << 5, NumStorageClass = 6 }; 87 87 static const char * StorageClassesNames[]; 88 88 union StorageClasses { … … 93 93 bool is_auto : 1; 94 94 bool is_register : 1; 95 bool is_threadlocal : 1; 95 bool is_threadlocalGcc : 1; 96 bool is_threadlocalC11 : 1; 96 97 }; 97 98 … … 100 101 // equality (==, !=) works implicitly on first field "val", relational operations are undefined. 101 102 BFCommon( StorageClasses, NumStorageClass ) 103 104 bool is_threadlocal_any() { return this->is_threadlocalC11 || this->is_threadlocalGcc; } 102 105 }; // StorageClasses 103 106 -
src/Tuples/TupleExpansionNew.cpp
rebf8ca5 r23a08aa0 101 101 102 102 /// Replaces Tuple Assign & Index Expressions, and Tuple Types. 103 struct TupleMainExpander :103 struct TupleMainExpander final : 104 104 public ast::WithGuards, 105 105 public ast::WithVisitorRef<TupleMainExpander>, … … 254 254 } 255 255 256 struct TupleExprExpander {256 struct TupleExprExpander final { 257 257 ast::Expr const * postvisit( ast::TupleExpr const * expr ) { 258 258 return replaceTupleExpr( expr->location, -
src/Virtual/ExpandCasts.cc
rebf8ca5 r23a08aa0 317 317 }; 318 318 319 struct ExpandCastsCore {319 struct ExpandCastsCore final { 320 320 void previsit( ast::FunctionDecl const * decl ); 321 321 void previsit( ast::StructDecl const * decl ); … … 362 362 } 363 363 364 /// Copy newType, but give the copy the params of the oldType. 364 365 ast::StructInstType * polyCopy( 365 366 ast::StructInstType const * oldType, -
src/config.h.in
rebf8ca5 r23a08aa0 27 27 /* Location of cfa install. */ 28 28 #undef CFA_PREFIX 29 30 /* Sets whether or not to use the new-ast, this is adefault value and can be31 overrided by --old-ast and --new-ast */32 #undef CFA_USE_NEW_AST33 29 34 30 /* Major.Minor */ -
src/main.cc
rebf8ca5 r23a08aa0 10 10 // Created On : Fri May 15 23:12:02 2015 11 11 // Last Modified By : Andrew Beach 12 // Last Modified On : Thu 11 12:18:00 202213 // Update Count : 67 712 // Last Modified On : Thu Sep 15 13:58:00 2022 13 // Update Count : 678 14 14 // 15 15 … … 38 38 #include "CodeGen/Generate.h" // for generate 39 39 #include "CodeGen/LinkOnce.h" // for translateLinkOnce 40 #include "CodeTools/DeclStats.h" // for printDeclStats41 #include "CodeTools/ResolvProtoDump.h" // for dumpAsResolvProto42 40 #include "CodeTools/TrackLoc.h" // for fillLocations 43 41 #include "Common/CodeLocationTools.hpp" // for forceFillCodeLocations … … 45 43 #include "Common/DeclStats.hpp" // for printDeclStats 46 44 #include "Common/ResolvProtoDump.hpp" // for dumpAsResolverProto 47 #include "Common/Stats.h" 48 #include "Common/PassVisitor.h" 49 #include "Common/SemanticError.h" // for SemanticError 45 #include "Common/Stats.h" // for Stats 50 46 #include "Common/UnimplementedError.h" // for UnimplementedError 51 47 #include "Common/utility.h" // for deleteAll, filter, printAll … … 53 49 #include "Concurrency/Waitfor.h" // for generateWaitfor 54 50 #include "ControlStruct/ExceptDecl.h" // for translateExcept 55 #include "ControlStruct/ExceptTranslate.h" // for translate EHM51 #include "ControlStruct/ExceptTranslate.h" // for translateThrows, translat... 56 52 #include "ControlStruct/FixLabels.hpp" // for fixLabels 57 53 #include "ControlStruct/HoistControlDecls.hpp" // hoistControlDecls 58 #include "ControlStruct/Mutate.h" // for mutate59 54 #include "GenPoly/Box.h" // for box 60 55 #include "GenPoly/InstantiateGeneric.h" // for instantiateGeneric … … 66 61 #include "Parser/ParseNode.h" // for DeclarationNode, buildList 67 62 #include "Parser/TypedefTable.h" // for TypedefTable 68 #include "ResolvExpr/AlternativePrinter.h" // for AlternativePrinter69 63 #include "ResolvExpr/CandidatePrinter.hpp" // for printCandidates 70 64 #include "ResolvExpr/Resolver.h" // for resolve 71 #include "SymTab/Validate.h" // for validate72 #include "SymTab/ValidateType.h" // for linkReferenceToTypes73 65 #include "SynTree/LinkageSpec.h" // for Spec, Cforall, Intrinsic 74 66 #include "SynTree/Declaration.h" // for Declaration 75 #include "SynTree/Visitor.h" // for acceptAll76 67 #include "Tuples/Tuples.h" // for expandMemberTuples, expan... 77 68 #include "Validate/Autogen.hpp" // for autogenerateRoutines … … 330 321 Stats::Time::StopBlock(); 331 322 332 if( useNewAST ) { 333 if (Stats::Counters::enabled) { 334 ast::pass_visitor_stats.avg = Stats::Counters::build<Stats::Counters::AverageCounter<double>>("Average Depth - New"); 335 ast::pass_visitor_stats.max = Stats::Counters::build<Stats::Counters::MaxCounter<double>>("Max depth - New"); 336 } 337 auto transUnit = convert( move( translationUnit ) ); 338 339 forceFillCodeLocations( transUnit ); 340 341 PASS( "Translate Exception Declarations", ControlStruct::translateExcept( transUnit ) ); 342 if ( exdeclp ) { 343 dump( move( transUnit ) ); 344 return EXIT_SUCCESS; 345 } 346 347 PASS( "Verify Ctor, Dtor & Assign", Validate::verifyCtorDtorAssign( transUnit ) ); 348 PASS( "Hoist Type Decls", Validate::hoistTypeDecls( transUnit ) ); 349 // Hoist Type Decls pulls some declarations out of contexts where 350 // locations are not tracked. Perhaps they should be, but for now 351 // the full fill solves it. 
352 forceFillCodeLocations( transUnit ); 353 354 PASS( "Replace Typedefs", Validate::replaceTypedef( transUnit ) ); 355 PASS( "Fix Return Types", Validate::fixReturnTypes( transUnit ) ); 356 PASS( "Enum and Pointer Decay", Validate::decayEnumsAndPointers( transUnit ) ); 357 358 PASS( "Link Reference To Types", Validate::linkReferenceToTypes( transUnit ) ); 359 360 PASS( "Fix Qualified Types", Validate::fixQualifiedTypes( transUnit ) ); 361 PASS( "Hoist Struct", Validate::hoistStruct( transUnit ) ); 362 PASS( "Eliminate Typedef", Validate::eliminateTypedef( transUnit ) ); 363 PASS( "Validate Generic Parameters", Validate::fillGenericParameters( transUnit ) ); 364 PASS( "Translate Dimensions", Validate::translateDimensionParameters( transUnit ) ); 365 PASS( "Check Function Returns", Validate::checkReturnStatements( transUnit ) ); 366 PASS( "Fix Return Statements", InitTweak::fixReturnStatements( transUnit ) ); 367 PASS( "Implement Concurrent Keywords", Concurrency::implementKeywords( transUnit ) ); 368 PASS( "Forall Pointer Decay", Validate::decayForallPointers( transUnit ) ); 369 PASS( "Hoist Control Declarations", ControlStruct::hoistControlDecls( transUnit ) ); 370 371 PASS( "Generate Autogen Routines", Validate::autogenerateRoutines( transUnit ) ); 372 373 PASS( "Implement Mutex", Concurrency::implementMutex( transUnit ) ); 374 PASS( "Implement Thread Start", Concurrency::implementThreadStarter( transUnit ) ); 375 PASS( "Compound Literal", Validate::handleCompoundLiterals( transUnit ) ); 376 PASS( "Set Length From Initializer", Validate::setLengthFromInitializer( transUnit ) ); 377 PASS( "Find Global Decls", Validate::findGlobalDecls( transUnit ) ); 378 PASS( "Fix Label Address", Validate::fixLabelAddresses( transUnit ) ); 379 380 if ( symtabp ) { 381 return EXIT_SUCCESS; 382 } // if 383 384 if ( expraltp ) { 385 ResolvExpr::printCandidates( transUnit ); 386 return EXIT_SUCCESS; 387 } // if 388 389 if ( validp ) { 390 dump( move( transUnit ) ); 391 return EXIT_SUCCESS; 392 } // if 393 394 PASS( "Translate Throws", ControlStruct::translateThrows( transUnit ) ); 395 PASS( "Fix Labels", ControlStruct::fixLabels( transUnit ) ); 396 PASS( "Fix Names", CodeGen::fixNames( transUnit ) ); 397 PASS( "Gen Init", InitTweak::genInit( transUnit ) ); 398 PASS( "Expand Member Tuples" , Tuples::expandMemberTuples( transUnit ) ); 399 400 if ( libcfap ) { 401 // Generate the bodies of cfa library functions. 402 LibCfa::makeLibCfa( transUnit ); 403 } // if 404 405 if ( declstatsp ) { 406 printDeclStats( transUnit ); 407 return EXIT_SUCCESS; 408 } // if 409 410 if ( bresolvep ) { 411 dump( move( transUnit ) ); 412 return EXIT_SUCCESS; 413 } // if 414 415 if ( resolvprotop ) { 416 dumpAsResolverProto( transUnit ); 417 return EXIT_SUCCESS; 418 } // if 419 420 PASS( "Resolve", ResolvExpr::resolve( transUnit ) ); 421 if ( exprp ) { 422 dump( move( transUnit ) ); 423 return EXIT_SUCCESS; 424 } // if 425 426 forceFillCodeLocations( transUnit ); 427 428 PASS( "Fix Init", InitTweak::fix(transUnit, buildingLibrary())); 429 430 // fix ObjectDecl - replaces ConstructorInit nodes 431 if ( ctorinitp ) { 432 dump( move( transUnit ) ); 433 return EXIT_SUCCESS; 434 } // if 435 436 // Currently not working due to unresolved issues with UniqueExpr 437 PASS( "Expand Unique Expr", Tuples::expandUniqueExpr( transUnit ) ); // xxx - is this the right place for this? 
want to expand ASAP so tha, sequent passes don't need to worry about double-visiting a unique expr - needs to go after InitTweak::fix so that copy constructed return declarations are reused 438 439 PASS( "Translate Tries", ControlStruct::translateTries( transUnit ) ); 440 PASS( "Gen Waitfor", Concurrency::generateWaitFor( transUnit ) ); 441 442 // Needs to happen before tuple types are expanded. 443 PASS( "Convert Specializations", GenPoly::convertSpecializations( transUnit ) ); 444 445 PASS( "Expand Tuples", Tuples::expandTuples( transUnit ) ); 446 447 if ( tuplep ) { 448 dump( move( transUnit ) ); 449 return EXIT_SUCCESS; 450 } // if 451 452 // Must come after Translate Tries. 453 PASS( "Virtual Expand Casts", Virtual::expandCasts( transUnit ) ); 454 455 translationUnit = convert( move( transUnit ) ); 456 } else { 457 PASS( "Translate Exception Declarations", ControlStruct::translateExcept( translationUnit ) ); 458 if ( exdeclp ) { 459 dump( translationUnit ); 460 return EXIT_SUCCESS; 461 } // if 462 463 // add the assignment statement after the initialization of a type parameter 464 PASS( "Validate", SymTab::validate( translationUnit ) ); 465 466 if ( symtabp ) { 467 deleteAll( translationUnit ); 468 return EXIT_SUCCESS; 469 } // if 470 471 if ( expraltp ) { 472 PassVisitor<ResolvExpr::AlternativePrinter> printer( cout ); 473 acceptAll( translationUnit, printer ); 474 return EXIT_SUCCESS; 475 } // if 476 477 if ( validp ) { 478 dump( translationUnit ); 479 return EXIT_SUCCESS; 480 } // if 481 482 PASS( "Translate Throws", ControlStruct::translateThrows( translationUnit ) ); 483 PASS( "Fix Labels", ControlStruct::fixLabels( translationUnit ) ); 484 PASS( "Fix Names", CodeGen::fixNames( translationUnit ) ); 485 PASS( "Gen Init", InitTweak::genInit( translationUnit ) ); 486 PASS( "Expand Member Tuples" , Tuples::expandMemberTuples( translationUnit ) ); 487 488 if ( libcfap ) { 489 // Generate the bodies of cfa library functions. 490 LibCfa::makeLibCfa( translationUnit ); 491 } // if 492 493 if ( declstatsp ) { 494 CodeTools::printDeclStats( translationUnit ); 495 deleteAll( translationUnit ); 496 return EXIT_SUCCESS; 497 } // if 498 499 if ( bresolvep ) { 500 dump( translationUnit ); 501 return EXIT_SUCCESS; 502 } // if 503 504 CodeTools::fillLocations( translationUnit ); 505 506 if ( resolvprotop ) { 507 CodeTools::dumpAsResolvProto( translationUnit ); 508 return EXIT_SUCCESS; 509 } // if 510 511 PASS( "Resolve", ResolvExpr::resolve( translationUnit ) ); 512 if ( exprp ) { 513 dump( translationUnit ); 514 return EXIT_SUCCESS; 515 } 516 517 PASS( "Fix Init", InitTweak::fix( translationUnit, buildingLibrary() ) ); 518 519 // fix ObjectDecl - replaces ConstructorInit nodes 520 if ( ctorinitp ) { 521 dump ( translationUnit ); 522 return EXIT_SUCCESS; 523 } // if 524 525 PASS( "Expand Unique Expr", Tuples::expandUniqueExpr( translationUnit ) ); // xxx - is this the right place for this? want to expand ASAP so tha, sequent passes don't need to worry about double-visiting a unique expr - needs to go after InitTweak::fix so that copy constructed return declarations are reused 526 PASS( "Translate Tries", ControlStruct::translateTries( translationUnit ) ); 527 PASS( "Gen Waitfor", Concurrency::generateWaitFor( translationUnit ) ); 528 PASS( "Convert Specializations", GenPoly::convertSpecializations( translationUnit ) ); // needs to happen before tuple types are expanded 529 PASS( "Expand Tuples", Tuples::expandTuples( translationUnit ) ); // xxx - is this the right place for this? 
530 531 if ( tuplep ) { 532 dump( translationUnit ); 533 return EXIT_SUCCESS; 534 } // if 535 536 PASS( "Virtual Expand Casts", Virtual::expandCasts( translationUnit ) ); // Must come after translateEHM 323 if (Stats::Counters::enabled) { 324 ast::pass_visitor_stats.avg = Stats::Counters::build<Stats::Counters::AverageCounter<double>>("Average Depth - New"); 325 ast::pass_visitor_stats.max = Stats::Counters::build<Stats::Counters::MaxCounter<double>>("Max depth - New"); 537 326 } 538 539 PASS( "Instantiate Generics", GenPoly::instantiateGeneric( translationUnit ) ); 327 auto transUnit = convert( std::move( translationUnit ) ); 328 329 forceFillCodeLocations( transUnit ); 330 331 PASS( "Translate Exception Declarations", ControlStruct::translateExcept( transUnit ) ); 332 if ( exdeclp ) { 333 dump( std::move( transUnit ) ); 334 return EXIT_SUCCESS; 335 } 336 337 PASS( "Verify Ctor, Dtor & Assign", Validate::verifyCtorDtorAssign( transUnit ) ); 338 PASS( "Hoist Type Decls", Validate::hoistTypeDecls( transUnit ) ); 339 // Hoist Type Decls pulls some declarations out of contexts where 340 // locations are not tracked. Perhaps they should be, but for now 341 // the full fill solves it. 342 forceFillCodeLocations( transUnit ); 343 344 PASS( "Replace Typedefs", Validate::replaceTypedef( transUnit ) ); 345 PASS( "Fix Return Types", Validate::fixReturnTypes( transUnit ) ); 346 PASS( "Enum and Pointer Decay", Validate::decayEnumsAndPointers( transUnit ) ); 347 348 PASS( "Link Reference To Types", Validate::linkReferenceToTypes( transUnit ) ); 349 350 PASS( "Fix Qualified Types", Validate::fixQualifiedTypes( transUnit ) ); 351 PASS( "Hoist Struct", Validate::hoistStruct( transUnit ) ); 352 PASS( "Eliminate Typedef", Validate::eliminateTypedef( transUnit ) ); 353 PASS( "Validate Generic Parameters", Validate::fillGenericParameters( transUnit ) ); 354 PASS( "Translate Dimensions", Validate::translateDimensionParameters( transUnit ) ); 355 PASS( "Check Function Returns", Validate::checkReturnStatements( transUnit ) ); 356 PASS( "Fix Return Statements", InitTweak::fixReturnStatements( transUnit ) ); 357 PASS( "Implement Concurrent Keywords", Concurrency::implementKeywords( transUnit ) ); 358 PASS( "Forall Pointer Decay", Validate::decayForallPointers( transUnit ) ); 359 PASS( "Hoist Control Declarations", ControlStruct::hoistControlDecls( transUnit ) ); 360 361 PASS( "Generate Autogen Routines", Validate::autogenerateRoutines( transUnit ) ); 362 363 PASS( "Implement Mutex", Concurrency::implementMutex( transUnit ) ); 364 PASS( "Implement Thread Start", Concurrency::implementThreadStarter( transUnit ) ); 365 PASS( "Compound Literal", Validate::handleCompoundLiterals( transUnit ) ); 366 PASS( "Set Length From Initializer", Validate::setLengthFromInitializer( transUnit ) ); 367 PASS( "Find Global Decls", Validate::findGlobalDecls( transUnit ) ); 368 PASS( "Fix Label Address", Validate::fixLabelAddresses( transUnit ) ); 369 370 if ( symtabp ) { 371 return EXIT_SUCCESS; 372 } // if 373 374 if ( expraltp ) { 375 ResolvExpr::printCandidates( transUnit ); 376 return EXIT_SUCCESS; 377 } // if 378 379 if ( validp ) { 380 dump( std::move( transUnit ) ); 381 return EXIT_SUCCESS; 382 } // if 383 384 PASS( "Translate Throws", ControlStruct::translateThrows( transUnit ) ); 385 PASS( "Fix Labels", ControlStruct::fixLabels( transUnit ) ); 386 PASS( "Fix Names", CodeGen::fixNames( transUnit ) ); 387 PASS( "Gen Init", InitTweak::genInit( transUnit ) ); 388 PASS( "Expand Member Tuples" , Tuples::expandMemberTuples( transUnit 
) ); 389 390 if ( libcfap ) { 391 // Generate the bodies of cfa library functions. 392 LibCfa::makeLibCfa( transUnit ); 393 } // if 394 395 if ( declstatsp ) { 396 printDeclStats( transUnit ); 397 return EXIT_SUCCESS; 398 } // if 399 400 if ( bresolvep ) { 401 dump( std::move( transUnit ) ); 402 return EXIT_SUCCESS; 403 } // if 404 405 if ( resolvprotop ) { 406 dumpAsResolverProto( transUnit ); 407 return EXIT_SUCCESS; 408 } // if 409 410 PASS( "Resolve", ResolvExpr::resolve( transUnit ) ); 411 if ( exprp ) { 412 dump( std::move( transUnit ) ); 413 return EXIT_SUCCESS; 414 } // if 415 416 forceFillCodeLocations( transUnit ); 417 418 PASS( "Fix Init", InitTweak::fix(transUnit, buildingLibrary())); 419 420 // fix ObjectDecl - replaces ConstructorInit nodes 421 if ( ctorinitp ) { 422 dump( std::move( transUnit ) ); 423 return EXIT_SUCCESS; 424 } // if 425 426 // Currently not working due to unresolved issues with UniqueExpr 427 PASS( "Expand Unique Expr", Tuples::expandUniqueExpr( transUnit ) ); // xxx - is this the right place for this? want to expand ASAP so tha, sequent passes don't need to worry about double-visiting a unique expr - needs to go after InitTweak::fix so that copy constructed return declarations are reused 428 429 PASS( "Translate Tries", ControlStruct::translateTries( transUnit ) ); 430 PASS( "Gen Waitfor", Concurrency::generateWaitFor( transUnit ) ); 431 432 // Needs to happen before tuple types are expanded. 433 PASS( "Convert Specializations", GenPoly::convertSpecializations( transUnit ) ); 434 435 PASS( "Expand Tuples", Tuples::expandTuples( transUnit ) ); 436 437 if ( tuplep ) { 438 dump( std::move( transUnit ) ); 439 return EXIT_SUCCESS; 440 } // if 441 442 // Must come after Translate Tries. 443 PASS( "Virtual Expand Casts", Virtual::expandCasts( transUnit ) ); 444 445 PASS( "Instantiate Generics", GenPoly::instantiateGeneric( transUnit ) ); 446 447 translationUnit = convert( std::move( transUnit ) ); 448 540 449 if ( genericsp ) { 541 450 dump( translationUnit ); … … 620 529 621 530 622 static const char optstring[] = ":c:ghlLmNnpd OAP:S:twW:D:";531 static const char optstring[] = ":c:ghlLmNnpdP:S:twW:D:"; 623 532 624 533 enum { PreludeDir = 128 }; … … 634 543 { "prototypes", no_argument, nullptr, 'p' }, 635 544 { "deterministic-out", no_argument, nullptr, 'd' }, 636 { "old-ast", no_argument, nullptr, 'O'},637 { "new-ast", no_argument, nullptr, 'A'},638 545 { "print", required_argument, nullptr, 'P' }, 639 546 { "prelude-dir", required_argument, nullptr, PreludeDir }, … … 657 564 "do not generate prelude prototypes => prelude not printed", // -p 658 565 "only print deterministic output", // -d 659 "Use the old-ast", // -O660 "Use the new-ast", // -A661 566 "print", // -P 662 567 "<directory> prelude directory for debug/nodebug", // no flag … … 767 672 deterministic_output = true; 768 673 break; 769 case 'O': // don't print non-deterministic output770 useNewAST = false;771 break;772 case 'A': // don't print non-deterministic output773 useNewAST = true;774 break;775 674 case 'P': // print options 776 675 for ( int i = 0;; i += 1 ) { … … 889 788 890 789 static void dump( ast::TranslationUnit && transUnit, ostream & out ) { 891 std::list< Declaration * > translationUnit = convert( move( transUnit ) );790 std::list< Declaration * > translationUnit = convert( std::move( transUnit ) ); 892 791 dump( translationUnit, out ); 893 792 } -
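With the old-AST branch removed, the driver in main.cc now runs one fixed sequence of PASS( ... ) stages over the converted translation unit and exits early whenever one of the dump flags (exdeclp, validp, exprp, ctorinitp, tuplep, ...) is set. The sketch below shows that driver shape in plain C; the names (Unit, stage, run_pipeline) are hypothetical stand-ins for illustration, not the actual PASS macro or the Cforall pass API.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins for the real translation unit and pass functions. */
typedef struct Unit { int decls; } Unit;
typedef void (*pass_fn)(Unit *);

struct stage {
	const char * name;   /* label, like the first argument of PASS( ... )           */
	pass_fn run;         /* transformation applied to the translation unit          */
	bool dump_after;     /* stop after this stage, like the exprp/validp/... checks */
};

static void resolve(Unit * u)  { (void)u; /* placeholder pass body */ }
static void code_gen(Unit * u) { (void)u; /* placeholder pass body */ }

/* Run every stage in order; a requested dump ends the pipeline early. */
static int run_pipeline(Unit * unit, const struct stage * stages, size_t n) {
	for (size_t i = 0; i < n; ++i) {
		fprintf(stderr, "pass: %s\n", stages[i].name);
		stages[i].run(unit);
		if (stages[i].dump_after) {
			printf("dump requested after '%s'\n", stages[i].name);
			return EXIT_SUCCESS;
		}
	}
	return EXIT_SUCCESS;
}

int main(void) {
	Unit unit = { 0 };
	const struct stage stages[] = {
		{ "Resolve",  resolve,  false },
		{ "Code Gen", code_gen, false },
	};
	return run_pipeline(&unit, stages, sizeof stages / sizeof stages[0]);
}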
tests/.expect/declarationSpecifier.arm64.txt
rebf8ca5 r23a08aa0 735 735 } 736 736 static volatile const struct __anonymous15 _X3x36KVS13__anonymous15_1; 737 _Thread_local signed int _X3x37i_1; 738 __thread signed int _X3x38i_1; 737 739 static inline volatile const signed int _X3f11Fi___1(); 738 740 static inline volatile const signed int _X3f12Fi___1(); -
tests/.expect/declarationSpecifier.x64.txt
rebf8ca5 r23a08aa0 735 735 } 736 736 static volatile const struct __anonymous15 _X3x36KVS13__anonymous15_1; 737 _Thread_local signed int _X3x37i_1; 738 __thread signed int _X3x38i_1; 737 739 static inline volatile const signed int _X3f11Fi___1(); 738 740 static inline volatile const signed int _X3f12Fi___1(); -
tests/.expect/declarationSpecifier.x86.txt
rebf8ca5 r23a08aa0 735 735 } 736 736 static volatile const struct __anonymous15 _X3x36KVS13__anonymous15_1; 737 _Thread_local signed int _X3x37i_1; 738 __thread signed int _X3x38i_1; 737 739 static inline volatile const signed int _X3f11Fi___1(); 738 740 static inline volatile const signed int _X3f12Fi___1(); -
tests/Makefile.am
rebf8ca5 r23a08aa0 54 54 55 55 # adjust CC to current flags 56 CC = LC_ALL=C $(if $(DISTCC_CFA_PATH),distcc $(DISTCC_CFA_PATH) ${ARCH_FLAGS} ${AST_FLAGS},$(TARGET_CFA) ${DEBUG_FLAGS} ${ARCH_FLAGS} ${AST_FLAGS}) 56 CC = LC_ALL=C $(if $(DISTCC_CFA_PATH),distcc $(DISTCC_CFA_PATH) ${ARCH_FLAGS} ,$(TARGET_CFA) ${DEBUG_FLAGS} ${ARCH_FLAGS}) 57 57 CFACC = $(CC) 58 58 … … 61 61 62 62 # adjusted CC but without the actual distcc call 63 CFACCLOCAL = $(if $(DISTCC_CFA_PATH),$(DISTCC_CFA_PATH) ${ARCH_FLAGS} ${AST_FLAGS},$(TARGET_CFA) ${DEBUG_FLAGS} ${ARCH_FLAGS} ${AST_FLAGS}) 63 CFACCLOCAL = $(if $(DISTCC_CFA_PATH),$(DISTCC_CFA_PATH) ${ARCH_FLAGS} ,$(TARGET_CFA) ${DEBUG_FLAGS} ${ARCH_FLAGS}) 64 64 CFACCLINK = $(CFACCLOCAL) -quiet $(if $(test), 2> $(test), ) $($(shell echo "${@}_FLAGSLD" | sed 's/-\|\//_/g')) 65 65 -
tests/concurrent/clib.c
rebf8ca5 r23a08aa0 8 8 } 9 9 10 thread_local struct drand48_data buffer = { 0 }; 10 _Thread_local struct drand48_data buffer = { 0 }; 11 11 int myrand() { 12 12 long int result; -
tests/concurrent/clib_tls.c
rebf8ca5 r23a08aa0 14 14 15 15 16 thread_local int checkval = 0xBAADF00D; 16 __thread int checkval = 0xBAADF00D; 17 17 18 18 void init(void * ) { -
tests/concurrent/park/contention.cfa
rebf8ca5 r23a08aa0 2 2 #include <thread.hfa> 3 3 4 thread_local drand48_data buffer = { 0 }; 4 __thread drand48_data buffer = { 0 }; 5 5 int myrand() { 6 6 long int result; -
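The three test changes above drop the thread_local spelling in favour of the C11 keyword _Thread_local or the GNU extension __thread. A minimal sketch of the two spellings, assuming a GCC/Clang toolchain (compile with -pthread):

#include <pthread.h>
#include <stdio.h>

/* C11 spelling: the keyword itself, no <threads.h> macro needed. */
_Thread_local int c11_counter = 0;

/* GNU spelling: GCC/Clang extension with the same per-thread storage. */
__thread int gnu_counter = 0;

static void * worker(void * arg) {
	int id = *(int *)arg;
	/* Each thread operates on its own copy of both variables. */
	c11_counter += 1;
	gnu_counter += 2;
	printf("worker %d: c11=%d gnu=%d\n", id, c11_counter, gnu_counter);
	return NULL;
}

int main(void) {
	pthread_t t1, t2;
	int id1 = 1, id2 = 2;
	pthread_create(&t1, NULL, worker, &id1);
	pthread_create(&t2, NULL, worker, &id2);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	/* The main thread's copies were never touched by the workers. */
	printf("main: c11=%d gnu=%d\n", c11_counter, gnu_counter);
	return 0;
}

Both spellings give identical per-thread storage here; __thread simply avoids relying on the <threads.h> thread_local macro.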
tests/config.py.in
rebf8ca5 r23a08aa0 9 9 HOSTARCH = "@host_cpu@" 10 10 DISTRIBUTE = @HAS_DISTCC@ 11 NEWAST = @DEFAULT_NEW_AST@ -
tests/declarationSpecifier.cfa
rebf8ca5 r23a08aa0 1 // 1 // 2 2 // Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo 3 3 // 4 4 // The contents of this file are covered under the licence agreement in the 5 5 // file "LICENCE" distributed with Cforall. 6 // 7 // declarationSpecifier.cfa -- 8 // 6 // 7 // declarationSpecifier.cfa -- 8 // 9 9 // Author : Peter A. Buhr 10 10 // Created On : Wed Aug 17 08:21:04 2016 … … 12 12 // Last Modified On : Tue Apr 30 18:20:36 2019 13 13 // Update Count : 4 14 // 14 // 15 15 16 16 typedef short int Int; … … 51 51 struct { Int i; } const static volatile x35; 52 52 struct { Int i; } const volatile static x36; 53 54 _Thread_local int x37; 55 __thread int x38; 53 56 54 57 static inline const volatile int f11(); -
tests/io/comp_fair.cfa
rebf8ca5 r23a08aa0 27 27 28 28 struct { 29 30 29 barrier & bar; 30 int pipe[2]; 31 31 32 32 } globals; … … 65 65 thread Reader {}; 66 66 void main(Reader & this) { 67 bool do_read = has_user_level_blocking( (fptr_t)async_read ); 67 char thrash[1]; 68 bool do_read = has_user_level_blocking( (fptr_t)async_read ); 68 69 69 for(TIMES) { 70 io_future_t f; 71 if ( do_read ) { 72 char thrash[1]; 73 async_read(f, globals.pipe[0], thrash, 1, 0); 74 } else { 75 fulfil(f, 0); // If we don't have user-level blocking just play along 76 } 70 for(TIMES) { 71 io_future_t f; 72 if ( do_read ) { 73 async_read(f, globals.pipe[0], thrash, 1, 0); 74 } else { 75 fulfil(f, 0); // If we don't have user-level blocking just play along 76 } 77 77 78 78 block( globals.bar ); 79 79 80 80 yield( prng( this, 15 ) ); 81 81 82 82 unsigned i = __atomic_add_fetch( &counter, 1, __ATOMIC_SEQ_CST ); 83 83 if(0 == (i % 100)) sout | i; 84 84 85 85 wait( f ); 86 86 87 88 87 if(f.result < 0) 88 abort | "Read error" | -f.result | ":" | strerror(-f.result); 89 89 90 91 90 block( globals.bar ); 91 } 92 92 } 93 93 … … 97 97 thread Writer {}; 98 98 void main(Writer & this) { 99 100 99 for(TIMES) { 100 block( globals.bar ); 101 101 102 102 sleep( 1`us ); 103 103 104 105 106 107 104 char buf[1] = { '+' }; 105 int ret = write( globals.pipe[1], buf, 1 ); 106 if(ret < 0) 107 abort | "Write error" | errno | ":" | strerror(errno); 108 108 109 110 109 block( globals.bar ); 110 } 111 111 } 112 112 … … 122 122 123 123 int main() { 124 125 126 127 128 124 barrier bar = { 2 }; 125 &globals.bar = &bar; 126 int ret = pipe(globals.pipe); 127 if(ret != 0) 128 abort | "Pipe error" | errno | ":" | strerror(errno); 129 129 130 130 processor p; … … 134 134 Spinner s; 135 135 Reader ior; 136 136 Writer iow; 137 137 } 138 138 sout | "done"; -
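The comp_fair.cfa hunk hoists char thrash[1] out of the if ( do_read ) block to the top of the reader's main, so the buffer handed to async_read is still in scope when the future is waited on later in the loop. The same lifetime rule applies to any asynchronous read; the sketch below illustrates it with POSIX AIO, which is only an analogous interface (not the Cforall io_future_t API), the file path is just a placeholder, and older glibc may need -lrt when linking.

#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(int argc, char * argv[]) {
	/* Any readable file works; the default path is only for illustration. */
	const char * path = argc > 1 ? argv[1] : "/etc/hostname";
	int fd = open(path, O_RDONLY);
	if (fd < 0) { perror("open"); return 1; }

	/* Buffer and control block are declared in the scope that also waits,
	   so they stay valid for the whole lifetime of the operation. */
	char buf[64];
	struct aiocb cb;
	memset(&cb, 0, sizeof cb);
	cb.aio_fildes = fd;
	cb.aio_buf    = buf;
	cb.aio_nbytes = sizeof buf;
	cb.aio_offset = 0;

	if (aio_read(&cb) != 0) { perror("aio_read"); return 1; }

	/* Wait for completion before buf can safely go out of scope. */
	const struct aiocb * pending[1] = { &cb };
	while (aio_error(&cb) == EINPROGRESS)
		aio_suspend(pending, 1, NULL);

	ssize_t got = aio_return(&cb);
	if (got < 0) { perror("aio_return"); return 1; }
	printf("read %zd bytes from %s\n", got, path);

	close(fd);
	return 0;
}

As in the updated test, the buffer lives in the same scope as the wait, never in a narrower block that could end while the read is still outstanding.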
tests/meta/.expect/arch.arm64.txt
rebf8ca5 r23a08aa0 1 meta/archVast.cfa:28:1 error: Cannot choose between 3 alternatives for expression 1 meta/arch.cfa:28:1 error: Cannot choose between 3 alternatives for expression 2 2 Explicit Cast of: 3 3 Name: FA64 -
tests/meta/.expect/arch.x64.txt
rebf8ca5 r23a08aa0 1 meta/archVast.cfa:28:1 error: Cannot choose between 3 alternatives for expression 1 meta/arch.cfa:28:1 error: Cannot choose between 3 alternatives for expression 2 2 Explicit Cast of: 3 3 Name: FX64 -
tests/meta/.expect/arch.x86.txt
rebf8ca5 r23a08aa0 1 meta/archVast.cfa:28:1 error: Cannot choose between 3 alternatives for expression 1 meta/arch.cfa:28:1 error: Cannot choose between 3 alternatives for expression 2 2 Explicit Cast of: 3 3 Name: FX86 -
tests/meta/arch.cfa
rebf8ca5 r23a08aa0 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // archVast.cfa -- Check if all combinations are of ast/arch are properly distinguished 7 // arch.cfa -- Check if all architectures are properly distinguished by the test suite 8 8 // 9 9 // Author : Thierry Delisle -
tests/pybin/settings.py
rebf8ca5 r23a08aa0 97 97 self.path = "debug" if value else "nodebug" 98 98 99 class AST:100 def __init__(self, ast):101 if ast == "new":102 self.target = ast103 self.string = "New AST"104 self.flags = """AST_FLAGS=-XCFA,--new-ast"""105 elif ast == "old":106 self.target = ast107 self.string = "Old AST"108 self.flags = """AST_FLAGS=-XCFA,--old-ast"""109 elif ast == None:110 self.target = "new" if config.NEWAST else "old"111 self.string = "Default AST (%s)" % self.target112 self.flags = """AST_FLAGS="""113 else:114 print("""ERROR: Invalid ast configuration, must be "old", "new" or left unspecified, was %s""" % (value), file=sys.stderr)115 sys.exit(1)116 117 def filter(self, tests):118 119 return [test for test in tests if not test.astv or self.target == test.astv]120 121 99 class Install: 122 100 def __init__(self, value): … … 141 119 142 120 def init( options ): 143 global all_ast144 121 global all_arch 145 122 global all_debug 146 123 global all_install 147 global ast148 124 global arch 149 125 global debug … … 160 136 global timeout2gdb 161 137 162 all_ast = [AST(o) for o in list(dict.fromkeys(options.ast ))] if options.ast else [AST(None)]163 138 all_arch = [Architecture(o) for o in list(dict.fromkeys(options.arch ))] if options.arch else [Architecture(None)] 164 139 all_debug = [Debug(o) for o in list(dict.fromkeys(options.debug ))] -
tests/pybin/test_run.py
rebf8ca5 r23a08aa0 11 11 self.path = '' 12 12 self.arch = '' 13 self.astv = '' 14 13 15 14 def toString(self): 16 return "{:25s} ({:5s} arch , {:s} ast: {:s})".format( self.name, self.arch if self.arch else "Any", self.astv if self.astv else "Any", self.target() ) 15 return "{:25s} ({:5s} arch: {:s})".format( self.name, self.arch if self.arch else "Any", self.target() ) 17 16 18 17 def prepare(self): … … 22 21 def expect(self): 23 22 arch = '' if not self.arch else ".%s" % self.arch 24 astv = '' if not self.astv else ".nast" if self.astv == "new" else ".oast" 25 return os.path.normpath( os.path.join(settings.SRCDIR , self.path, ".expect", "%s%s%s.txt" % (self.name,astv,arch)) ) 23 return os.path.normpath( os.path.join(settings.SRCDIR , self.path, ".expect", "%s%s.txt" % (self.name,arch)) ) 26 24 27 25 def error_log(self): … … 58 56 59 57 @staticmethod 60 def new_target(target, arch , astv): 58 def new_target(target, arch): 61 59 test = Test() 62 60 test.name = os.path.basename(target) 63 61 test.path = os.path.relpath (os.path.dirname(target), settings.SRCDIR) 64 62 test.arch = arch.target if arch else '' 65 test.astv = astv.target if astv else '' 66 63 return test 67 64 -
tests/pybin/tools.py
rebf8ca5 r23a08aa0 182 182 '-s' if silent else None, 183 183 test_param, 184 settings.ast.flags,185 184 settings.arch.flags, 186 185 settings.debug.flags, -
tests/quotedKeyword.cfa
rebf8ca5 r23a08aa0 31 31 ``__int128, ``__label__, ``long, ``lvalue, ``_Noreturn, ``__builtin_offsetof, ``otype, ``register, ``restrict, 32 32 ``__restrict, ``__restrict__, ``return, ``short, ``signed, ``__signed, ``__signed__, ``sizeof, ``static, 33 ``_Static_assert, ``struct, ``switch, ``_Thread_local, ``throw, ``throwResume, ``trait, ``try, ``typedef, 33 ``_Static_assert, ``struct, ``switch, ``_thread, ``_Thread_local, ``throw, ``throwResume, ``trait, ``try, ``typedef, 34 34 ``typeof, ``__typeof, ``__typeof__, ``union, ``unsigned, ``__builtin_va_list, ``void, ``volatile, ``__volatile, 35 35 ``__volatile__, ``while; -
tests/test.py
rebf8ca5 r23a08aa0 23 23 24 24 def match_test(path): 25 match = re.search("^%s\/([\w\/\-_]*).expect\/([\w\-_]+)(\. nast|\.oast)?(\.[\w\-_]+)?\.txt$" % settings.SRCDIR, path)25 match = re.search("^%s\/([\w\/\-_]*).expect\/([\w\-_]+)(\.[\w\-_]+)?\.txt$" % settings.SRCDIR, path) 26 26 if match : 27 27 test = Test() 28 28 test.name = match.group(2) 29 29 test.path = match.group(1) 30 test.arch = match.group(4)[1:] if match.group(4) else None 31 32 astv = match.group(3)[1:] if match.group(3) else None 33 if astv == 'oast': 34 test.astv = 'old' 35 elif astv == 'nast': 36 test.astv = 'new' 37 elif astv: 38 print('ERROR: "%s", expect file has astv but it is not "nast" or "oast"' % testname, file=sys.stderr) 39 sys.exit(1) 30 test.arch = match.group(3)[1:] if match.group(3) else None 40 31 41 32 expected.append(test) … … 81 72 # this is a valid name, let's check if it already exists 82 73 found = [test for test in all_tests if canonical_path( test.target() ) == testname] 83 setup = itertools.product(settings.all_arch if options.arch else [None] , settings.all_ast if options.ast else [None])74 setup = itertools.product(settings.all_arch if options.arch else [None]) 84 75 if not found: 85 # it's a new name, create it according to the name and specified architecture /ast version86 tests.extend( [Test.new_target(testname, arch , ast) for arch, astin setup] )76 # it's a new name, create it according to the name and specified architecture 77 tests.extend( [Test.new_target(testname, arch) for arch in setup] ) 87 78 elif len(found) == 1 and not found[0].arch: 88 79 # we found a single test, the user better be wanting to create a cross platform test 89 80 if options.arch: 90 81 print('ERROR: "%s", test has no specified architecture but --arch was specified, ignoring it' % testname, file=sys.stderr) 91 elif options.ast:92 print('ERROR: "%s", test has no specified ast version but --ast was specified, ignoring it' % testname, file=sys.stderr)93 82 else: 94 83 tests.append( found[0] ) 95 84 else: 96 85 # this test is already cross platform, just add a test for each platform the user asked 97 tests.extend( [Test.new_target(testname, arch , ast) for arch, astin setup] )86 tests.extend( [Test.new_target(testname, arch) for arch in setup] ) 98 87 99 88 # print a warning if it users didn't ask for a specific architecture … … 102 91 print('WARNING: "%s", test has architecture specific expected files but --arch was not specified, regenerating only for current host' % testname, file=sys.stderr) 103 92 104 105 # print a warning if it users didn't ask for a specific ast version106 found_astv = [f.astv for f in found if f.astv]107 if found_astv and not options.ast:108 print('WARNING: "%s", test has ast version specific expected files but --ast was not specified, regenerating only for current ast' % testname, file=sys.stderr)109 110 93 else : 111 94 print('ERROR: "%s", tests are not allowed to end with a C/C++/CFA extension, ignoring it' % testname, file=sys.stderr) … … 127 110 # create a parser with the arguments for the tests script 128 111 parser = argparse.ArgumentParser(description='Script which runs cforall tests') 129 parser.add_argument('--ast', help='Test for specific ast', type=comma_separated(str), default=None)130 112 parser.add_argument('--arch', help='Test for specific architecture', type=comma_separated(str), default=None) 131 113 parser.add_argument('--debug', help='Run all tests in debug or release', type=comma_separated(yes_no), default='yes') … … 351 333 352 334 # print the possible options 353 print("-h 
--help --debug --dry-run --list --a st=new --ast=old --arch --all --regenerate-expected --archive-errors --install --timeout --global-timeout --timeout-with-gdb -j --jobs -I --include -E --exclude --continue ", end='')335 print("-h --help --debug --dry-run --list --arch --all --regenerate-expected --archive-errors --install --timeout --global-timeout --timeout-with-gdb -j --jobs -I --include -E --exclude --continue ", end='') 354 336 print(" ".join(map(lambda t: "%s" % (t.target()), tests))) 355 337 … … 422 404 # for each build configurations, run the test 423 405 with Timed() as total_dur: 424 for ast, arch, debug, install in itertools.product(settings.all_ast, settings.all_arch, settings.all_debug, settings.all_install): 425 settings.ast = ast 406 for arch, debug, install in itertools.product(settings.all_arch, settings.all_debug, settings.all_install): 426 407 settings.arch = arch 427 408 settings.debug = debug … … 430 411 # filter out the tests for a different architecture 431 412 # tests are the same across debug/install 432 local_tests = settings.ast.filter( tests ) 433 local_tests = settings.arch.filter( local_tests ) 413 local_tests = settings.arch.filter( tests ) 434 414 435 415 # check the build configuration works … … 438 418 439 419 # print configuration 440 print('%s %i tests on %i cores (%s :%s- %s)' % (420 print('%s %i tests on %i cores (%s - %s)' % ( 441 421 'Regenerating' if settings.generating else 'Running', 442 422 len(local_tests), 443 423 jobs, 444 settings.ast.string,445 424 settings.arch.string, 446 425 settings.debug.string