Changeset 23a08aa0
- Timestamp: Sep 19, 2022, 8:11:02 PM (3 years ago)
- Branches: ADT, ast-experimental, master, pthread-emulation
- Children: aa9f215
- Parents: ebf8ca5 (diff), ae1d151 (diff)

Note: this is a merge changeset; the changes displayed below correspond to the merge itself. Use the (diff) links above to see all the changes relative to each parent.

- Files:
  - 3 added
  - 47 deleted
  - 95 edited
  - 36 moved

Jenkins/FullBuild
      parallel (
-         // gcc_08_x86_new: { trigger_build( 'gcc-8', 'x86' ) },
-         // gcc_07_x86_new: { trigger_build( 'gcc-7', 'x86' ) },
-         // gcc_06_x86_new: { trigger_build( 'gcc-6', 'x86' ) },
+         gcc_08_x86_new: { trigger_build( 'gcc-10', 'x86' ) },
+         gcc_07_x86_new: { trigger_build( 'gcc-9', 'x86' ) },
          gcc_10_x64_new: { trigger_build( 'gcc-10', 'x64' ) },
          gcc_09_x64_new: { trigger_build( 'gcc-9', 'x64' ) },

Jenkinsfile
      sh 'ulimit -a'

-     Tools.BuildStage('Test: short', !Settings.RunAllTests) {
+     jopt = '-j $(nproc)'
+
+     Tools.BuildStage('Test: Debug', true) {
          dir (BuildDir) {
              //Run the tests from the tests directory
-             sh "make --no-print-directory -C tests archiveerrors=${BuildDir}/tests/crashes/short"
+             sh """make ${jopt} --no-print-directory -C tests timeouts="--timeout=600 --global-timeout=14400" all-tests debug=yes archiveerrors=${BuildDir}/tests/crashes/full-debug"""
          }
      }

-     Tools.BuildStage('Test: full', Settings.RunAllTests) {
+     Tools.BuildStage('Test: Release', Settings.RunAllTests) {
          dir (BuildDir) {
-             jopt = '-j $(nproc)'
-             if( Settings.Architecture.node == 'x86' ) {
-                 jopt = '-j2'
-             }
-             //Run the tests from the tests directory
-             sh """make ${jopt} --no-print-directory -C tests timeouts="--timeout=600 --global-timeout=14400" all-tests debug=yes archiveerrors=${BuildDir}/tests/crashes/full-debug"""
-             sh """make ${jopt} --no-print-directory -C tests timeouts="--timeout=600 --global-timeout=14400" all-tests debug=no archiveerrors=${BuildDir}/tests/crashes/full-nodebug"""
+             //Run the tests from the tests directory
+             sh """make ${jopt} --no-print-directory -C tests timeouts="--timeout=600 --global-timeout=14400" all-tests debug=no archiveerrors=${BuildDir}/tests/crashes/full-nodebug"""
          }
      }
…
          ], \
          [$class: 'BooleanParameterDefinition', \
-             description: 'If false, only the quick test suite is ran', \
+             description: 'If false, the test suite is only ran in debug', \
              name: 'RunAllTests', \
              defaultValue: false, \

benchmark/basic/tls_fetch_add.c
  // thread_local Boolean. This means the entire protocol is just to "mov" instructions making it extremely cheap.

- #define thread_local _Thread_local
-
- thread_local volatile bool value;
+ __thread volatile bool value;

  void __attribute__((noinline)) do_call() {

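For reference, the two spellings are equivalent ways of declaring thread-local storage with GCC/Clang: _Thread_local is the C11 keyword the old macro wrapped, and __thread is the GNU extension the benchmark now uses directly. The following is a minimal standalone C sketch, not part of the changeset; the main function and the body of do_call are assumptions for illustration only.

    #include <stdbool.h>
    #include <stdio.h>

    // C11 spelling (what the removed macro expanded to):
    //     _Thread_local volatile bool value;
    // GNU extension spelling (what the changeset switches to):
    __thread volatile bool value;

    // A store to thread-local storage compiles to a plain mov on x86-64,
    // which is the point the benchmark comment above is making.
    void __attribute__((noinline)) do_call() {
        value = true;
    }

    int main() {
        do_call();
        printf("value = %d\n", value);
        return 0;
    }
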
benchmark/io/http/worker.cfa
      if( options.log ) mutex(sout) sout | "=== Accepting connection ===";
      int fd = cfa_accept4( this.sockfd, this.[addr, addrlen, flags], CFA_IO_LAZY );
-     if(fd < 0) {
+     if(fd <= 0) {
          if( errno == ECONNABORTED ) break;
          if( this.done && (errno == EINVAL || errno == EBADF) ) break;
-         abort( "accept error : (%d) %s\n", (int)errno, strerror(errno) );
+         abort( "accept error %d: (%d) %s\n", fd, (int)errno, strerror(errno) );
      }
      if(this.done) break;

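For comparison, the conventional POSIX accept loop uses the same retry-on-ECONNABORTED structure. The sketch below is illustrative only: it uses plain accept(2), which returns -1 on error, rather than the cfa_accept4 wrapper above whose zero return motivates the tightened fd <= 0 check; the accept_loop name and the done flag are assumptions.

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>

    // Illustrative sketch of a classic blocking accept loop.
    static int accept_loop(int sockfd, volatile int * done) {
        while (!*done) {
            int fd = accept(sockfd, NULL, NULL);
            if (fd < 0) {                                   // POSIX accept signals errors with -1
                if (errno == ECONNABORTED) continue;        // aborted connection: just retry
                if (*done && (errno == EINVAL || errno == EBADF)) break; // socket closed on shutdown
                fprintf(stderr, "accept error: (%d) %s\n", errno, strerror(errno));
                return -1;
            }
            // ... hand fd off to a worker here ...
        }
        return 0;
    }
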
configure.ac
  #Trasforming cc1 will break compilation
  M4CFA_PROGRAM_NAME
-
- #==============================================================================
- # New AST toggling support
- AH_TEMPLATE([CFA_USE_NEW_AST],[Sets whether or not to use the new-ast, this is a default value and can be overrided by --old-ast and --new-ast])
- DEFAULT_NEW_AST="True"
- AC_ARG_ENABLE(new-ast,
-     [  --enable-new-ast   whether or not to use new ast as the default AST algorithm],
-     [case "${enableval}" in
-         yes) newast=true ; DEFAULT_NEW_AST="True" ;;
-         no)  newast=false; DEFAULT_NEW_AST="False" ;;
-         *) AC_MSG_ERROR([bad value ${enableval} for --enable-new-ast]) ;;
-     esac],[newast=true])
- AC_DEFINE_UNQUOTED([CFA_USE_NEW_AST], $newast)
- AC_SUBST(DEFAULT_NEW_AST)

  #==============================================================================
…
  \'--enable-gprofiler=*) ;;
  \'--disable-gprofiler) ;;
-
- # skip the target hosts
- \'--enable-new-ast=*) ;;
- \'--disable-new-ast) ;;

  # skip this, it only causes problems

doc/LaTeXmacros/lstlang.sty
  inline, __inline, __inline__, __int128, int128, __label__, monitor, mutex, _Noreturn, one_t, or,
  otype, restrict, __restrict, __restrict__, recover, report, __signed, __signed__, _Static_assert, suspend,
- thread, _Thread_local, throw, throwResume, timeout, trait, try, ttype, typeof, __typeof, __typeof__,
+ thread, __thread, _Thread_local, throw, throwResume, timeout, trait, try, ttype, typeof, __typeof, __typeof__,
  virtual, __volatile, __volatile__, waitfor, when, with, zero_t,
  },

doc/bibliography/pl.bib
      series = {Innovative Technology},
      year = 1991,
+ }
+
+ @mastersthesis{Zulfiqar22,
+     keywords    = {Cforall, memory allocation, threading},
+     contributer = {pabuhr@plg},
+     author      = {Mubeen Zulfiqar},
+     title       = {High-Performance Concurrent Memory Allocation},
+     school      = {School of Computer Science, University of Waterloo},
+     year        = 2022,
+     address     = {Waterloo, Ontario, Canada, N2L 3G1},
+     note        = {\href{https://uwspace.uwaterloo.ca/handle/10012/18329}{https://\-uwspace.uwaterloo.ca/\-handle/\-10012/18329}},
  }

doc/proposals/iterators.md
  returns a range object, which can be used as any other type.

+ It might not cover every single case with the same syntax (the `@` syntax may
+ not translate to operators very well), but should be able to maintain every
+ option with some library range.
+
  Library Enhancements
  --------------------
…
  ------------
  Python has a robust iterator tool set. It also has a `range` built-in which
- does many of the same things as the special for loops.
+ does many of the same things as the special for loops (the finite and
+ half-open ranges).
+
+ In addition, it has many dedicated iterator constructors and transformers,
+ and many containers can both produce and be constructed from iterators.

  + https://docs.python.org/3/reference/datamodel.html#object.__iter__
  + https://docs.python.org/3/library/functions.html#func-range

- C++ has many iterator tools at well, except for the fact it's `iterators` are
+ C++ has many iterator tools at well, except for the fact it's "iterators" are
  not what are usually called iterators (as above) but rather an abstraction of
- pointers.
+ pointers. The notable missing feature is that a single iterator has no
+ concept of being empty or not, instead it must be compared to the end
+ iterator.
+
+ However, C++ ranges have an interface much more similar to iterators.
+ They do appear to be a wrapper around the "pointer" iterators.
+
+ + https://en.cppreference.com/w/cpp/ranges

  Rust also has a imperative implementation of a functional style of iterators,

doc/theses/thierry_delisle_PhD/.gitignore
  thesis/fig/*.fig.bak
  thesis/thesis.pdf
+ thesis/thesis.tty
  thesis/thesis.ps

doc/theses/thierry_delisle_PhD/thesis/Makefile
  LaTeX = TEXINPUTS=${TeXLIB} && export TEXINPUTS && latex -halt-on-error -output-directory=${Build}
  BibTeX = BIBINPUTS=${TeXLIB} && export BIBINPUTS && bibtex
+ DeTeX = TEXINPUTS=${TeXLIB} && export TEXINPUTS && detex -r

  MAKEFLAGS = --no-print-directory # --silent
…
  	${LaTeX} $<

+ %.tty: build/%.dvi
+ 	dvi2tty -w132 $< > $@
+
  ## Define the default recipes.

…
  churn_jax_ops_FLAGS = --MaxY=50000000
  churn_low_jax_ops_FLAGS = --MaxY=50000000
- churn_jax_ns_FLAGS = --MaxY=20000
- churn_low_jax_ns_FLAGS = --MaxY=20000
+ churn_jax_ns_FLAGS = --MaxY=10000
+ churn_low_jax_ns_FLAGS = --MaxY=10000

  churn_nasus_ops_FLAGS = --MaxY=75000000
  churn_low_nasus_ops_FLAGS = --MaxY=75000000
- churn_nasus_ns_FLAGS = --MaxY=20000
- churn_low_nasus_ns_FLAGS = --MaxY=20000
+ churn_nasus_ns_FLAGS = --MaxY=5000
+ churn_low_nasus_ns_FLAGS = --MaxY=5000
+
+ locality_share_jax_ops_FLAGS = --MaxY=40000000
+ locality_noshare_jax_ops_FLAGS = --MaxY=40000000
+ locality_share_jax_ns_FLAGS = --MaxY=10000
+ locality_noshare_jax_ns_FLAGS = --MaxY=10000
+
+ locality_share_nasus_ops_FLAGS = --MaxY=60000000
+ locality_noshare_nasus_ops_FLAGS = --MaxY=60000000
+ locality_share_nasus_ns_FLAGS = --MaxY=10000
+ locality_noshare_nasus_ns_FLAGS = --MaxY=10000

  build/result.%.ns.svg : data/% Makefile ../../../../benchmark/plot.py | ${Build}

doc/theses/thierry_delisle_PhD/thesis/glossary.tex
  % Definitions

- \longnewglossaryentry{thrd}
- {name={thread}}
+ \longnewglossaryentry{at}
+ {name={Thread},text={thread}}
  {
- Threads created and managed inside user-space. Each thread has its own stack and its own thread of execution. User-level threads are invisible to the underlying operating system.
+ A thread is an independent sequential execution path through a program. Each thread is scheduled for execution separately and independently from other threads. Systems offer one or more concrete implementations of this concept, \eg \gls{kthrd}, \gls{job}, task. However, most of the concepts of scheduling are independent of the particular implementations of the thread representation. For this reason, this document uses the term \gls{at} to mean any of these representation that meets the general definition.

- \textit{Synonyms : User threads, Lightweight threads, Green threads, Virtual threads, Tasks.}
+ \textit{Synonyms : Tasks, Jobs, Blocks.}
  }

  \longnewglossaryentry{proc}
- {name={processor}}
+ {name={Processor},text={processor}}
  {
+ Entity that executes a \gls{at}, \ie the resource being scheduled by the scheduler. In kernel-level threading, \ats are kernel threads and \procs are the \glspl{hthrd} on which the kernel threads are scheduled. In user-level threading and thread pools, \procs are kernel threads.

+ \textit{Synonyms : Server, Worker.}
  }

  \longnewglossaryentry{rQ}
- {name={ready-queue}}
+ {name={Ready Queue}, text={ready-queue}}
  {
  Data structure holding \ats that are ready to \glslink{atrun}{run}. Often a \glsxtrshort{fifo} queue for fairness, but can take many different forms, \eg binary tree and priority queue are also common.
  }

  \longnewglossaryentry{uthrding}
- {name={user-level threading}}
+ {name={User-Level Threading},text={user-level threading}}
  {
  Threading model where a scheduler runs in users space and maps threads managed and created inside the user-space onto \glspl{kthrd}.

  \textit{Synonyms : User threads, Lightweight threads, Green threads, Virtual threads, Tasks.}
…
  \longnewglossaryentry{rmr}
- {name={remote memory reference}}
+ {name={Remote Memory Reference},text={remote memory reference}}
  {
  A memory reference to an address not in the current \gls{hthrd}'s cache is a remote reference. Memory references that \emph{are} in the current \gls{hthrd}'s cache is a \newterm{local} memory reference. For example, a cache line that must be updated from the any cache on another socket, or from RAM in a \glsxtrshort{numa} context.
  }
…
  \longnewglossaryentry{hthrd}
- {name={hardware thread}}
+ {name={Hardware Threading},text={hardware thread}}
  {
- Threads representing the underlying hardware directly, \eg the CPU core, or hyper-thread if the hardware supports multiple threads of execution per core. The number of hardware threads is considered to be always fixed to a specific number determined by the hardware.
+ Threads representing the underlying hardware, \eg a CPU core or hyper-thread, if the hardware supports multiple threads of execution per core. The number of hardware threads present is fixed on any given computer.

- \textit{Synonyms : }
+ \textit{Synonyms : Core, Hyper-Thread, Processing Unit, CPU.}
  }

  \longnewglossaryentry{kthrd}
- {name={kernel-level thread}}
+ {name={Kernel-Level Thread},text={kernel-level thread}}
  {
- Threads created and managed inside kernel-space. Each thread has its own stack and its own thread of execution. Kernel-level threads are owned, managed and scheduled by the underlying operating system.
+ Threads created and managed inside kernel space. Each kernel thread has its own stack and its own thread of execution. Kernel-level threads are owned, managed and scheduled by the underlying operating system.

  \textit{Synonyms : OS threads, Hardware threads, Physical threads.}
…
  \longnewglossaryentry{fiber}
- {name={fiber}}
+ {name={Fiber},text={fiber}}
  {
- Fibers are non-preemptive user-level threads. They share most of the caracteristics of user-level threads except that they cannot be preempted by another fiber.
+ Fibers are non-preemptive user-level threads. They share most of the characteristics of user-level threads except that they cannot be preempted by another fiber.

  \textit{Synonyms : Tasks.}
…
  \longnewglossaryentry{job}
- {name={job}}
+ {name={Job},text={job}}
  {
  Unit of work, often sent to a thread pool or worker pool to be executed. Has neither its own stack nor its own thread of execution.
…
  \longnewglossaryentry{pool}
- {name={thread-pool}}
+ {name={Thread Pool},text={thread-pool}}
  {
- Group of homogeneuous threads that loop executing units of works after another.
+ Group of homogeneous threads that loop executing units of works. Often executing \glspl{jobs}.

- \textit{Synonyms : }
+ \textit{Synonyms : Executor.}
  }

  \longnewglossaryentry{preemption}
- {name={preemption}}
+ {name={Preemption},text={preemption}}
  {
  Involuntary context switch imposed on threads at a given rate.
…
  }

- \longnewglossaryentry{at}
- {name={task}}
- {
- Abstract object representing an unit of work. Systems will offer one or more concrete implementations of this concept (\eg \gls{kthrd}, \gls{job}), however, most of the concept of schedulings are independent of the particular implementations of the work representation. For this reason, this document use the term \Gls{at} to mean any representation and not one in particular.
- }
-
  \longnewglossaryentry{atsched}
  {name={Scheduling a \gls{at}}}
  {
- Scheduling an \gls{at} refers to the act of notifying the scheduler that a task is ready to be ran. When representing the scheduler as a queue of tasks, scheduling is the act of pushing a task onto the end of the queue. This doesn't necesserily means the task will ever be allocated CPU time (\gls{atrun}), for example, if the system terminates abruptly, scheduled \glspl{at} will probably never run.
+ Scheduling a \at refers to notifying the scheduler that a \at is ready to run. When representing the scheduler as a queue of \ats, scheduling is the act of pushing a \at onto the end of the queue. This operation does not necessarily mean the \at is guaranteed CPU time (\gls{atrun}), \eg if the program terminates abruptly, scheduled \glspl{at} never run.

- \textit{Synonyms : None.}
+ \textit{Synonyms : Unparking.}
  }
…
  {name={Running a \gls{at}}}
  {
- Running an \gls{at} refers to the act of allocating CPU time to a task that is ready to run. When representing the scheduler as a queue of tasks, running is the act of poping a task from the front of the queue and putting it onto a \gls{proc}. The \gls{at} can than accomplish some or all of the work it is programmed to do.
+ Running a \at refers to allocating CPU time to a \at that is ready to run. When representing the scheduler as a queue of \ats, running is the act of popping a \at from the front of the queue and putting it onto a \gls{proc}. The \gls{at} can then accomplish some or all of the work it is programmed to do.

  \textit{Synonyms : None.}
…
  \longnewglossaryentry{atmig}
- {name={migration of \gls{at}}}
+ {name={\Glspl{at} Migration}}
  {
- Migration refers to the idea of an \gls{at} running on a different worker/processor than the last time it was run. It is generally preferable to minimise migration as it incurs cost but any load balancing among workers requires some amount of migration.
+ Migration refers to the idea of an \gls{at} running on a different \proc than the last time it was run. It is generally preferable to minimize migration as it incurs cost but any load balancing among \proc requires some amount of migration.

  \textit{Synonyms : None.}
…
  \longnewglossaryentry{atpass}
- {name={overtaking \gls{at}}}
+ {name={Overtaking \gls{at}}}
  {
  When representing the scheduler as a queue of \glspl{at}, overtaking is the act breaking the FIFO-ness of the queue by moving a \gls{at} in front of some other \gls{at} when it arrived after. This remains true for schedulers that do not use a FIFO queue, when the order in which the \glspl{at} are \glslink{atsched}{scheduled} and \glslink{atrun}{run} in a different order. A \gls{at} is said to \emph{overtake} another if it is run \emph{before} but was \emph{scheduled} after the other \gls{at}.
…
  \longnewglossaryentry{atblock}
- {name={Blocking an \gls{at}}}
+ {name={\Gls{at} Blocking}}
  {
- Blocking an abstract task refers to the act of taking a task that us running on a CPU off the CPU. Unless no other task is ready, this action is generally immediately followed by running an other task.
+ \Gls{at} blocking means taking a running \at off a CPU. Unless no other \at is ready, this action is immediately followed by running another \at.

- \textit{Synonyms : None.}
+ \textit{Synonyms : Parking.}
  }
…
  {name={Running to completion}}
  {
- Running to completion refers to the entire sequence of : being scheduled, running and blocking, for a given task.
+ Running to completion refers to the entire sequence of : being scheduled, running and blocking, for a given \at.

  See also \gls{atsched}, \gls{atrun}, \gls{atblock}
…
  \longnewglossaryentry{load}
- {name={System Load}}
+ {name={System Load},text={load}}
  {
- The load is refers to the rate at which \glspl{at} are \glslink{atsched}{scheduled} versus the rate at which they are \glslink{atrun}{run}. When \glspl{at} are being scheduled faster than they are run, the system is considered \emph{overloaded}. When \glspl{at} are being run faster than they are scheduled, the system is considered \emph{underloaded}. Conrrespondingly, if both rates are equal, the system is considered \emph{loaded}. Note that the system is considered loaded only of the rate at which \glspl{at} are scheduled/run is non-zero, otherwise the system is empty, it has no load.
+ The system load refers to the rate at which \glspl{at} are \glslink{atsched}{scheduled} versus the rate at which they are \glslink{atrun}{run}. When \glspl{at} are being scheduled faster than they are run, the system is considered \emph{overloaded}. When \glspl{at} are being run faster than they are scheduled, the system is considered \emph{underloaded}. Correspondingly, if both rates are equal, the system is considered \emph{loaded}. Note the system is considered loaded only if the rate at which \glspl{at} are scheduled/run is non-zero, otherwise the system is empty, \ie it has no load.
+
+ \textit{Synonyms : CPU Load, System Load.}

doc/theses/thierry_delisle_PhD/thesis/local.bib
  }

  % Trevor's relaxed FIFO list
+ @inproceedings{alistarh2018relaxed,
+     title={Relaxed schedulers can efficiently parallelize iterative algorithms},
+     author={Alistarh, Dan and Brown, Trevor and Kopinsky, Justin and Nadiradze, Giorgi},
+     booktitle={Proceedings of the 2018 ACM Symposium on Principles of Distributed Computing},
+     pages={377--386},
+     year={2018}
+ }
+
+ @article{zhuravlev2012survey,
+     title={Survey of energy-cognizant scheduling techniques},
+     author={Zhuravlev, Sergey and Saez, Juan Carlos and Blagodurov, Sergey and Fedorova, Alexandra and Prieto, Manuel},
+     journal={IEEE Transactions on Parallel and Distributed Systems},
+     volume={24},
+     number={7},
+     pages={1447--1464},
+     year={2012},
+     publisher={IEEE}
+ }
+
+ @article{vikranth2013topology,
+     title={Topology aware task stealing for on-chip NUMA multi-core processors},
+     author={Vikranth, BRWACRR and Wankar, Rajeev and Rao, C Raghavendra},
+     journal={Procedia Computer Science},
+     volume={18},
+     pages={379--388},
+     year={2013},
+     publisher={Elsevier}
+ }
+
+ @inproceedings{min2011hierarchical,
+     title={Hierarchical work stealing on manycore clusters},
+     author={Min, Seung-Jai and Iancu, Costin and Yelick, Katherine},
+     booktitle={Fifth Conference on Partitioned Global Address Space Programming Models (PGAS11)},
+     volume={625},
+     year={2011},
+     organization={Citeseer}
+ }
+
+ @article{ribic2014energy,
+     title={Energy-efficient work-stealing language runtimes},
+     author={Ribic, Haris and Liu, Yu David},
+     journal={ACM SIGARCH Computer Architecture News},
+     volume={42},
+     number={1},
+     pages={513--528},
+     year={2014},
+     publisher={ACM New York, NY, USA}
+ }
+
+ @inproceedings{torng2016asymmetry,
+     title={Asymmetry-aware work-stealing runtimes},
+     author={Torng, Christopher and Wang, Moyang and Batten, Christopher},
+     booktitle={2016 ACM/IEEE 43rd Annual International Symposium on Computer Architecture (ISCA)},
+     pages={40--52},
+     year={2016},
+     organization={IEEE}
+ }

  % --------------------------------------------------
…
      title = {Mach Scheduling and Thread Interfaces - Kernel Programming Guide},
      organization = {Apple Inc.},
-     howPublish = {\href{https://developer.apple.com/library/archive/documentation/Darwin/Conceptual/KernelProgramming/scheduler/scheduler.html}{https://developer.apple.com/library/archive/documentation/Darwin/Conceptual/KernelProgramming/scheduler/scheduler.html}}
+     note = {\href{https://developer.apple.com/library/archive/documentation/Darwin/Conceptual/KernelProgramming/scheduler/scheduler.html}{https://\-developer.apple.com/\-library/archive/\-documentation/\-Darwin/\-Conceptual/\-KernelProgramming/\-scheduler/\-scheduler.html}}
+ }
+
+ @misc{MemcachedThreading,
+     author = {Oracle},
+     title = {MySQL 5.6 Reference Manual Including MySQL NDB Cluster 7.3-7.4 Reference Guide},
+     howpublished = {\href{https://docs.oracle.com/cd/E17952_01/mysql-5.6-en/ha-memcached-using-threads.html}{https://docs.oracle.com/\-cd/E17952\_01/\-mysql-5.6-en/\-ha-memcached-using-threads.html}},
+     note = "[Online; accessed 5-August-2022]"
  }
…
  }

+ @misc{GITHUB:SchedulingBenchmarks,
+     title = {Scheduling Benchmarks},
+     author = {Thierry Delisle},
+     howpublished = {\href{https://github.com/cforall/SchedulingBenchmarks_PhD22}{https://\-github.com/\-cforall/\-SchedulingBenchmarks\_\-PhD22}},
+ }
+
  % --------------------------------------------------
  % Tech documents
…
  }

+ @manual{MAN:eventfd,
+     key = "eventfd",
+     title = "eventfd(2) Linux User's Manual",
+     year = "2019",
+     month = "MArch",
+ }
+
  @manual{MAN:aio,
      key = "aio",
      title = "aio(7) Linux User's Manual",
      year = "2019",
      month = "March",
+ }
+
+ @manual{MAN:bash,
+     title = {Bash Reference Manual},
+     author = {Chet Ramey and Brian Fox},
+     year = "2020",
+     month = "December",
+     version = {5,1},
+     howpublished = {\href{https://www.gnu.org/software/bash/manual/bash.pdf}{https://\-www.gnu.org/\-software/\-bash/\-manual/\-bash.pdf}}
  }
…
  }

+
  % --------------------------------------------------
  % Wikipedia Entries
…
      howpublished = "\href{https://en.wikipedia.org/wiki/Zipf%27s_law}{https://\-en.wikipedia.org/\-wiki/\-Zipf\%27s\-\_law}",
      note = "[Online; accessed 5-August-2022]"
+ }
+
+ @misc{wiki:htm,
+     author = "{Wikipedia contributors}",
+     title = "Transactional memory --- {W}ikipedia{,} The Free Encyclopedia",
+     year = "2022",
+     howpublished = "\href{https://en.wikipedia.org/wiki/Zipf%27s_law}{https://\-en.wikipedia.org/\-wiki/\-Zipf\%27s\-\_law}",
+     note = "[Online; accessed 7-September-2022]"
  }
…
      note = "[Online; accessed 5-August-2022]"
  }
+
+ @article{reese2008nginx,
+     title = {NGINX: the high-performance web server and reverse proxy},
+     author = {Reese, Will},
+     journal = {Linux Journal},
+     volume = {2008},
+     number = {173},
+     pages = {2},
+     year = {2008},
+     publisher = {Belltown Media}
+ }
+
+ @phdthesis{Harji10,
+     author = {Ashif Harji},
+     title = {Performance Comparison of Uniprocessor and Multiprocessor Web Server Architectures},
+     school = {University of Waterloo},
+     year = 2010,
+     month = feb,
+     address = {Waterloo, Ontario, Canada, N2L 3G1},
+     note = {\textsf{http://uwspace.uwaterloo.ca/\-bitstream/\-10012/\-5040/\-1/\-Harji\_thesis.pdf}},
+ }

doc/theses/thierry_delisle_PhD/thesis/text/conclusion.tex
  \chapter{Conclusion}\label{conclusion}

- \Gls{uthrding} is popular.
- It makes sense for \CFA to use it.
+ Building the \CFA runtime has been a challenging project.
+ The work was divided between high-level concurrency design and a user-level threading runtime (Masters' thesis), and low-level support of the user-level runtime using OS kernel threading and its (multiple) I/O subsystems (Ph.D. thesis).
+ Because I am the main developer for both components of this project, there is strong continuity across the design and implementation.
+ This continuity provides a consistent approach to advanced control flow and concurrency, with easier development, management and maintenance of the runtime in the future.

- \todo{Obivously fix the above}
+ I believed my Masters' work would provide the background to make the Ph.D. work reasonably straightforward.
+ However, I discovered two significant challenges.

- An important aspect of this approach to threading is how threads are scheduled.
- As \CFA aims to increase productivity and safety of C while maintaining its performance, so to should the threading runtime achieve these goals.
- For scheduling, productivity and safety manifest in removing pitfalls in the efficient usage of the threading runtime.
- This thesis contributes to this goal by presenting a low-latency scheduler that offers improved starvation prevention compared to other state-of-the-art schedulers.
- It presents a core algorithm (Chapter~\ref{core}) that provides increased fairness through helping (Section~\ref{heling}) as well as optimizations which virtually remove the cost of this fairness (Section~\ref{relaxedtimes}).
- Building upon the fundamental scheduling algorithm, an implementation of user-level \io blocking is presented (Chapter~\ref{io}) which achieves the same performance and fairness balance as the scheduler itself.
- From these core algorithms, and a low-latency idle-sleep mechanism is presented (Chapter~\ref{practice}) which allows the \CFA runtime to stay viable for workloads that do not consistently saturate the system.
+ First, modern symmetric multiprocessing CPUs have significant performance penalties for communication, often cache-related.
+ An SQMS scheduler (see Section~\ref{sched}), with its \proc-shared ready-queue, has perfect load-balancing but poor affinity resulting in high communication across \procs.
+ An MQMS scheduler, with its \proc-specific ready-queues, has poor load-balancing but perfect affinity often resulting in significantly reduced communication.
+ However, implementing fairness for an MQMS scheduler is difficult, since fairness requires \procs to be aware of each other's ready-queue progress, \ie communicated knowledge.
+ For balanced workloads with little or no data sharing, \ie embarrassingly parallel, an MQMS scheduler is near optimal, \eg a state-of-the-art work-stealing scheduler.
+ For these kinds of fair workloads, adding fairness must be low-cost to hide the communication costs needed for global ready-queue progress or performance suffers.
+ While I was aware of these realities, I underestimated how little performance margin there is for communication.
+ Several of my attempts at building a fair scheduler compared poorly to work-stealing schedulers because of the thin communication margin.
+
+ Second, the kernel locking, threading, and I/O in the Linux operating system offer very little flexibility and are not designed to facilitate user-level threading.
+ There are multiple concurrency aspects in Linux that require carefully following a strict procedure to achieve acceptable performance.
+ To be fair, many of these concurrency aspects were designed 30-40 years ago, when there were few multiprocessor computers and concurrency knowledge was just developing.
+ Unfortunately, little has changed in the intervening years.
+
+ Also, my decision to use @io_uring@ was both positive and negative.
+ The positive is that @io_uring@ supports the panoply of I/O mechanisms in Linux;
+ hence, the \CFA runtime uses one I/O mechanism to provide non-blocking I/O, rather than using @select@ to handle TTY I/O, @epoll@ to handle network I/O, and managing a thread pool to handle disk I/O.
+ Merging all these different \io mechanisms into a coherent scheduling implementation would require much more work than what is present in this thesis, as well as detailed knowledge of multiple I/O mechanisms.
+ The negative is that @io_uring@ is new and developing.
+ As a result, there is limited documentation, few places to find usage examples, and multiple errors that required workarounds.
+
+ Given what I now know about @io_uring@, I would say it is insufficiently coupled with the Linux kernel to properly handle non-blocking I/O.
+ It does not seem to reach deep into the kernel's handling of \io, and as such it must contend with the same realities that users of @epoll@ must contend with.
+ Specifically, in cases where @O_NONBLOCK@ behaves as desired, operations must still be retried.
+ Preserving the illusion of asynchronicity requires delegating these operations to kernel threads.
+ This requirement is also true of cases where @O_NONBLOCK@ does not prevent blocking.
+ Spinning up internal kernel threads to handle blocking scenarios is what developers already do outside of the kernel, and managing these threads adds a significant burden to the system.
+ Nonblocking I/O should not be handled in this way.
+
+ \section{Goals}
+ This work focuses on efficient and fair scheduling of the multiple CPUs, which are ubiquitous on all modern computers.
+ The levels of indirection to the CPUs are:
+ \begin{itemize}
+ \item
+ The \CFA presentation of concurrency through multiple high-level language constructs.
+ \item
+ The OS presentation of concurrency through multiple kernel threads within an application.
+ \item
+ The OS and library presentation of disk and network I/O, and many secondary library routines that directly and indirectly use these mechanisms.
+ \end{itemize}
+ The key aspect of all of these mechanisms is that control flow can block, which immediately hinders any level above from making scheduling decisions as a result.
+ Fundamentally, scheduling needs to understand all the mechanisms used by threads that affect their state changes.
+
+ The underlying goal of this thesis is scheduling the complex hardware components that make up a computer to provide good utilization and fairness.
+ However, direct hardware scheduling is only possible in the OS.
+ Instead, this thesis is performing arms-length application scheduling of the hardware components through a set of OS interfaces that indirectly manipulate the hardware components.
+ This can quickly lead to tensions when the OS interface has different use cases in mind.
+
+ As \CFA aims to increase productivity and safety of C, while maintaining its performance, this places a huge burden on the \CFA runtime to achieve these goals.
+ Productivity and safety manifest in removing scheduling pitfalls in the efficient usage of the threading runtime.
+ Performance manifests in making efficient use of the underlying kernel threads that provide indirect access to the CPUs.
+
+ This thesis achieves its stated contributions by presenting:
+ \begin{enumerate}[leftmargin=*]
+ \item
+ A scalable low-latency scheduler that offers improved starvation prevention (progress guarantee) compared to other state-of-the-art schedulers, including NUMA awareness.
+ \item
+ The scheduler demonstrates a core algorithm that provides increased fairness through helping, as well as optimizations which virtually remove the cost of this fairness.
+ \item
+ An implementation of user-level \io blocking is incorporated into the scheduler, which achieves the same performance and fairness balance as the scheduler itself.
+ \item
+ These core algorithms are further extended with a low-latency idle-sleep mechanism, which allows the \CFA runtime to stay viable for workloads that do not consistently saturate the system.
+ \end{enumerate}
+ Finally, the complete scheduler is fairly simple with low-cost execution, meaning the total cost of scheduling during thread state changes is low.

  \section{Future Work}
- While the \CFA runtime achieves a better compromise in term of performance and fairness than other schedulers, I do believe that further improvements could be made to reduce even further the number of cases where performance deteriorates.
- Furthermore, I believe that achieve performance and starvation freedom simultaneously is generally a challenge even outside of scheduling algorithms.
+ While the \CFA runtime achieves a better compromise than other schedulers, in terms of performance and fairness, I believe further improvements can be made to reduce or eliminate the few cases where performance does deteriorate.
+ Fundamentally, achieving performance and starvation freedom will always be goals with opposing needs even outside of scheduling algorithms.

  \subsection{Idle Sleep}
- A difficult challenge that was not fully address in this thesis is idle-sleep.
- While a correct and somewhat low-cost idle-sleep mechanism was presented, several of the benchmarks show notable performance degradation when too few \ats are present in the system.
+ A difficult challenge, not fully addressed in this thesis, is idle sleep.
+ While a correct and somewhat low-cost idle-sleep mechanism is presented, several of the benchmarks show notable performance degradation when too few \ats are present in the system.
  The idle sleep mechanism could therefore benefit from a reduction of spurious cases of sleeping.
  Furthermore, this thesis did not present any heuristic for when \procs should be put to sleep and when \procs should be woken up.
- It is especially worth noting that relaxed timestamps and topology aware helping lead to notable improvements in performance.
- Neither of these techniques were used for the idle sleep mechanism.
+ While relaxed timestamps and topology awareness made notable performance improvements, neither of these techniques are used for the idle-sleep mechanism.

- There are opportunities where these techniques could be use:
- The mechanism uses a hand-shake between notification and sleep to ensure that no \at is missed.
- The correctness of that hand-shake is cirtical when the last \proc goes to sleep but could be relaxed when several \procs are awake.
- Furthermore, organizing the sleeping \procs as a LIDO stack makes sense to keep cold \procs as cold as possible, but it might be more appropriate to attempt to keep cold CPU sockets instead.
-
- However, using these techniques could require significant investigation.
- For example, keeping a CPU socket cold might be appropriate for power consumption reasons but can affect overall memory bandwith.
- The balance between these is not necessarily obvious.
+ Here are opportunities where these techniques could be used:
+ \begin{itemize}
+ \item
+ The mechanism uses a handshake between notification and sleep to ensure that no \at is missed.
+ \item
+ The handshake correctness is critical when the last \proc goes to sleep but could be relaxed when several \procs are awake.
+ \item
+ Furthermore, organizing the sleeping \procs as a LIFO stack makes sense to keep cold \procs as cold as possible, but it might be more appropriate to attempt to keep cold CPU sockets instead.
+ \end{itemize}
+ However, using these techniques would require significant investigation.
+ For example, keeping a CPU socket cold might be appropriate for power consumption reasons but can affect overall memory bandwidth.
+ The balance between these approaches is not obvious.
+ I am aware there is a host of low-power research that could be tapped here.

  \subsection{Hardware}
- One challenge that needed to be overcome for this thesis was that the modern x86-64 has very few tools to implement fairness.
- \Glspl{proc} attempting to help each other inherently cause cache-coherence traffic.
+ One challenge that needed to be overcome for this thesis is that the modern x86-64 processors have very few tools to implement fairness.
+ \Glspl{proc} attempting to help each other inherently cause cache-coherence traffic.
  However, as mentioned in Section~\ref{helping}, relaxed requirements mean this traffic is not necessarily productive.
  In cases like this one, there is an opportunity to improve performance by extending the hardware.

- Many different extensions would be suitable here.
- For example, when attempting to read remote timestamps when deciding to whether or not to help, it could be useful to allow cancelling the remote read if it will lead to significant latency.
- If the latency is due to a recent cache invalidation, it is unlikely that the timestamp is old and that helping will be needed.
+ Many different extensions are suitable here.
+ For example, when attempting to read remote timestamps for helping, it would be useful to allow cancelling the remote read if it leads to significant latency.
+ If the latency is due to a recent cache invalidation, it is unlikely the timestamp is old and that helping is needed.
  As such, simply moving on without the result is likely to be acceptable.
- Another option would be to attempt to read multiple memory addresses and only wait for \emph{one of} these reads to retire.
- This would have a similar effect, where cache-lines with more traffic would be waited on less often.
- In both of these examples, some care would probably be needed to make sure that the reads to an address \emph{sometimes} retire.
+ Another option is to read multiple memory addresses and only wait for \emph{one of} these reads to retire.
+ This approach has a similar effect, where cache lines with more traffic are waited on less often.
+ In both of these examples, some care is needed to ensure that reads to an address \emph{sometimes} retire.

- Note that this is similar to the feature \newterm{Hardware Transactional Memory}~\cite{HTM}, which allows groups of instructions to be aborted and rolled-back if they encounter memory conflicts when being retired.
+ Note that this idea is similar to \newterm{Hardware Transactional Memory}~\cite{wiki:htm}, which allows groups of instructions to be aborted and rolled back if they encounter memory conflicts when being retired.
  However, I believe this feature is generally aimed at large groups of instructions.
- A more fine-grained approach may be more amenable to carefully picking which aspects of an algorithm require exact correctness and which do not.
+ A more fine-grained approach may be more amenable by carefully picking which aspects of an algorithm require exact correctness and which do not.

doc/theses/thierry_delisle_PhD/thesis/text/core.tex
  Before discussing scheduling in general, where it is important to address systems that are changing states, this document discusses scheduling in a somewhat ideal scenario, where the system has reached a steady state.
- For this purpose, a steady state is loosely defined as a state where there are always \glspl{thrd} ready to run and the system has the resources necessary to accomplish the work, \eg, enough workers.
+ For this purpose, a steady state is loosely defined as a state where there are always \ats ready to run and the system has the resources necessary to accomplish the work, \eg, enough workers.
  In short, the system is neither overloaded nor underloaded.

  It is important to discuss the steady state first because it is the easiest case to handle and, relatedly, the case in which the best performance is to be expected.
- As such, when the system is either overloaded or underloaded, a common approach is to try to adapt the system to this new load and return to the steady state, \eg, by adding or removing workers.
+ As such, when the system is either overloaded or underloaded, a common approach is to try to adapt the system to this new \gls{load} and return to the steady state, \eg, by adding or removing workers.
  Therefore, flaws in scheduling the steady state tend to be pervasive in all states.

  \section{Design Goals}
- As with most of the design decisions behind \CFA, an important goal is to match the expectation of the programmer according to their execution mental-model.
- To match expectations, the design must offer the programmer sufficient guarantees so that, as long as they respect the execution mental-model, the system also respects this model.
-
- For threading, a simple and common execution mental-model is the ``Ideal multi-tasking CPU'':
+ As with most of the design decisions behind \CFA, an important goal is to match the expectation of the programmer according to their execution mental model.
+ To match expectations, the design must offer the programmer sufficient guarantees so that, as long as they respect the execution mental model, the system also respects this model.
+
+ For threading, a simple and common execution mental model is the ``ideal multitasking CPU'':

  \begin{displayquote}[Linux CFS\cite{MAN:linux/cfs}]
- {[The]} ``Ideal multi-tasking CPU'' is a (non-existent :-)) CPU that has 100\% physical power and which can run each task at precise equal speed, in parallel, each at [an equal fraction of the] speed. For example: if there are 2 tasks running, then it runs each at 50\% physical power --- i.e., actually in parallel.
+ {[The]} ``ideal multi-tasking CPU'' is a (non-existent :-)) CPU that has 100\% physical power and which can run each task at precise equal speed, in parallel, each at [an equal fraction of the] speed. For example: if there are 2 running tasks, then it runs each at 50\% physical power --- i.e., actually in parallel.
  \label{q:LinuxCFS}
  \end{displayquote}

- Applied to threads, this model states that every ready \gls{thrd} immediately runs in parallel with all other ready \glspl{thrd}. While a strict implementation of this model is not feasible, programmers still have expectations about scheduling that come from this model.
-
- In general, the expectation at the center of this model is that ready \glspl{thrd} do not interfere with each other but simply share the hardware.
- This assumption makes it easier to reason about threading because ready \glspl{thrd} can be thought of in isolation and the effect of the scheduler can be virtually ignored.
- This expectation of \gls{thrd} independence means the scheduler is expected to offer two guarantees:
+ Applied to \ats, this model states that every ready \at immediately runs in parallel with all other ready \ats. While a strict implementation of this model is not feasible, programmers still have expectations about scheduling that come from this model.
+
+ In general, the expectation at the centre of this model is that ready \ats do not interfere with each other but simply share the hardware.
+ This assumption makes it easier to reason about threading because ready \ats can be thought of in isolation and the effect of the scheduler can be virtually ignored.
+ This expectation of \at independence means the scheduler is expected to offer two guarantees:
  \begin{enumerate}
- \item A fairness guarantee: a \gls{thrd} that is ready to run is not prevented by another thread.
- \item A performance guarantee: a \gls{thrd} that wants to start or stop running is not prevented by other threads wanting to do the same.
+ \item A fairness guarantee: a \at that is ready to run is not prevented by another thread.
+ \item A performance guarantee: a \at that wants to start or stop running is not prevented by other threads wanting to do the same.
  \end{enumerate}

  It is important to note that these guarantees are expected only up to a point.
- \Glspl{thrd} that are ready to run should not be prevented to do so, but they still share the limited hardware resources.
- Therefore, the guarantee is considered respected if a \gls{thrd} gets access to a \emph{fair share} of the hardware resources, even if that share is very small.
+ \Glspl{at} that are ready to run should not be prevented from doing so, but they still share the limited hardware resources.
+ Therefore, the guarantee is considered respected if a \at gets access to a \emph{fair share} of the hardware resources, even if that share is very small.

  Similar to the performance guarantee, the lack of interference among threads is only relevant up to a point.
…
  This demonstration can be made by comparing applications built in \CFA to applications built with other languages or other models.
  Recall programmer expectation is that the impact of the scheduler can be ignored.
- Therefore, if the cost of scheduling is competitive to other popular languages, the guarantee is consider achieved.
+ Therefore, if the cost of scheduling is competitive with other popular languages, the guarantee is considered achieved.
  More precisely the scheduler should be:
  \begin{itemize}
…
  In any running system, a \proc can stop dequeuing \ats if it starts running a \at that never blocks.
  Without preemption, traditional work-stealing schedulers do not have starvation freedom in this case.
- Now this requirement begs the question, what about preemption?
- Generally speaking preemption happens on the timescale of several milliseconds, which brings us to the next requirement: ``fast'' load balancing.
+ Now, this requirement begs the question, what about preemption?
+ Generally speaking, preemption happens on the timescale of several milliseconds, which brings us to the next requirement: ``fast'' load balancing.

  \paragraph{Fast load balancing} means that load balancing should happen faster than preemption would normally allow.
- For interactive applications that need to run at 60, 90, 120 frames per second, \ats having to wait for several milliseconds to run are effectively starved.
+ For interactive applications that need to run at 60, 90 or 120 frames per second, \ats having to wait for several milliseconds to run are effectively starved.
  Therefore load-balancing should be done at a faster pace, one that can detect starvation at the microsecond scale.
- With that said, this is a much fuzzier requirement since it depends on the number of \procs, the number of \ats and the general load of the system.
+ With that said, this is a much fuzzier requirement since it depends on the number of \procs, the number of \ats and the general \gls{load} of the system.

  \subsection{Fairness vs Scheduler Locality} \label{fairnessvlocal}
…
  For a scheduler, having good locality, \ie, having the data local to each \gls{hthrd}, generally conflicts with fairness.
- Indeed, good locality often requires avoiding the movement of cache lines, while fairness requires dynamically moving a \gls{thrd}, and as consequence cache lines, to a \gls{hthrd} that is currently available.
- Note that this section discusses \emph{internal locality}, \ie, the locality of the data used by the scheduler versus \emph{external locality}, \ie, how the data used by the application is affected by scheduling.
+ Indeed, good locality often requires avoiding the movement of cache lines, while fairness requires dynamically moving a \at, and as consequence cache lines, to a \gls{hthrd} that is currently available.
+ Note that this section discusses \emph{internal locality}, \ie, the locality of the data used by the scheduler versus \emph{external locality}, \ie, how scheduling affects the locality of the application's data.
  External locality is a much more complicated subject and is discussed in the next section.

  However, I claim that in practice it is possible to strike a balance between fairness and performance because these goals do not necessarily overlap temporally.
  Figure~\ref{fig:fair} shows a visual representation of this behaviour.
- As mentioned, some unfairness is acceptable; therefore it is desirable to have an algorithm that prioritizes cache locality as long as thread delay does not exceed the execution mental-model.
+ As mentioned, some unfairness is acceptable; therefore it is desirable to have an algorithm that prioritizes cache locality as long as thread delay does not exceed the execution mental model.

  \begin{figure}
…
  \input{fairness.pstex_t}
  \vspace*{-10pt}
- \caption[Fairness vs Locality graph]{Rule of thumb Fairness vs Locality graph \smallskip\newline The importance of Fairness and Locality while a ready \gls{thrd} awaits running is shown as the time the ready \gls{thrd} waits increases, Ready Time, the chances that its data is still in cache decreases, Locality.
- At the same time, the need for fairness increases since other \glspl{thrd} may have the chance to run many times, breaking the fairness model.
+ \caption[Fairness vs Locality graph]{Rule of thumb Fairness vs Locality graph \smallskip\newline The importance of Fairness and Locality while a ready \at awaits running is shown as the time the ready \at waits increases (Ready Time) the chances that its data is still in cache decreases (Locality).
+ At the same time, the need for fairness increases since other \ats may have the chance to run many times, breaking the fairness model.
  Since the actual values and curves of this graph can be highly variable, the graph is an idealized representation of the two opposing goals.}
  \label{fig:fair}
…
  \subsubsection{Scalability}
  The most basic performance challenge of a scheduler is scalability.
- Given a large number of \procs and an even larger number of \ats, scalability measures how fast \procs can enqueue and dequeues \ats.
- One could expect that doubling the number of \procs would double the rate at which \ats are dequeued, but contention on the internal data structure of the scheduler can lead to worst improvements.
- While the ready-queue itself can be sharded to alleviate the main source of contention, auxiliary scheduling features, \eg counting ready \ats, can also be sources of contention.
+ Given a large number of \procs and an even larger number of \ats, scalability measures how fast \procs can enqueue and dequeue \ats.
+ One could expect that doubling the number of \procs would double the rate at which \ats are dequeued, but contention on the internal data structure of the scheduler can diminish the improvements.
+ While the ready queue itself can be sharded to alleviate the main source of contention, auxiliary scheduling features, \eg counting ready \ats, can also be sources of contention.

  \subsubsection{Migration Cost}
- Another important source of scheduling latency is migration.
+ Another important source of scheduling latency is \glslink{atmig}{migration}.
  A \at migrates if it executes on two different \procs consecutively, which is the process discussed in \ref{fairnessvlocal}.
  Migrations can have many different causes, but in certain programs, it can be impossible to limit migration.
…
  The problem is a single point of contention when adding/removing \ats.
  As shown in the evaluation sections, most production schedulers do scale when adding \glspl{hthrd}.
- The solution to this problem is to shard the ready-queue: create multiple \emph{subqueues} forming the logical ready-queue and the subqueues are accessed by multiple \glspl{hthrd} without interfering.
+ The solution to this problem is to shard the ready queue: create multiple \emph{sub-queues} forming the logical ready-queue and the sub-queues are accessed by multiple \glspl{hthrd} without interfering.

  Before going into the design of \CFA's scheduler, it is relevant to discuss two sharding solutions that served as the inspiration scheduler in this thesis.
…
  \subsection{Work-Stealing}

- As mentioned in \ref{existing:workstealing}, a popular sharding approach for the ready-queue is work-stealing.
- In this approach, each \gls{proc} has its own local subqueue and \glspl{proc} only access each other's subqueue if they run out of work on their local ready-queue.
+ As mentioned in \ref{existing:workstealing}, a popular sharding approach for the ready queue is work-stealing.
+ In this approach, each \gls{proc} has its own local sub-queue and \glspl{proc} only access each other's sub-queue if they run out of work on their local ready-queue.
  The interesting aspect of work stealing happens in the steady-state scheduling case, \ie all \glspl{proc} have work and no load balancing is needed.
  In this case, work stealing is close to optimal scheduling: it can achieve perfect locality and have no contention.
  On the other hand, work-stealing schedulers only attempt to do load-balancing when a \gls{proc} runs out of work.
  This means that the scheduler never balances unfair loads unless they result in a \gls{proc} running out of work.
- Chapter~\ref{microbench} shows that pathological cases work stealing can lead to indefinite starvation.
+ Chapter~\ref{microbench} shows that, in pathological cases, work stealing can lead to indefinite starvation.

- Based on these observation, the conclusion is that a \emph{perfect} scheduler should behave similar to work-stealing in the steady-state case, but load balance proactively when the need arises.
+ Based on these observations, the conclusion is that a \emph{perfect} scheduler should behave similarly to work-stealing in the steady-state case, but load balance proactively when the need arises.

  \subsection{Relaxed-FIFO}
- A different scheduling approach is to create a ``relaxed-FIFO'' queue, as in \todo{cite Trevor's paper}.
- This approach forgoes any ownership between \gls{proc} and subqueue, and simply creates a pool of ready-queues from which \glspl{proc} pick.
+ A different scheduling approach is to create a ``relaxed-FIFO'' queue, as in \cite{alistarh2018relaxed}.
+ This approach forgoes any ownership between \gls{proc} and sub-queue, and simply creates a pool of sub-queues from which \glspl{proc} pick.
  Scheduling is performed as follows:
  \begin{itemize}
  \item
- All subqueues are protected by TryLocks.
+ All sub-queues are protected by TryLocks.
  \item
- Timestamps are added to each element of a subqueue.
+ Timestamps are added to each element of a sub-queue.
  \item
- A \gls{proc} randomly tests ready queues until it has acquired one or two queues.
+ A \gls{proc} randomly tests sub-queues until it has acquired one or two queues.
  \item
- If two queues are acquired, the older of the two \ats at the front the acquired queues is dequeued.
+ If two queues are acquired, the older of the two \ats is dequeued from the front of the acquired queues.
  \item
- Otherwise the \ats from the single queue is dequeued.
+ Otherwise, the \at from the single queue is dequeued.
  \end{itemize}
  The result is a queue that has both good scalability and sufficient fairness.
  The lack of ownership ensures that as long as one \gls{proc} is still able to repeatedly dequeue elements, it is unlikely any element will delay longer than any other element.
- This guarantee contrasts with work-stealing, where a \gls{proc} with a long subqueue results in unfairness for its \ats in comparison to a \gls{proc} with a short subqueue.
+ This guarantee contrasts with work-stealing, where a \gls{proc} with a long sub-queue results in unfairness for its \ats in comparison to a \gls{proc} with a short sub-queue.
  This unfairness persists until a \gls{proc} runs out of work and steals.

- An important aspects of this scheme's fairness approach is that the timestamps make it possible to evaluate how long elements have been on the queue.
+ An important aspect of this scheme's fairness approach is that the timestamps make it possible to evaluate how long elements have been in the queue.
  However, \glspl{proc} eagerly search for these older elements instead of focusing on specific queues, which negatively affects locality.
…
  \section{Relaxed-FIFO++}
- The inherent fairness and good performance with many \ats, makes the relaxed-FIFO queue a good candidate to form the basis of a new scheduler.
+ The inherent fairness and good performance with many \ats make the relaxed-FIFO queue a good candidate to form the basis of a new scheduler.
  The problem case is workloads where the number of \ats is barely greater than the number of \procs.
- In these situations, the wide sharding of the ready queue means most of its subqueues are empty.
- Furthermore, the non-empty subqueues are unlikely to hold more than one item.
- The consequence is that a random dequeue operation is likely to pick an empty subqueue, resulting in an unbounded number of selections.
- This state is generally unstable: each subqueue is likely to frequently toggle between being empty and nonempty.
- Indeed, when the number of \ats is \emph{equal} to the number of \procs, every pop operation is expected to empty a subqueue and every push is expected to add to an empty subqueue.
- In the worst case, a check of the subqueues sees all are empty or full.
+ In these situations, the wide sharding of the ready queue means most of its sub-queues are empty.
+ Furthermore, the non-empty sub-queues are unlikely to hold more than one item.
+ The consequence is that a random dequeue operation is likely to pick an empty sub-queue, resulting in an unbounded number of selections.
+ This state is generally unstable: each sub-queue is likely to frequently toggle between being empty and nonempty.
+ Indeed, when the number of \ats is \emph{equal} to the number of \procs, every pop operation is expected to empty a sub-queue and every push is expected to add to an empty sub-queue.
+ In the worst case, a check of the sub-queues sees all are empty or full.

  As this is the most obvious challenge, it is worth addressing first.
- The obvious solution is to supplement each sharded subqueue with data that indicates if the queue is empty/nonempty to simplify finding nonempty queues, \ie ready \glspl{at}.
- This sharded data can be organized in different forms, \eg a bitmask or a binary tree that tracks the nonempty subqueues.
+ The obvious solution is to supplement each sharded sub-queue with data that indicates if the queue is empty/nonempty to simplify finding nonempty queues, \ie ready \glspl{at}.
+ This sharded data can be organized in different forms, \eg a bitmask or a binary tree that tracks the nonempty sub-queues.
  Specifically, many modern architectures have powerful bitmask manipulation instructions or searching a binary tree has good Big-O complexity.
- However, precisely tracking nonempty subqueues is problematic.
- The reason is that the subqueues are initially sharded with a width presumably chosen to avoid contention.
- However, tracking which ready queue is nonempty is only useful if the tracking data is dense, \ie denser than the sharded subqueues.
- Otherwise, it does not provide useful information because reading this new data structure risks being as costly as simply picking a subqueue at random.
- But if the tracking mechanism \emph{is} denser than the shared subqueues, than constant updates invariably create a new source of contention.
+ However, precisely tracking nonempty sub-queues is problematic.
+ The reason is that the sub-queues are initially sharded with a width presumably chosen to avoid contention.
+ However, tracking which ready queue is nonempty is only useful if the tracking data is dense, \ie denser than the sharded sub-queues.
+ Otherwise, it does not provide useful information because reading this new data structure risks being as costly as simply picking a sub-queue at random.
+ But if the tracking mechanism \emph{is} denser than the shared sub-queues, then constant updates invariably create a new source of contention.
  Early experiments with this approach showed that randomly picking, even with low success rates, is often faster than bit manipulations or tree walks.

  The exception to this rule is using local tracking.
- If each \proc locally keeps track of empty subqueues, than this can be done with a very dense data structure without introducing a new source of contention.
+ If each \proc locally keeps track of empty sub-queues, then this can be done with a very dense data structure without introducing a new source of contention.
  However, the consequence of local tracking is that the information is incomplete.
- Each \proc is only aware of the last state it saw about each subqueue so this information quickly becomes stale.
+ Each \proc is only aware of the last state it saw about each sub-queue so this information quickly becomes stale.
  Even on systems with low \gls{hthrd} count, \eg 4 or 8, this approach can quickly lead to the local information being no better than the random pick.
  This result is due in part to the cost of maintaining information and its poor quality.

- However, using a very low cost but inaccurate approach for local tracking can actually be beneficial.
- If the local tracking is no more costly than a random pick, than \emph{any} improvement to the success rate, however low it is, leads to a performance benefits.
- This suggests to the following approach:
+ However, using a very low-cost but inaccurate approach for local tracking can still be beneficial.
+ If the local tracking is no more costly than a random pick, then \emph{any} improvement to the success rate, however low it is, leads to a performance benefit.

183 This suggests the following approach: 184 184 185 185 \subsection{Dynamic Entropy}\cite{xkcd:dynamicentropy} 186 The Relaxed-FIFO approach can be made to handle the case of mostly empty sub queues by tweaking the \glsxtrlong{prng}.187 The \glsxtrshort{prng} state can be seen as containing a list of all the future sub queues that will be accessed.188 While this concept is not particularly useful on its own, the consequence is that if the \glsxtrshort{prng} algorithm can be run \emph{backwards}, then the state also contains a list of all the sub queues that were accessed.186 The Relaxed-FIFO approach can be made to handle the case of mostly empty sub-queues by tweaking the \glsxtrlong{prng}. 187 The \glsxtrshort{prng} state can be seen as containing a list of all the future sub-queues that will be accessed. 188 While this concept is not particularly useful on its own, the consequence is that if the \glsxtrshort{prng} algorithm can be run \emph{backwards}, then the state also contains a list of all the sub-queues that were accessed. 189 189 Luckily, bidirectional \glsxtrshort{prng} algorithms do exist, \eg some Linear Congruential Generators\cite{wiki:lcg} support running the algorithm backwards while offering good quality and performance. 190 190 This particular \glsxtrshort{prng} can be used as follows: 191 191 \begin{itemize} 192 192 \item 193 Each \proc maintains two \glsxtrshort{prng} states, refer eed to as $F$ and $B$.194 \item 195 When a \proc attempts to dequeue a \at, it picks a sub queue by running $B$ backwards.196 \item 197 When a \proc attempts to enqueue a \at, it runs $F$ forward picking a sub queue to enqueue to.198 If the enqueue is successful, thestate $B$ is overwritten with the content of $F$.193 Each \proc maintains two \glsxtrshort{prng} states, referred to as $F$ and $B$. 194 \item 195 When a \proc attempts to dequeue a \at, it picks a sub-queue by running $B$ backwards. 196 \item 197 When a \proc attempts to enqueue a \at, it runs $F$ forward picking a sub-queue to enqueue to. 198 If the enqueue is successful, state $B$ is overwritten with the content of $F$. 199 199 \end{itemize} 200 200 The result is that each \proc tends to dequeue \ats that it has itself enqueued. 201 When most sub queues are empty, this technique increases the odds of finding \ats atvery low cost, while also offering an improvement on locality in many cases.201 When most sub-queues are empty, this technique increases the odds of finding \ats at a very low cost, while also offering an improvement on locality in many cases. 202 202 203 203 Tests showed this approach performs better than relaxed-FIFO in many cases. 204 204 However, it is still not competitive with work-stealing algorithms. 205 205 The fundamental problem is that the constant randomness limits how much locality the scheduler offers. 206 This becomes problematic both because the scheduler is likely to get cache misses on internal data -structures and because migrations become frequent.206 This becomes problematic both because the scheduler is likely to get cache misses on internal data structures and because migrations become frequent. 207 207 Therefore, the attempt to modify the relaxed-FIFO algorithm to behave more like work stealing did not pan out. 208 208 The alternative is to do it the other way around. … … 210 210 \section{Work Stealing++}\label{helping} 211 211 To add stronger fairness guarantees to work stealing a few changes are needed. 
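As an editorial aside to the previous subsection, the bidirectional \glsxtrshort{prng} can be sketched with a 64-bit Linear Congruential Generator: any odd multiplier is invertible modulo $2^{64}$, so one forward step can be undone exactly. The constants and helper names below are illustrative assumptions, not the generator used by the \CFA runtime.
\begin{lstlisting}
// Editorial sketch of a reversible 64-bit LCG (illustrative constants and names).
#include <stdint.h>
static const uint64_t A = 6364136223846793005ULL;   // any odd multiplier is invertible mod 2^64
static const uint64_t C = 1442695040888963407ULL;
static uint64_t inv_A(void) {                        // modular inverse of A, by Newton iteration
	uint64_t x = A;                                  // correct to the low 3 bits
	for (int i = 0; i < 5; i += 1) x *= 2 - A * x;   // each step doubles the number of correct bits
	return x;
}
static uint64_t fwd(uint64_t s) { return s * A + C; }          // run state F forwards (enqueue picks)
static uint64_t bck(uint64_t s) { return (s - C) * inv_A(); }  // run state B backwards (dequeue picks)
\end{lstlisting}
Since @bck@ undoes @fwd@ exactly, a \proc that copies F into B after each successful enqueue can later replay, newest first, the sub-queue indices it used for its own enqueues.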
212 First, the relaxed-FIFO algorithm has fundamentally better fairness because each \proc always monitors all sub queues.212 First, the relaxed-FIFO algorithm has fundamentally better fairness because each \proc always monitors all sub-queues. 213 213 Therefore, the work-stealing algorithm must be prepended with some monitoring. 214 Before attempting to dequeue from a \proc's sub queue, the \proc must make some effort to ensure other subqueues are not being neglected.214 Before attempting to dequeue from a \proc's sub-queue, the \proc must make some effort to ensure other sub-queues are not being neglected. 215 215 To make this possible, \procs must be able to determine which \at has been on the ready queue the longest. 216 216 Second, the relaxed-FIFO approach needs timestamps for each \at to make this possible. … … 219 219 \centering 220 220 \input{base.pstex_t} 221 \caption[Base \CFA design]{Base \CFA design \smallskip\newline A pool of sub queues offers the sharding, two per \glspl{proc}.222 Each \gls{proc} can access all of the sub queues.221 \caption[Base \CFA design]{Base \CFA design \smallskip\newline A pool of sub-queues offers the sharding, two per \proc. 222 Each \gls{proc} can access all of the sub-queues. 223 223 Each \at is timestamped when enqueued.} 224 224 \label{fig:base} … … 226 226 227 227 Figure~\ref{fig:base} shows the algorithm structure. 228 This structure is similar to classic work-stealing except the sub queues are placed in an array so \procs can access them in constant time.228 This structure is similar to classic work-stealing except the sub-queues are placed in an array so \procs can access them in constant time. 229 229 Sharding width can be adjusted based on contention. 230 230 Note, as an optimization, the TS of a \at is stored in the \at in front of it, so the first TS is in the array and the last \at has no TS. 231 231 This organization keeps the highly accessed front TSs directly in the array. 232 When a \proc attempts to dequeue a \at, it first picks a random remote sub queue and compares its timestamp to the timestamps of its local subqueue(s).232 When a \proc attempts to dequeue a \at, it first picks a random remote sub-queue and compares its timestamp to the timestamps of its local sub-queue(s). 233 233 The oldest waiting \at is dequeued to provide global fairness. 234 234 235 However, this na\"ive implement edhas performance problems.235 However, this na\"ive implementation has performance problems. 236 236 First, it is necessary to have some damping effect on helping. 237 237 Random effects like cache misses and preemption can add spurious but short bursts of latency negating the attempt to help. 238 These bursts can cause increased migrations and make this work stealing approach slowdown to the level of relaxed-FIFO.238 These bursts can cause increased migrations and make this work-stealing approach slow down to the level of relaxed-FIFO. 
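For concreteness, the na\"ive helping decision described above might look like the following sketch; every name is hypothetical, and locking and empty sub-queues are omitted.
\begin{lstlisting}
// Editorial sketch of the helping decision (hypothetical names; locking and
// empty sub-queues omitted): sample one random remote sub-queue and dequeue
// from whichever queue holds the oldest waiting thread.
#include <stdint.h>
struct subqueue { uint64_t head_ts; /* lock, FIFO of ready threads ... */ };
struct ready_queue { unsigned nqueues; struct subqueue * queues; };

unsigned pick_queue(struct ready_queue * rq, unsigned local, uint64_t (*prng)(void)) {
	unsigned remote = prng() % rq->nqueues;       // random remote candidate
	uint64_t lts = rq->queues[local ].head_ts;    // enqueue time of the local head
	uint64_t rts = rq->queues[remote].head_ts;    // enqueue time of the remote head
	return rts < lts ? remote : local;            // smaller timestamp = has waited longer
}
\end{lstlisting}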
239 239 240 240 \begin{figure} 241 241 \centering 242 242 \input{base_avg.pstex_t} 243 \caption[\CFA design with Moving Average]{\CFA design with Moving Average \smallskip\newline A moving average is added to each sub queue.}243 \caption[\CFA design with Moving Average]{\CFA design with Moving Average \smallskip\newline A moving average is added to each sub-queue.} 244 244 \label{fig:base-ma} 245 245 \end{figure} 246 246 247 A simple solution to this problem is to use an exponential moving average\cite{wiki:ma} (MA) instead of a raw timestamp s,shown in Figure~\ref{fig:base-ma}.248 Note , this is more complex because the \at at the head of a subqueue is still waiting, so its wait time has not ended.249 Therefore, the exponential moving average is a ctually an exponential movingaverage of how long each dequeued \at has waited.250 To compare sub queues, the timestamp at the head must be compared to the current time, yielding the best-case wait-time for the \at at the head of the queue.247 A simple solution to this problem is to use an exponential moving average\cite{wiki:ma} (MA) instead of a raw timestamp, as shown in Figure~\ref{fig:base-ma}. 248 Note that this is more complex because the \at at the head of a sub-queue is still waiting, so its wait time has not ended. 249 Therefore, the exponential moving average is an average of how long each dequeued \at has waited. 250 To compare sub-queues, the timestamp at the head must be compared to the current time, yielding the best-case wait time for the \at at the head of the queue. 251 251 This new waiting is averaged with the stored average. 252 To further limit migration, a bias can be added to a local subqueue, where a remote subqueue is helped only if its moving average is more than $X$ times the local subqueue's average.252 To further limit \glslink{atmig}{migrations}, a bias can be added to a local sub-queue, where a remote sub-queue is helped only if its moving average is more than $X$ times the local sub-queue's average. 253 253 Tests for this approach indicate the choice of the weight for the moving average or the bias is not important, \ie weights and biases of similar \emph{magnitudes} have similar effects. 254 254 255 255 With these additions to work stealing, scheduling can be made as fair as the relaxed-FIFO approach, avoiding the majority of unnecessary migrations. 
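A sketch of the moving-average variant follows; the 1/8 weight and the bias parameter X are arbitrary placeholders, and all identifiers are assumptions rather than the runtime's actual names.
\begin{lstlisting}
// Editorial sketch of the moving-average comparison (placeholder weight and
// bias, hypothetical names).
#include <stdint.h>
#include <stdbool.h>
struct subqueue { uint64_t head_ts; uint64_t ma; /* lock, FIFO ... */ };

// On dequeue, fold how long the departing thread waited into the sub-queue's average.
void update_average(struct subqueue * q, uint64_t enqueue_ts, uint64_t now) {
	q->ma = (q->ma * 7 + (now - enqueue_ts)) / 8;    // exponential moving average, weight 1/8
}
// Help a remote sub-queue only if it looks at least X times older than the local one.
bool should_help(const struct subqueue * remote, const struct subqueue * local,
                 uint64_t now, uint64_t X) {
	uint64_t head_wait = now - remote->head_ts;      // best-case wait of the remote head
	uint64_t avg = (remote->ma + head_wait) / 2;     // combine with the stored average
	return avg > X * local->ma;
}
\end{lstlisting}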
256 Unfortunately, the work to achieve fairness has a performance cost, especially when the workload is inherently fair, and hence, there is only short-term or no starvation.257 The problem is that the constant polling, \ie reads, of remote sub queues generally entail a cache miss because the TSs are constantly being updated, \ie, writes.258 To make things worst, remote sub queues that are very active, \ie \ats are frequently enqueued and dequeued from them, lead to higher chances that polling will incur a cache-miss.259 Conversely, the active sub queues do not benefit much from helping since starvation is already a non-issue.260 This puts this algorithm in the awkward situation of paying for a cost that is largely unnecessary.261 The good news is that this problem can be mitigated 262 263 \subsection{Redundant Timestamps}\ref{relaxedtimes}264 The problem with polling remote sub queues is that correctness is critical.265 There must be a consensus among \procs on which sub queues hold which \ats, as the \ats are in constant motion.266 Furthermore, since timestamps are use for fairness, it is critical to have consensus on which \at is the oldest.267 However, when deciding if a remote sub queue is worth polling, correctness is less of a problem.268 Since the only requirement is that a sub queue is eventually polled, some data staleness is acceptable.256 Unfortunately, the work to achieve fairness has a performance cost, especially when the workload is inherently fair, and hence, there is only short-term unfairness or no starvation. 257 The problem is that the constant polling, \ie reads, of remote sub-queues generally entails cache misses because the TSs are constantly being updated, \ie, writes. 258 To make things worse, remote sub-queues that are very active, \ie \ats are frequently enqueued and dequeued from them, lead to higher chances that polling will incur a cache-miss. 259 Conversely, the active sub-queues do not benefit much from helping since starvation is already a non-issue. 260 This puts this algorithm in the awkward situation of paying for a largely unnecessary cost. 261 The good news is that this problem can be mitigated. 262 263 \subsection{Redundant Timestamps}\label{relaxedtimes} 264 The problem with polling remote sub-queues is that correctness is critical. 265 There must be a consensus among \procs on which sub-queues hold which \ats, as the \ats are in constant motion. 266 Furthermore, since timestamps are used for fairness, it is critical to have a consensus on which \at is the oldest. 267 However, when deciding if a remote sub-queue is worth polling, correctness is less of a problem. 268 Since the only requirement is that a sub-queue is eventually polled, some data staleness is acceptable. 269 269 This leads to a situation where stale timestamps are only problematic in some cases. 270 Furthermore, stale timestamps can be desirable since lower freshness requirements mean less cache invalidations. 270 Furthermore, stale timestamps can be desirable since lower freshness requirements mean fewer cache invalidations. 271 271 272 272 Figure~\ref{fig:base-ts2} shows a solution with a second array containing a copy of the timestamps and average. 273 This copy is updated \emph{after} the sub queue's critical sections using relaxed atomics.273 This copy is updated \emph{after} the sub-queue's critical sections using relaxed atomics. 274 274 \Glspl{proc} now check if polling is needed by comparing the copy of the remote timestamp instead of the actual timestamp.
275 275 The result is that since there is no fencing, the writes can be buffered in the hardware and cause fewer cache invalidations. … … 279 279 \input{base_ts2.pstex_t} 280 280 \caption[\CFA design with Redundant Timestamps]{\CFA design with Redundant Timestamps \smallskip\newline An array is added containing a copy of the timestamps. 281 These timestamps are written 281 These timestamps are written-to with relaxed atomics, so there is no order among concurrent memory accesses, leading to fewer cache invalidations.} 282 282 \label{fig:base-ts2} 283 283 \end{figure} … … 285 285 The correctness argument is somewhat subtle. 286 286 The data used for deciding whether or not to poll a queue can be stale as long as it does not cause starvation. 287 Therefore, it is acceptable if stale data makes queues appear older than they reallyare but appearing fresher can be a problem.288 For the timestamps, this means missing writes to the timestamp is acceptablesince they make the head \at look older.287 Therefore, it is acceptable if stale data makes queues appear older than they are but appearing fresher can be a problem. 288 For the timestamps, this means it is acceptable to miss writes to the timestamp since they make the head \at look older. 289 289 For the moving average, as long as the operations are just atomic reads/writes, the average is guaranteed to yield a value that is between the oldest and newest values written. 290 Therefore, this unprotected read of the timestamp and average satisf ythe limited correctness that is required.290 Therefore, this unprotected read of the timestamp and average satisfies the limited correctness that is required. 291 291 292 292 With redundant timestamps, this scheduling algorithm achieves both the fairness and performance requirements on most machines. 293 293 The problem is that the cost of polling and helping is not necessarily consistent across each \gls{hthrd}. 294 For example , on machines with a CPU containing multiple hyperthreads and cores and multiple CPU sockets, cache misses can be satisfied from the caches onsame (local) CPU, or by a CPU on a different (remote) socket.294 For example on machines with a CPU containing multiple hyper threads and cores and multiple CPU sockets, cache misses can be satisfied from the caches on the same (local) CPU, or by a CPU on a different (remote) socket. 295 295 Cache misses satisfied by a remote CPU have significantly higher latency than from the local CPU. 296 296 However, these delays are not specific to systems with multiple CPUs. … … 313 313 In Figure~\ref{fig:cache-share}, all cache misses are either private to a CPU or shared with another CPU. 314 314 This means latency due to cache misses is fairly consistent. 315 In contrast, in Figure~\ref{fig:cache-noshare} misses in the L2 cache can be satisfied by either instance of L3 cache.315 In contrast, in Figure~\ref{fig:cache-noshare} misses in the L2 cache can be satisfied by either instance of the L3 cache. 316 316 However, the memory-access latency to the remote L3 is higher than the memory-access latency to the local L3. 317 317 The impact of these different designs on this algorithm is that scheduling only scales well on architectures with a wide L3 cache, similar to Figure~\ref{fig:cache-share}, and less well on architectures with many narrower L3 cache instances, similar to Figure~\ref{fig:cache-noshare}. 
318 Hence, as the number of L3 instances grow , so too does the chance that the random helping causes significant cache latency.319 The solution is for the scheduler be aware of the cache topology.318 Hence, as the number of L3 instances grows, so too does the chance that the random helping causes significant cache latency. 319 The solution is for the scheduler to be aware of the cache topology. 320 320 321 321 \subsection{Per CPU Sharding} … … 323 323 Unfortunately, there is no portable way to discover cache topology, and it is outside the scope of this thesis to solve this problem. 324 324 This work uses the cache topology information from Linux's @/sys/devices/system/cpu@ directory. 325 This leaves the challenge of matching \procs to cache structure, or more precisely identifying which sub queues of the ready queue are local to which subcomponents of the cache structure.326 Once a match ing is generated, the helping algorithm is changed to add bias so that \procs more often help subqueues local to the same cache substructure.\footnote{325 This leaves the challenge of matching \procs to cache structure, or more precisely identifying which sub-queues of the ready queue are local to which subcomponents of the cache structure. 326 Once a match is generated, the helping algorithm is changed to add bias so that \procs more often help sub-queues local to the same cache substructure.\footnote{ 327 327 Note that like other biases mentioned in this section, the actual bias value does not appear to need precise tuning.} 328 328 329 The simplest approach for mapping sub queues to cache structure is to statically tie subqueues to CPUs.330 Instead of having each sub queue local to a specific \proc, the system is initialized with subqueues for each hardware hyperthread/core up front.331 Then \procs dequeue and enqueue by first asking which CPU id they are executing on, in order to identify which subqueues are the local ones.329 The simplest approach for mapping sub-queues to cache structure is to statically tie sub-queues to CPUs. 330 Instead of having each sub-queue local to a specific \proc, the system is initialized with sub-queues for each hardware hyperthread/core up front. 331 Then \procs dequeue and enqueue by first asking which CPU id they are executing on, to identify which sub-queues are the local ones. 332 332 \Glspl{proc} can get the CPU id from @sched_getcpu@ or @librseq@. 333 333 334 334 This approach solves the performance problems on systems with topologies with narrow L3 caches, similar to Figure \ref{fig:cache-noshare}. 335 335 However, it can still cause some subtle fairness problems in systems with few \procs and many \glspl{hthrd}. 336 In this case, the large number of sub queues and the bias against subqueues tied to different cache substructures make it unlikely that every subqueue is picked.337 To make things worst, the small number of \procs mean that few helping attempts are made.338 This combination of low selection and few helping attempts allow a \at to become stranded on a sub queue for a long time until it gets randomly helped.339 On a system with 2 \procs, 256 \glspl{hthrd} with narrow cache sharing, and a 100:1 bias, it can actuallytake multiple seconds for a \at to get dequeued from a remote queue.340 Therefore, a more dynamic match ing of subqueues to cache instanceis needed.336 In this case, the large number of sub-queues and the bias against sub-queues tied to different cache substructures make it unlikely that every sub-queue is picked. 
337 To make things worse, the small number of \procs means that few helping attempts are made. 338 This combination of low selection and few helping attempts allows a \at to become stranded on a sub-queue for a long time until it gets randomly helped. 339 On a system with 2 \procs, 256 \glspl{hthrd} with narrow cache sharing, and a 100:1 bias, it can take multiple seconds for a \at to get dequeued from a remote queue. 340 Therefore, a more dynamic match of sub-queues to cache instances is needed. 341 341 342 342 \subsection{Topological Work Stealing} 343 343 \label{s:TopologicalWorkStealing} 344 Therefore, the approach used in the \CFA scheduler is to have per-\proc subqueues, but have an explicit data-structure track which cache substructure each subqueue is tied to.344 The approach used in the \CFA scheduler is to have per-\proc sub-queues, but have an explicit data structure to track which cache substructure each sub-queue is tied to. 345 345 This tracking requires some finesse because reading this data structure must lead to fewer cache misses than not having the data structure in the first place. 346 A key element however is that, like the timestamps for helping, reading the cache instance mapping only needs to give the correct result \emph{often enough}. 346 A key element, however, is that, like the timestamps for helping, reading the cache instance mapping only needs to give the correct result \emph{often enough}. 347 347 Therefore the algorithm can be built as follows: before enqueueing or dequeuing a \at, each \proc queries the CPU id and the corresponding cache instance. 348 Since sub queues are tied to \procs, each \proc can then update the cache instance mapped to the local subqueue(s).349 To avoid unnecessary cache line invalidation, the map is only written 348 Since sub-queues are tied to \procs, each \proc can then update the cache instance mapped to the local sub-queue(s). 349 To avoid unnecessary cache line invalidation, the map is only written-to if the mapping changes. 350 350 351 351 This scheduler is used in the remainder of the thesis for managing CPU execution, but additional scheduling is needed to handle long-term blocking and unblocking, such as I/O.
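To make the preceding description concrete, the sketch below assumes a table @cpu_to_llc@ built once at startup from @/sys/devices/system/cpu@, mapping each CPU id to its last-level-cache instance; the array and function names are hypothetical, and only @sched_getcpu@ is an actual Linux call.
\begin{lstlisting}
// Editorial sketch of the per-sub-queue cache mapping (hypothetical names;
// cpu_to_llc[] is assumed to be filled once from /sys/devices/system/cpu).
#define _GNU_SOURCE
#include <sched.h>                       // sched_getcpu()

extern unsigned cpu_to_llc[];            // CPU id -> last-level-cache instance id
extern unsigned subq_llc[];              // sub-queue id -> last seen cache instance id

// Called by a processor before enqueueing or dequeuing on its local sub-queue.
static void refresh_mapping(unsigned my_subqueue) {
	int cpu = sched_getcpu();                 // which hardware thread is this processor on?
	if (cpu < 0) return;                      // query failed: keep the stale, but safe, mapping
	unsigned llc = cpu_to_llc[cpu];
	if (subq_llc[my_subqueue] != llc)         // write only when the mapping actually changes,
		subq_llc[my_subqueue] = llc;          // avoiding needless cache-line invalidations
}
\end{lstlisting}
The helping bias can then prefer sub-queues whose recorded cache instance matches the helper's own, falling back to the usual policy when the recorded mapping is stale.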
doc/theses/thierry_delisle_PhD/thesis/text/eval_macro.tex
rebf8ca5 r23a08aa0 2 2 The previous chapter demonstrated the \CFA scheduler achieves its equivalent performance goal in small and controlled \at-scheduling scenarios. 3 3 The next step is to demonstrate performance stays true in more realistic and complete scenarios. 4 Therefore, this chapter exercises both \at and I/O scheduling using two flavours of web servers that demonstrate \CFA performs competitively withproduction environments.5 6 Web servers are chosen because they offer fairly simple applications that perform complex I/O, both network and disk, and are useful as standalone products.7 Furthermore, web servers are generally amenable to parallelization since their workloads are mostly homogeneous.8 Therefore, web servers offer a stringent performance benchmark for \CFA.9 Indeed, existing web servers have close to optimal performance, while the homogeneity of the workload means fairness may not be a problem.10 As such, these experiments should highlight the overhead tue to any \CFA fairness cost in realistic scenarios.4 Therefore, this chapter exercises both \at and I/O scheduling using two flavours of web servers that demonstrate \CFA performs competitively compared to web servers used in production environments. 5 6 Web servers are chosen because they offer fairly simple applications that perform complex I/O, both network and disk, and are useful as standalone products. 7 Furthermore, web servers are generally amenable to parallelization since their workloads are mostly homogeneous. 8 Therefore, web servers offer a stringent performance benchmark for \CFA. 9 Indeed, existing web servers have close to optimal performance, while the homogeneity of the workload means fairness may not be a problem. 10 As such, these experiments should highlight the overhead due to any \CFA fairness cost in realistic scenarios. 11 11 12 12 \section{Memcached} 13 13 Memcached~\cite{memcached} is an in-memory key-value store used in many production environments, \eg \cite{atikoglu2012workload}. 14 In fact, the Memcached server is so popular there exists a full-featured front-end for performance testing, called @mutilate@~\cite{GITHUB:mutilate}.15 Experimenting on Memcached allows for a simple test of the \CFA runtime as a whole, exercising the scheduler, the idle-sleep mechanism, as well the \io subsystem for sockets.16 Note , this experiment does not exercise the \io subsystem with regardsto disk operations because Memcached is an in-memory server.14 The Memcached server is so popular there exists a full-featured front-end for performance testing, called @mutilate@~\cite{GITHUB:mutilate}. 15 Experimenting on Memcached allows for a simple test of the \CFA runtime as a whole, exercising the scheduler, the idle-sleep mechanism, as well as the \io subsystem for sockets. 16 Note that this experiment does not exercise the \io subsystem with regard to disk operations because Memcached is an in-memory server. 17 17 18 18 \subsection{Benchmark Environment} … … 24 24 Each node has 2 Intel(R) Xeon(R) CPU E5-2620 v2 running at 2.10GHz. 25 25 \item 26 These CPUs have 6 cores per CPUs and 2 \glspl{hthrd} per core, for a total of 24 \glspl{hthrd}.27 \item 28 The CPUs each have 384 KB, 3 MB and 30 MB of L1, L2 and L3 cachesrespectively.29 \item 30 Each node isconnected to the network through a Mellanox 10 Gigabit Ethernet port.26 Each CPU has 6 cores and 2 \glspl{hthrd} per core, for a total of 24 \glspl{hthrd}. 27 \item 28 A CPU has 384 KB, 3 MB and 30 MB of L1, L2 and L3 caches, respectively. 
29 \item 30 The compute nodes are connected to the network through a Mellanox 10 Gigabit Ethernet port. 31 31 \item 32 32 Network routing is performed by a Mellanox SX1012 10/40 Gigabit Ethernet switch. … … 35 35 \subsection{Memcached threading}\label{memcd:thrd} 36 36 Memcached can be built to use multiple threads in addition to its @libevent@ subsystem to handle requests. 37 When enabled, the threading implementation operates as follows~\cite {https://docs.oracle.com/cd/E17952_01/mysql-5.6-en/ha-memcached-using-threads.html}:37 When enabled, the threading implementation operates as follows~\cite[\S~16.2.2.8]{MemcachedThreading}: 38 38 \begin{itemize} 39 39 \item … … 48 48 For UDP connections, all the threads listen to a single UDP socket for incoming requests. 49 49 Threads that are not currently dealing with another request ignore the incoming packet. 50 One of the remaining, non busy, threads reads the request and sends the response.51 This implementation can lead to increased CPU loadas threads wake from sleep to potentially process the request.52 \end{itemize} 53 Here, Memcached is based on an event-based web server architecture~\cite{Pai99Flash}, using \gls{kthrd}ing to run multiple largely independent event engines, and if needed, spinning up additional kernel threads to handle blocking I/O.54 Alternative web server architectureare:50 One of the remaining, non-busy, threads reads the request and sends the response. 51 This implementation can lead to increased CPU \gls{load} as threads wake from sleep to potentially process the request. 52 \end{itemize} 53 Here, Memcached is based on an event-based web server architecture~\cite{Pai99Flash}, using \gls{kthrd}ing to run multiple largely independent event engines, and if needed, spinning up additional kernel threads to handle blocking I/O. 54 Alternative web server architectures are: 55 55 \begin{itemize} 56 56 \item … … 74 74 \item \emph{vanilla}: the official release of Memcached, version~1.6.9. 75 75 \item \emph{fibre}: a modification of vanilla using the thread-per-connection model on top of the libfibre runtime. 76 \item \emph{cfa}: a modification of the fibre web server that replaces the libfibre runtime with \CFA.76 \item \emph{cfa}: a modification of the fibre web server that replaces the libfibre runtime with \CFA. 77 77 \end{itemize} 78 78 … … 80 80 This experiment is done by having the clients establish 15,360 total connections, which persist for the duration of the experiment. 81 81 The clients then send read and write queries with only 3\% writes (updates), attempting to follow a desired query rate, and the server responds to the desired rate as best as possible. 82 Figure~\ref{fig:memcd:rate:qps} shows the 3 server versions at different client rates, ``Target \underline{Q}ueries \underline{P}er \underline{S}econd'', and the actual rate, ``Actual QPS'', for all three web servers.83 84 Like the experimental setup in Chapter~\ref{microbench}, each experiment is run 15 times, and for each client rate, the measured web server rate is plotted.82 Figure~\ref{fig:memcd:rate:qps} shows the 3 server versions at different client rates, ``Target \underline{Q}ueries \underline{P}er \underline{S}econd'', and the actual rate, ``Actual QPS'', for all three web servers. 83 84 Like the experimental setup in Chapter~\ref{microbench}, each experiment is run 15 times, and for each client rate, the measured web server rate is plotted. 
85 85 The solid line represents the median while the dashed and dotted lines represent the maximum and minimum respectively. 86 For rates below 500K queries per second s, all three webservers match the client rate.87 Beyond 500K, the web servers cannot match the client rate.88 During this interval, vanilla Memcached achieves the highest web server throughput, with libfibre and \CFA slightly lower but very similar throughput.89 Overall the performance of all three web servers is very similar, especially considering that at 500K the servers have reached saturation, which is discussed more in the next section.86 For rates below 500K queries per second, all three web servers match the client rate. 87 Beyond 500K, the web servers cannot match the client rate. 88 During this interval, vanilla Memcached achieves the highest web server throughput, with libfibre and \CFA slightly lower but very similar throughput. 89 Overall the performance of all three web servers is very similar, especially considering that at 500K the servers have reached saturation, which is discussed more in the next section. 90 90 91 91 \begin{figure} 92 92 \centering 93 93 \resizebox{0.83\linewidth}{!}{\input{result.memcd.rate.qps.pstex_t}} 94 \caption[Memcached Benchmark: Throughput]{Memcached Benchmark: Throughput\smallskip\newline Desired vs Actual query rate for 15,360 connections. Target QPS is the query rate that the clients are attempting to maintain and Actual QPS is the rate at which the server is able torespond.}94 \caption[Memcached Benchmark: Throughput]{Memcached Benchmark: Throughput\smallskip\newline Desired vs Actual query rate for 15,360 connections. Target QPS is the query rate that the clients are attempting to maintain and Actual QPS is the rate at which the server can respond.} 95 95 \label{fig:memcd:rate:qps} 96 96 %\end{figure} … … 99 99 \centering 100 100 \resizebox{0.83\linewidth}{!}{\input{result.memcd.rate.99th.pstex_t}} 101 \caption[Memcached Benchmark : 99th Percentile Lantency]{Memcached Benchmark : 99th Percentile Lantency\smallskip\newline 99th Percentile of the response latency as a function of \emph{desired} query rate for 15,360 connections. }101 \caption[Memcached Benchmark: 99th Percentile Latency]{Memcached Benchmark: 99th Percentile Latency\smallskip\newline 99th Percentile of the response latency as a function of \emph{desired} query rate for 15,360 connections. } 102 102 \label{fig:memcd:rate:tail} 103 103 \end{figure} 104 104 105 105 \subsection{Tail Latency} 106 Another popular performance metric is \newterm{tail} latency, which indicates some notion of fairness among requests across the experiment, \ie do some requests wait longer than other requests for service .106 Another popular performance metric is \newterm{tail} latency, which indicates some notion of fairness among requests across the experiment, \ie do some requests wait longer than other requests for service? 107 107 Since many web applications rely on a combination of different queries made in parallel, the latency of the slowest response, \ie tail latency, can dictate a performance perception. 108 108 Figure~\ref{fig:memcd:rate:tail} shows the 99th percentile latency results for the same Memcached experiment. 109 109 110 110 Again, each experiment is run 15 times with the median, maximum and minimum plotted with different lines. 
111 As expected, the latency starts low and increases as the server gets close to saturation, at which point, the latency increases dramatically because the web servers cannot keep up with the connection rate so client requests are disproportionally delayed.112 Because of this dramatic increase, the Y axis is presented usinglog scale.113 Note that the graph shows \emph{target} query rate, the actual response rate is given in Figure~\ref{fig:memcd:rate:qps} as this is the same underlying experiment.114 115 For all three servers, the saturation point is reached before 500K queries per second, which is when throughput starts to decline among the web servers.116 In this experiment, all three web servers are much more distinguishable than the throughput experiment.117 Vanilla Memcached achieves the lowest latency until 600K, after which all the web servers are struggling to respond to client requests.111 As expected, the latency starts low and increases as the server gets close to saturation, at which point, the latency increases dramatically because the web servers cannot keep up with the connection rate so client requests are disproportionally delayed. 112 Because of this dramatic increase, the Y-axis is presented using a log scale. 113 Note that the graph shows the \emph{target} query rate, the actual response rate is given in Figure~\ref{fig:memcd:rate:qps} as this is the same underlying experiment. 114 115 For all three servers, the saturation point is reached before 500K queries per second, which is when throughput starts to decline among the web servers. 116 In this experiment, all three web servers are much more distinguishable than in the throughput experiment. 117 Vanilla Memcached achieves the lowest latency until 600K, after which all the web servers are struggling to respond to client requests. 118 118 \CFA begins to decline at 600K, indicating some bottleneck after saturation. 119 Overall, all three web servers achieve micro-second latencies and the increases in latency mostly follow each other.119 Overall, all three web servers achieve microsecond latencies and the increases in latency mostly follow each other. 120 120 121 121 \subsection{Update rate} 122 Since Memcached is effectively a simple database, the information that is cachedcan be written to concurrently by multiple queries.122 Since Memcached is effectively a simple database, the cache information can be written to concurrently by multiple queries. 123 123 And since writes can significantly affect performance, it is interesting to see how varying the update rate affects performance. 124 124 Figure~\ref{fig:memcd:updt} shows the results for the same experiment as the throughput and latency experiment but increasing the update percentage to 5\%, 10\% and 50\%, respectively, versus the original 3\% update percentage. 
125 125 126 126 \begin{figure} 127 \hspace{-15pt} 127 128 \subfloat[][\CFA: Throughput]{ 128 129 \resizebox{0.5\linewidth}{!}{ … … 132 133 } 133 134 \subfloat[][\CFA: Latency]{ 134 \resizebox{0.5 \linewidth}{!}{135 \resizebox{0.52\linewidth}{!}{ 135 136 \input{result.memcd.forall.lat.pstex_t} 136 137 } … … 138 139 } 139 140 141 \hspace{-15pt} 140 142 \subfloat[][LibFibre: Throughput]{ 141 143 \resizebox{0.5\linewidth}{!}{ … … 145 147 } 146 148 \subfloat[][LibFibre: Latency]{ 147 \resizebox{0.5 \linewidth}{!}{149 \resizebox{0.52\linewidth}{!}{ 148 150 \input{result.memcd.fibre.lat.pstex_t} 149 151 } … … 151 153 } 152 154 155 \hspace{-15pt} 153 156 \subfloat[][Vanilla: Throughput]{ 154 157 \resizebox{0.5\linewidth}{!}{ … … 158 161 } 159 162 \subfloat[][Vanilla: Latency]{ 160 \resizebox{0.5 \linewidth}{!}{163 \resizebox{0.52\linewidth}{!}{ 161 164 \input{result.memcd.vanilla.lat.pstex_t} 162 165 } 163 166 \label{fig:memcd:updt:vanilla:lat} 164 167 } 165 \caption[Throughput and Latency results at different update rates (percentage of writes).]{Throughput and Latency results at different update rates (percentage of writes).\smallskip\newline Description} 168 \caption[Throughput and Latency results at different update rates (percentage of writes).]{Throughput and Latency results at different update rates (percentage of writes).\smallskip\newline On the left, throughput as Desired vs Actual query rate. 169 Target QPS is the query rate that the clients are attempting to maintain and Actual QPS is the rate at which the server can respond. 170 On the right, tail latency, \ie 99th Percentile of the response latency as a function of \emph{desired} query rate. 171 For throughput, higher is better, for tail-latency, lower is better. 172 Each series represent 15 independent runs, the dashed lines are the maximums of each series while the solid lines are the median and the dotted lines are the minimums.} 173 All runs have 15,360 client connections. 166 174 \label{fig:memcd:updt} 167 175 \end{figure} … … 175 183 \section{Static Web-Server} 176 184 The Memcached experiment does not exercise two key aspects of the \io subsystem: accept\-ing new connections and interacting with disks. 177 On the other hand, a web server servicing static web-pages does stress both accepting connections and disk \io by accepting tens of thousands of client requests per second where these requests return static data serviced from the file-system cache or disk.\footnote{178 Webservers servicing dynamic requests, which read from multiple locations and construct a response, are not as interesting since creating the response takes more time and does not exercise the runtime in a meaningfully different way.}179 The static web server experiment compares NGINX~\cite{nginx} with a custom \CFA-based webserver developed for this experiment.185 On the other hand, a web server servicing static web pages does stress both accepting connections and disk \io by accepting tens of thousands of client requests per second where these requests return static data serviced from the file-system cache or disk.\footnote{ 186 web servers servicing dynamic requests, which read from multiple locations and construct a response, are not as interesting since creating the response takes more time and does not exercise the runtime in a meaningfully different way.} 187 The static web server experiment compares NGINX~\cite{nginx} with a custom \CFA-based web server developed for this experiment. 
180 188 181 189 \subsection{NGINX threading} 182 Like memcached, NGINX can be makde to use multiple \glspl{kthrd}. 183 It has a very similar architecture to the memcached architecture decscribed in Section~\ref{memcd:thrd}, where multiple \glspl{kthrd} each run a mostly independent network logic. 184 While it does not necessarily use a dedicated listening thread, each connection is arbitrarily assigned to one of the \newterm{worker} threads. 185 Each worker threads handles multiple connections exclusively, effectively dividing the connections into distinct sets. 186 Again, this is effectively the \emph{event-based server} approach. 187 188 \cit{https://www.nginx.com/blog/inside-nginx-how-we-designed-for-performance-scale/} 189 190 191 \subsection{\CFA webserver} 192 The \CFA webserver is a straightforward thread-per-connection webserver, where a fixed number of \ats are created upfront. 190 NGINX is a high-performance, \emph{full-service}, event-driven web server. 191 It can handle both static and dynamic web content, as well as serve as a reverse proxy and a load balancer~\cite{reese2008nginx}. 192 This wealth of capabilities comes with a variety of potential configurations, dictating available features and performance. 193 The NGINX server runs a master process that performs operations such as reading configuration files, binding to ports, and controlling worker processes. 194 When running as a static web server, it uses an event-driven architecture to service incoming requests. 195 Incoming connections are assigned a \emph{stackless} HTTP state machine and worker processes can handle thousands of these state machines. 196 For the following experiment, NGINX is configured to use @epoll@ to listen for events on these state machines and have each worker process independently accept new connections. 197 Because of the realities of Linux, see Subsection~\ref{ononblock}, NGINX also maintains a pool of auxiliary threads to handle blocking \io. 198 The configuration can set the number of worker processes desired, as well as the size of the auxiliary pool. 199 However, for the following experiments, NGINX is configured to let the master process decide the appropriate number of threads. 200 201 \subsection{\CFA web server} 202 The \CFA web server is a straightforward thread-per-connection web server, where a fixed number of \ats are created upfront. 193 203 Each \at calls @accept@, through @io_uring@, on the listening port and handles the incoming connection once accepted. 194 204 Most of the implementation is fairly straightforward; 195 205 however, the inclusion of file \io found an @io_uring@ problem that required an unfortunate workaround. 196 206 197 Normally, web servers use @sendfile@~\cite{MAN:sendfile} to send files over a socket because it performs a direct move in the kernel from the file-system cache to the NIC, eliminating reading/writing the file into the webserver.198 While @io_uring@ does not support @sendfile@, it does support s@splice@~\cite{MAN:splice}, which is strictly more powerful.199 However, because of how Linux implements file \io, see Subsection~\ref{ononblock}, @io_uring@ must delegate splice calls to worker threads insidethe kernel.207 Normally, web servers use @sendfile@~\cite{MAN:sendfile} to send files over a socket because it performs a direct move in the kernel from the file-system cache to the NIC, eliminating reading/writing the file into the web server. 
208 While @io_uring@ does not support @sendfile@, it does support @splice@~\cite{MAN:splice}, which is strictly more powerful. 209 However, because of how Linux implements file \io, see Subsection~\ref{ononblock}, @io_uring@ must delegate splice calls to worker threads \emph{inside} the kernel. 200 210 As of Linux 5.13, @io_uring@ had no mechanism to restrict the number of worker threads, and therefore, when tens of thousands of splice requests are made, it correspondingly creates tens of thousands of internal \glspl{kthrd}. 201 211 Such a high number of \glspl{kthrd} slows Linux significantly. 202 Rather than abandon the experiment, the \CFA web server was switched to @sendfile@.203 204 With a blocking @sendfile@ the\CFA achieves acceptable performance until saturation is reached.205 At saturation, latency increases so some client connectionstimeout.212 Rather than abandon the experiment, the \CFA web server was switched to @sendfile@. 213 214 Starting with \emph{blocking} @sendfile@, \CFA achieves acceptable performance until saturation is reached. 215 At saturation, latency increases and client connections begin to timeout. 206 216 As these clients close their connection, the server must close its corresponding side without delay so the OS can reclaim the resources used by these connections. 207 217 Indeed, until the server connection is closed, the connection lingers in the CLOSE-WAIT TCP state~\cite{rfc:tcp} and the TCP buffers are preserved. 208 However, this poses a problem using nonblocking @sendfile@ calls:218 However, this poses a problem using blocking @sendfile@ calls: 209 219 when @sendfile@ blocks, the \proc rather than the \at blocks, preventing other connections from closing their sockets. 210 220 The call can block if there is insufficient memory, which can be caused by having too many connections in the CLOSE-WAIT state.\footnote{ 211 221 \lstinline{sendfile} can always block even in nonblocking mode if the file to be sent is not in the file-system cache, because Linux does not provide nonblocking disk I/O.} 212 This effect results in a negative feedback where more timeouts lead to more @sendfile@ calls running out of resources. 213 214 Normally, this is address by using @select@/@epoll@ to wait for sockets to have sufficient resources. 215 However, since @io_uring@ respects nonblocking semantics, marking all sockets as non-blocking effectively circumvents the @io_uring@ subsystem entirely: 216 all calls would simply immediately return @EAGAIN@ and all asynchronicity would be lost. 217 218 For this reason, the \CFA webserver sets and resets the @O_NONBLOCK@ flag before and after any calls to @sendfile@. 222 This effect results in a negative feedback loop where more timeouts lead to more @sendfile@ calls running out of resources. 223 224 Normally, this problem is addressed by using @select@/@epoll@ to wait for sockets to have sufficient resources. 225 However, since @io_uring@ does not support @sendfile@ but does respect non\-blocking semantics, marking all sockets as non-blocking effectively circumvents the @io_uring@ subsystem entirely: 226 all calls simply immediately return @EAGAIN@ and all asynchronicity is lost. 227 228 Switching the entire \CFA runtime to @epoll@ for this experiment is unrealistic and does not help in the evaluation of the \CFA runtime. 229 For this reason, the \CFA web server sets and resets the @O_NONBLOCK@ flag before and after any calls to @sendfile@. 
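A minimal sketch of this flag toggling, together with the @EAGAIN@ retry discussed in the next paragraph, is shown below; @yield()@ stands in for the runtime's user-level yield and the helper name is invented, so this is not the actual \CFA web-server code.
\begin{lstlisting}
// Editorial sketch, not the actual CFA web-server code; yield() stands in
// for the runtime's user-level yield.
#include <sys/types.h>
#include <sys/sendfile.h>
#include <fcntl.h>
#include <errno.h>
extern void yield(void);

static ssize_t send_file_nonblock(int sock, int file, off_t * off, size_t len) {
	int flags = fcntl(sock, F_GETFL);
	fcntl(sock, F_SETFL, flags | O_NONBLOCK);    // set O_NONBLOCK around the call
	ssize_t ret;
	for (;;) {
		ret = sendfile(sock, file, off, len);    // in-kernel file-to-socket transfer
		if (ret >= 0 || errno != EAGAIN) break;  // progress made, or a real error
		yield();                                 // cannot block the processor: yield and retry
	}
	fcntl(sock, F_SETFL, flags);                 // restore the original flags
	return ret;
}
\end{lstlisting}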
219 230 However, when the nonblocking @sendfile@ returns @EAGAIN@, the \CFA server cannot block the \at because its I/O subsystem uses @io_uring@. 220 Therefore, the \at must spin performing the @sendfile@ and yield if the call returns @EAGAIN@. 221 Normally @epoll@ would also be used when these calls to @sendfile@ return @EAGAIN@, but since this would not help in the evaluation of the \CFA runtime, the \CFA webserver simply yields and retries in these cases. 222 223 Interestingly, Linux 5.15 @io_uring@ introduces the ability to limit the number of worker threads that are created, through the @IORING_REGISTER_IOWQ_MAX_WORKERS@ option. 224 Presumably, this limit could prevent the explosion of \glspl{kthrd} which justified using @sendfile@ over @io_uring@ and @splice@. 231 Therefore, the \at spins performing the @sendfile@, yields if the call returns @EAGAIN@ and retries in these cases. 232 233 Interestingly, Linux 5.15 @io_uring@ introduces the ability to limit the number of worker threads that are created through the @IORING_REGISTER_IOWQ_MAX_WORKERS@ option. 234 Presumably, this limit would prevent the explosion of \glspl{kthrd}, which justified using @sendfile@ over @io_uring@ and @splice@. 225 235 However, recall from Section~\ref{iouring} that @io_uring@ maintains two pools of workers: bounded workers and unbounded workers. 226 In the particular case of the webserver, we would want the unbounded workers to handle accepts and reads on socket and bounded workers to handle reading the files from disk. 227 This would allow fine grained countrol over the number of workers needed for each operation type and would presumably lead to good performance. 236 For a web server, the unbounded workers should handle accepts and reads on sockets, and the bounded workers should handle reading files from disk. 237 This setup allows fine-grained control over the number of workers needed for each operation type and presumably leads to good performance. 238 228 239 However, @io_uring@ must contend with another reality of Linux: the versatility of @splice@. 229 Indeed, @splice@ can be used both for reading and writing ,to or from any type of file descriptor.230 This makes it moreambiguous which pool @io_uring@ should delegate @splice@ calls to.231 In the case of splicing from a socket to pipe, @splice@ will behavelike an unbounded operation, but when splicing from a regular file to a pipe, @splice@ becomes a bounded operation.232 To make things more complicated, @splice@ can read from a pipe and write outto a regular file.240 Indeed, @splice@ can be used both for reading and writing to or from any type of file descriptor. 241 This generality makes it ambiguous which pool @io_uring@ should delegate @splice@ calls to. 242 In the case of splicing from a socket to a pipe, @splice@ behaves like an unbounded operation, but when splicing from a regular file to a pipe, @splice@ becomes a bounded operation. 243 To make things more complicated, @splice@ can read from a pipe and write to a regular file. 233 244 In this case, the read is an unbounded operation but the write is a bounded one. 234 245 This leaves @io_uring@ in a difficult situation where it can be very difficult to delegate splice operations to the appropriate type of worker. 
235 Since there is little to no context available to @io_uring@, I believe it makes the decisionto always delegate @splice@ operations to the unbounded workers.236 This is unfortunate for this specific experiment, since it prevents the webserver from limiting the number of calls to @splice@ happening in parallelwithout affecting the performance of @read@ or @accept@.246 Since there is little or no context available to @io_uring@, it seems to always delegate @splice@ operations to the unbounded workers. 247 This decision is unfortunate for this specific experiment since it prevents the web server from limiting the number of parallel calls to @splice@ without affecting the performance of @read@ or @accept@. 237 248 For this reason, the @sendfile@ approach described above is still the most performant solution in Linux 5.15. 238 249 239 Note that it could be possible to workaround this problem, for example by creating more @io_uring@ instances so @splice@ operations can be issued to a different instance than the @read@ and @accept@ operations. 240 However, I do not believe this solution is appropriate in general, it simply replaces a hack in the webserver with a different, equivalent hack. 250 One possible workaround is to create more @io_uring@ instances so @splice@ operations can be issued to a different instance than the @read@ and @accept@ operations. 251 However, I do not believe this solution is appropriate in general; 252 it simply replaces my current web server hack with a different, equivalent hack. 241 253 242 254 \subsection{Benchmark Environment} 243 Unlike the Memcached experiment, the web server experiment is run on a heterogeneous environment.255 Unlike the Memcached experiment, the web server experiment is run on a heterogeneous environment. 244 256 \begin{itemize} 245 257 \item 246 258 The server runs Ubuntu 20.04.4 LTS on top of Linux Kernel 5.13.0-52. 247 259 \item 248 It has an AMD Opteron(tm) Processor 6380 running at 2.5GHz. 260 The server computer has four AMD Opteron\texttrademark Processor 6380 with 16 cores running at 2.5GHz, for a total of 64 \glspl{hthrd}. 261 \item 262 The computer is booted with only 8 CPUs enabled, which is sufficient to achieve line rate. 249 263 \item 250 264 Each CPU has 64 KB, 256 KiB and 8 MB of L1, L2 and L3 caches respectively. 251 265 \item 252 The computer is booted with only 8 CPUs enabled, which is sufficient to achieve line rate.253 \item254 266 The computer is booted with only 25GB of memory to restrict the file-system cache. 255 267 \end{itemize} … … 257 269 \begin{itemize} 258 270 \item 259 A client runs a 2.6.11-1 SMP Linux kernel, which permits each client load -generator to run on a separate CPU.271 A client runs a 2.6.11-1 SMP Linux kernel, which permits each client load generator to run on a separate CPU. 260 272 \item 261 273 It has two 2.8 GHz Xeon CPUs, and four one-gigabit Ethernet cards. 262 274 \item 263 \todo{switch} 275 Network routing is performed by an HP 2530 10 Gigabit Ethernet switch. 264 276 \item 265 277 A client machine runs two copies of the workload generator. 266 278 \end{itemize} 267 279 The clients and network are sufficiently provisioned to drive the server to saturation and beyond. 
268 Hence, any server effects are attributable solely to the runtime system and web server.269 Finally, without restricting the server hardware resources, it is impossible to determine if a runtime system or the web server using it has any specific design restrictions, \eg using space to reduce time.270 Trying to determine these restriction with large numbers of processors or memory simply means running equally large experiments, which takeslonger and are harder to set up.280 Hence, any server effects are attributable solely to the runtime system and web server. 281 Finally, without restricting the server hardware resources, it is impossible to determine if a runtime system or the web server using it has any specific design restrictions, \eg using space to reduce time. 282 Trying to determine these restrictions with large numbers of processors or memory simply means running equally large experiments, which take longer and are harder to set up. 271 283 272 284 \subsection{Throughput} 273 To measure web server throughput, the server computer is loaded with 21,600 files, sharded across 650 directories, occupying about 2.2GB of disk, distributed over the server's RAID-5 4-drives to achieve high throughput for disk I/O.285 To measure web server throughput, the server computer is loaded with 21,600 files, sharded across 650 directories, occupying about 2.2GB of disk, distributed over the server's RAID-5 4-drives to achieve high throughput for disk I/O. 274 286 The clients run httperf~\cite{httperf} to request a set of static files. 275 The httperf load -generator is used with session files to simulate a large number of users and to implement a partially open-loop system.287 The httperf load generator is used with session files to simulate a large number of users and to implement a partially open-loop system. 276 288 This permits httperf to produce overload conditions, generate multiple requests from persistent HTTP/1.1 connections, and include both active and inactive off periods to model browser processing times and user think times~\cite{Barford98}. 277 289 278 290 The experiments are run with 16 clients, each running a copy of httperf (one copy per CPU), requiring a set of 16 log files with requests conforming to a Zipf distribution. 279 This distribution is representative of users accessing static data through a web -browser.280 Each request reads a file name from its trace, establishes a connection, performs an HTTP get-request for the file name, receive the file data, close the connection, and repeatthe process.291 This distribution is representative of users accessing static data through a web browser. 292 Each request reads a file name from its trace, establishes a connection, performs an HTTP GET request for the file name, receives the file data, closes the connection, and repeats the process. 281 293 Some trace elements have multiple file names that are read across a persistent connection. 282 A client times -out if the server does not complete a request within 10 seconds.294 A client times out if the server does not complete a request within 10 seconds. 283 295 284 296 An experiment consists of running a server with request rates ranging from 10,000 to 70,000 requests per second; 285 297 each rate takes about 5 minutes to complete. 286 There is 20 secondsidle time between rates and between experiments to allow connections in the TIME-WAIT state to clear.298 There are 20 seconds of idle time between rates and between experiments to allow connections in the TIME-WAIT state to clear. 
287 299 Server throughput is measured both at peak and after saturation (\ie after peak). 288 300 Peak indicates the level of client requests the server can handle and after peak indicates if a server degrades gracefully. 289 Throughput is measured by aggregating the results from httperf ofall the clients.301 Throughput is measured by aggregating the results from httperf for all the clients. 290 302 291 303 This experiment can be done for two workload scenarios by reconfiguring the server with different amounts of memory: 25 GB and 2.5 GB. … … 305 317 \end{table} 306 318 307 Figure~\ref{fig:swbsrv} shows the results comparing \CFA to NGINX in terms of throughput.308 These results are fairly straightforward.309 Both servers achieve the same throughput until around 57,500 requests per seconds.310 Since the clients are asking for the same files, the fact that the throughput matches exactly is expected as long as both servers are able to serve the desired rate.311 Once the saturation point is reached, both servers are still very close.312 NGINX achieves slightly better throughput.313 However, Figure~\ref{fig:swbsrv:err} shows the rate of errors, a gross approximation of tail latency, where \CFA achieves notably fewer errors once the machine reaches saturation.314 This suggest that \CFA is slightly more fair and NGINX may slightly sacrifice some fairness for improved throughput.315 It demonstrate that the \CFA webserver described above is able to match the performance of NGINX up-to and beyond the saturation point of the machine.316 317 319 \begin{figure} 320 \centering 318 321 \subfloat[][Throughput]{ 319 322 \resizebox{0.85\linewidth}{!}{\input{result.swbsrv.25gb.pstex_t}} … … 325 328 \label{fig:swbsrv:err} 326 329 } 327 \caption[Static Webserver Benchmark : Throughput]{Static Webserver Benchmark : Throughput\smallskip\newline Throughput vs request rate for short lived connectionsconnections.}330 \caption[Static web server Benchmark: Throughput]{Static web server Benchmark: Throughput\smallskip\newline Throughput vs request rate for short-lived connections.} 328 331 \label{fig:swbsrv} 329 332 \end{figure} 330 333 334 Figure~\ref{fig:swbsrv} shows the results comparing \CFA to NGINX in terms of throughput. 335 These results are fairly straightforward. 336 Both servers achieve the same throughput until around 57,500 requests per second. 337 Since the clients are asking for the same files, the fact that the throughput matches exactly is expected as long as both servers are able to serve the request rate. 338 Once the saturation point is reached, both servers are still very close. 339 NGINX achieves slightly better throughput. 340 However, Figure~\ref{fig:swbsrv:err} shows the rate of errors, a gross approximation of tail latency, where \CFA achieves notably fewer errors once the servers reach saturation. 341 This suggests \CFA is slightly fairer with less throughput, while NGINX sacrifices fairness for more throughput. 342 This experiment demonstrates that the \CFA web server is able to match the performance of NGINX up to and beyond the saturation point of the machine. 343 331 344 \subsection{Disk Operations} 332 The throughput was made using a server with 25gb of memory, this was sufficient to hold the entire fileset in addition to all the code and data needed to run the webserver and the rest of the machine. 
333 Previous work like \cit{Cite Ashif's stuff} demonstrate that an interesting follow-up experiment is to rerun the same throughput experiment but allowing significantly less memory on the machine. 334 If the machine is constrained enough, it will force the OS to evict files from the file cache and cause calls to @sendfile@ to have to read from disk. 335 However, in this configuration, the problem with @splice@ and @io_uring@ rears its ugly head again. 345 With 25GB of memory, the entire experimental file-set plus the web server and OS fit in memory. 346 If memory is constrained, the OS must evict files from the file cache, which causes @sendfile@ to read from disk.\footnote{ 347 For the in-memory experiments, the file-system cache was warmed by running an experiment three times before measuring started to ensure all files are in the file-system cache.} 348 Web servers can behave very differently once file I/O begins and increases. 349 Hence, prior work~\cite{Harji10} suggests running both kinds of experiments to test overall web server performance. 350 351 However, after reducing memory to 2.5GB, the problem with @splice@ and @io_uring@ rears its ugly head again. 336 352 Indeed, in the in-memory configuration, replacing @splice@ with calls to @sendfile@ works because the bounded side basically never blocks. 337 353 Like @splice@, @sendfile@ is in a situation where the read side requires bounded blocking, \eg reading from a regular file, while the write side requires unbounded blocking, \eg blocking until the socket is available for writing. 338 354 The unbounded side can be handled by yielding when it returns @EAGAIN@, as mentioned above, but this trick does not work for the bounded side. 339 355 The only solution for the bounded side is to spawn more threads and let these handle the blocking. 340 356 341 357 Supporting this case in the web server would require creating more \procs or creating a dedicated thread pool. 342 However, since what I am to evaluate in this thesis is the runtime of \CFA, I decided to forgo experiments on low memory server. 343 The implementation of the webserver itself is simply too impactful to be an interesting evaluation of the underlying runtime. 357 358 However, I felt this kind of modification moves too far away from my goal of evaluating the \CFA runtime, \ie it begins writing another runtime system; 358 359 hence, I decided to forgo experiments on low-memory performance.
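To make the yield-and-retry trick concrete, the following is an illustrative reconstruction, in plain C, of how the unbounded (socket) side is handled; it is not the actual webserver code, and @yield()@ stands in for the \CFA user-level thread yield.
\begin{cfa}
#include <sys/types.h>
#include <sys/sendfile.h>
#include <errno.h>
// Illustrative sketch: retry a sendfile on a nonblocking socket,
// yielding the user-level thread whenever the socket is not writable.
static ssize_t sendfile_yield( int sock, int file, off_t * off, size_t len ) {
	for ( ;; ) {
		ssize_t ret = sendfile( sock, file, off, len );  // may still touch disk (bounded side)
		if ( ret >= 0 ) return ret;                      // full or partial transfer
		if ( errno != EAGAIN && errno != EWOULDBLOCK ) return -1;  // real error
		yield();                                         // socket full: run other threads, then retry
	}
}
\end{cfa}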
doc/theses/thierry_delisle_PhD/thesis/text/eval_micro.tex
rebf8ca5 r23a08aa0 4 4 This chapter presents five different experimental setups for evaluating the basic features of the \CFA, libfibre~\cite{libfibre}, Go, and Tokio~\cite{Tokio} schedulers. 5 5 All of these systems have a \gls{uthrding} model. 6 The goal in this chapter is show the \CFA scheduler obtains equivalent performance to other less fairschedulers through the different experiments.7 Note ,only the code of the \CFA tests is shown;8 all tests in the other systems are functionally identical and available online~\cite{ SchedulingBenchmarks}.6 The goal of this chapter is to show that the \CFA scheduler obtains equivalent performance to other, less fair, schedulers through the different experiments. 7 Note that only the code of the \CFA tests is shown; 8 all tests in the other systems are functionally identical and available online~\cite{GITHUB:SchedulingBenchmarks}. 9 9 10 10 \section{Benchmark Environment}\label{microenv} … … 13 13 \begin{description} 14 14 \item[AMD] is a server with two AMD EPYC 7662 CPUs and 256GB of DDR4 RAM. 15 The EPYC CPU has 64 cores with 2 \glspl{hthrd} per core, for 128 \glspl{hthrd} per socket with 2 sockets for a total of 256 \glspl{hthrd}.15 The EPYC CPU has 64 cores with 2 \glspl{hthrd} per core, for a total of 128 \glspl{hthrd} per socket with 2 sockets for a total of 256 \glspl{hthrd}. 16 16 Each CPU has 4 MB, 64 MB and 512 MB of L1, L2 and L3 caches, respectively. 17 Each L1 and L2 instance areonly shared by \glspl{hthrd} on a given core, but each L3 instance is shared by 4 cores, therefore 8 \glspl{hthrd}.17 Each L1 and L2 instance is only shared by \glspl{hthrd} on a given core, but each L3 instance is shared by 4 cores, therefore 8 \glspl{hthrd}. 18 18 The server runs Ubuntu 20.04.2 LTS on top of Linux Kernel 5.8.0-55. 19 19 … … 25 25 \end{description} 26 26 27 For all benchmarks, @taskset@ is used to limit the experiment to 1 NUMA node with no hyper 27 For all benchmarks, @taskset@ is used to limit the experiment to 1 NUMA node with no hyperthreading. 28 28 If more \glspl{hthrd} are needed, then 1 NUMA node with hyperthreading is used. 29 29 If still more \glspl{hthrd} are needed, then the experiment is limited to as few NUMA nodes as needed. … … 32 32 On AMD, the same algorithm is used, but the machine only has 2 sockets. 33 33 So hyperthreading\footnote{ 34 Hyperthreading normally refers specifically to the technique used by Intel, however it is often used generically to refer to any equivalent feature.}35 is used when the \proc count reach 65 and 193.36 37 The limited sharing of the last-level cache on the AMD machine is markedly different thanthe Intel machine.34 Hyperthreading normally refers specifically to the technique used by Intel, however, it is often used generically to refer to any equivalent feature.} 35 is used when the \proc count reaches 65 and 193. 36 37 The limited sharing of the last-level cache on the AMD machine is markedly different from the Intel machine. 38 38 Indeed, while on both architectures L2 cache misses that are served by L3 caches on a different CPU incur a significant latency, on the AMD it is also the case that cache misses served by a different L3 instance on the same CPU also incur high latency. 39 39 … … 42 42 Each experiment is run 15 times varying the number of processors depending on the two different computers. 43 43 All experiments gather throughput data and secondary data for scalability or latency. 
44 The data is graphed using a solid and two dashed lines representing the median, maximum and minimum resultrespectively, where the minimum/maximum lines are referred to as the \emph{extremes}.\footnote{44 The data is graphed using a solid, a dashed, and a dotted line, representing the median, maximum and minimum results respectively, where the minimum/maximum lines are referred to as the \emph{extremes}.\footnote{ 45 45 An alternative display is to use error bars with min/max as the bottom/top for the bar. 46 46 However, this approach is not truly an error bar around a mean value and I felt the connected lines are easier to read.} … … 48 48 49 49 For each experiment, four graphs are generated showing traditional throughput on the top row and \newterm{scalability} or \newterm{latency} on the bottom row (peek ahead to Figure~\ref{fig:cycle:jax}). 50 Scalability uses the same data as throughput but the Y 50 Scalability uses the same data as throughput but the Y-axis is calculated as the number of \procs over the throughput. 51 51 In this representation, perfect scalability should appear as a horizontal line, \eg, if doubling the number of \procs doubles the throughput, then the relation stays the same. 52 52 53 The left column shows results for 100 cycles per \proc, enough cyclesto always keep every \proc busy.54 The right column shows results for 1 cycleper \proc, where the ready queues are expected to be near empty most of the time.55 The distinction between 100 and 1 cycles is meaningful because the idle sleep subsystem is expected to matter only in the right column, where spurious effects can cause a \proc to run out of work temporarily.53 The left column shows results for hundreds of \ats per \proc, enough to always keep every \proc busy. 54 The right column shows results for very few \ats per \proc, where the ready queues are expected to be near empty most of the time. 55 The distinction between many and few \ats is meaningful because the idle sleep subsystem is expected to matter only in the right column, where spurious effects can cause a \proc to run out of work temporarily. 56 56 57 57 \section{Cycle} … … 62 62 Hence, systems that perform this optimization have an artificial performance benefit because the yield becomes a \emph{nop}. 63 63 For this reason, I designed a different push/pop benchmark, called \newterm{Cycle Benchmark}. 64 This benchmark arranges a number of\ats into a ring, as seen in Figure~\ref{fig:cycle}, where the ring is a circular singly-linked list.65 At runtime, each \at unparks the next \at before parkingitself.66 Unparking the next \at pushes that \at onto the ready queue while the ensuing park leads to a \at being popped from the ready queue.64 This benchmark arranges several \ats into a ring, as seen in Figure~\ref{fig:cycle}, where the ring is a circular singly-linked list. 65 At runtime, each \at unparks the next \at before \glslink{atblock}{parking} itself. 66 Unparking the next \at pushes that \at onto the ready queue while the ensuing \park leads to a \at being popped from the ready queue. 
67 67 68 68 \begin{figure} 69 69 \centering 70 70 \input{cycle.pstex_t} 71 \caption[Cycle benchmark]{Cycle benchmark\smallskip\newline Each \at unparks the next \at in the cycle before parkingitself.}71 \caption[Cycle benchmark]{Cycle benchmark\smallskip\newline Each \at unparks the next \at in the cycle before \glslink{atblock}{parking} itself.} 72 72 \label{fig:cycle} 73 73 \end{figure} 74 74 75 75 Therefore, the underlying runtime cannot rely on the number of ready \ats staying constant over the duration of the experiment. 76 In fact, the total number of \ats waiting on the ready queue is expected to vary because of the race between the next \at unparking and the current \at parking.76 In fact, the total number of \ats waiting on the ready queue is expected to vary because of the race between the next \at \glslink{atsched}{unparking} and the current \at \glslink{atblock}{parking}. 77 77 That is, the runtime cannot anticipate that the current task immediately parks. 78 78 As well, the size of the cycle is also decided based on this race, \eg a small cycle may see the chain of unparks go full circle before the first \at parks because of time-slicing or multiple \procs. 79 79 If this happens, the scheduler push and pop are avoided and the results of the experiment are skewed. 80 (Note, an unpark is like a V on a semaphore, so the subsequentpark (P) may not block.)81 Every runtime system must handle this race and cannot optimize daway the ready-queue pushes and pops.82 To prevent any attempt of silently omitting ready-queue operations, the ring of \ats is made big enough so the \ats have time to fully park before being unparked again.80 (Note, an \unpark is like a V on a semaphore, so the subsequent \park (P) may not block.) 81 Every runtime system must handle this race and cannot optimize away the ready-queue pushes and pops. 82 To prevent any attempt of silently omitting ready-queue operations, the ring of \ats is made big enough so the \ats have time to fully \park before being unparked again. 83 83 Finally, to further mitigate any underlying push/pop optimizations, especially on SMP machines, multiple rings are created in the experiment. 84 84 85 85 Figure~\ref{fig:cycle:code} shows the pseudo code for this benchmark, where each cycle has 5 \ats. 86 There is additional complexity to handle termination (not shown), which requires a binary semaphore or a channel instead of raw @park@/@unpark@and carefully picking the order of the @P@ and @V@ with respect to the loop condition.86 There is additional complexity to handle termination (not shown), which requires a binary semaphore or a channel instead of raw \park/\unpark and carefully picking the order of the @P@ and @V@ with respect to the loop condition. 87 87 88 88 \begin{figure} … … 99 99 } 100 100 \end{cfa} 101 \caption[Cycle Benchmark : Pseudo Code]{Cycle Benchmark: Pseudo Code}101 \caption[Cycle Benchmark: Pseudo Code]{Cycle Benchmark: Pseudo Code} 102 102 \label{fig:cycle:code} 103 %\end{figure}104 105 103 \bigskip 106 107 %\begin{figure}108 104 \subfloat[][Throughput, 100 cycles per \proc]{ 109 105 \resizebox{0.5\linewidth}{!}{ … … 131 127 \label{fig:cycle:jax:low:ns} 132 128 } 133 \caption[Cycle Benchmark on Intel]{Cycle Benchmark on Intel\smallskip\newline Throughput and scalability as a function of \proc count, 5 \ats per cycle, and different cycle count. For throughput, higher is better, for scalability, lower is better. 
Each series represent 15 independent runs, the dotted lines are maximums while the solid line is the medium.} 129 \caption[Cycle Benchmark on Intel]{Cycle Benchmark on Intel\smallskip\newline Throughput and scalability as a function of \proc count, 5 \ats per cycle, and different cycle counts. 130 For throughput, higher is better, for scalability, lower is better. 131 Each series represent 15 independent runs, the dashed lines are the maximums of each series while the solid lines are the median and the dotted lines are the minimums.} 134 132 \label{fig:cycle:jax} 135 133 \end{figure} … … 161 159 \label{fig:cycle:nasus:low:ns} 162 160 } 163 \caption[Cycle Benchmark on AMD]{Cycle Benchmark on AMD\smallskip\newline Throughput and scalability as a function of \proc count, 5 \ats per cycle, and different cycle count. For throughput, higher is better, for scalability, lower is better. Each series represent 15 independent runs, the dotted lines are extremes while the solid line is the medium.} 161 \caption[Cycle Benchmark on AMD]{Cycle Benchmark on AMD\smallskip\newline Throughput and scalability as a function of \proc count, 5 \ats per cycle, and different cycle counts. 162 For throughput, higher is better, for scalability, lower is better. 163 Each series represent 15 independent runs, the dashed lines are the maximums of each series while the solid lines are the median and the dotted lines are the minimums.} 164 164 \label{fig:cycle:nasus} 165 165 \end{figure} … … 167 167 \subsection{Results} 168 168 169 For the Intel architecture, Figure~\ref{fig:cycle:jax}: 170 \begin{itemize} 171 \item 172 For 100 cycles per \proc (first column), \CFA, Go and Tokio all obtain effectively the same throughput performance. 169 Figures~\ref{fig:cycle:jax} and \ref{fig:cycle:nasus} show the results for the cycle experiment on Intel and AMD, respectively. 170 Looking at the left column on Intel, Figures~\ref{fig:cycle:jax:ops} and \ref{fig:cycle:jax:ns} show the results for 100 cycles of 5 \ats for each \proc. 171 \CFA, Go and Tokio all obtain effectively the same throughput performance. 173 172 Libfibre is slightly behind in this case but still scales decently. 174 As a result of the \gls{kthrd} placement, additional \procs from 25 to 48 offer less performance improvement (flatting of the line) for all runtimes. 175 As expected, this pattern repeats again between \proc count 72 and 96. 176 \item 177 For 1 cycle per \proc, \CFA and Tokio obtain very similar results overall, but Tokio shows more variations in the results. 178 Go achieves slightly better performance. 173 As a result of the \gls{kthrd} placement, additional \procs from 25 to 48 offer less performance improvement for all runtimes, which can be seen as a flattening of the line. 174 This effect even causes a decrease in throughput in libfibre's case. 175 As expected, this pattern repeats between \proc count 72 and 96. 176 177 Looking next at the right column on Intel, Figures~\ref{fig:cycle:jax:low:ops} and \ref{fig:cycle:jax:low:ns} show the results for 1 cycle of 5 \ats for each \proc. 178 \CFA and Tokio obtain very similar results overall, but Tokio shows more variations in the results. 179 Go achieves slightly better performance than \CFA and Tokio, but all three display significantly worst performance compared to the left column. 180 This decrease in performance is likely due to the additional overhead of the idle-sleep mechanism. 
181 This can either be the result of \procs actually running out of work or simply additional overhead from tracking whether or not there is work available. 182 Indeed, unlike the left column, it is likely that the ready queue is transiently empty, which likely triggers additional synchronization steps. 179 183 Interestingly, libfibre achieves better performance with 1 cycle. 180 \end{itemize} 181 182 For the AMD architecture, Figure~\ref{fig:cycle:nasus}, the results show the same story as on the Intel, with close to double the performance overall but with slightly increased variation. 183 The different performance improvements and plateaus are due to cache topology and appear at the expected \proc counts of 64, 128 and 192, for the same reasons as on Intel. 184 \begin{itemize} 185 \item 186 For 100 cycles per \proc, unlike Intel, all 4 runtimes achieve very similar throughput and scalability. 187 \item 188 For 1 cycle per \proc, unlike on Intel, Tokio and Go have the same throughput performance, while \CFA is slightly slower. 189 Again, the same performance increase for libfibre is visible. 190 \end{itemize} 184 185 Looking now at the results for the AMD architecture, Figure~\ref{fig:cycle:nasus}, the results are overall similar to the Intel results, but with close to double the performance, slightly increased variation, and some differences in the details. 186 Note the maximum of the Y-axis on Intel and AMD differ significantly. 187 Looking at the left column on AMD, Figures~\ref{fig:cycle:nasus:ops} and \ref{fig:cycle:nasus:ns} all 4 runtimes achieve very similar throughput and scalability. 188 However, as the number of \procs grows higher, the results on AMD show notably more variability than on Intel. 189 The different performance improvements and plateaus are due to cache topology and appear at the expected: \proc counts of 64, 128 and 192, for the same reasons as on Intel. 190 Looking next at the right column on AMD, Figures~\ref{fig:cycle:nasus:low:ops} and \ref{fig:cycle:nasus:low:ns}, Tokio and Go have the same throughput performance, while \CFA is slightly slower. 191 This result is different than on Intel, where Tokio behaved like \CFA rather than behaving like Go. 192 Again, the same performance increase for libfibre is visible when running fewer \ats. 191 193 Note, I did not investigate the libfibre performance boost for 1 cycle in this experiment. 192 194 193 The conclusion from both architectures is that all of the compared runtime have fairly equivalent performance for this micro-benchmark.194 Clearly, the pathological case with 1 \at per \proc,can affect fairness algorithms managing mostly idle processors, \eg \CFA, but only at high core counts.195 Forthis case, \emph{any} helping is likely to cause a cascade of \procs running out of work and attempting to steal.196 For this experiment, the \CFA scheduler has achieved the goal of obtaining equivalent performance to other less fair schedulers, except for very unusual workloads.195 The conclusion from both architectures is that all of the compared runtimes have fairly equivalent performance for this micro-benchmark. 196 Clearly, the pathological case with 1 cycle per \proc can affect fairness algorithms managing mostly idle processors, \eg \CFA, but only at high core counts. 197 In this case, \emph{any} helping is likely to cause a cascade of \procs running out of work and attempting to steal. 
198 For this experiment, the \CFA scheduler has achieved the goal of obtaining equivalent performance to other, less fair, schedulers. 197 199 198 200 \section{Yield} 199 201 200 For complet ion, the classic yield benchmark is included.202 For completeness, the classic yield benchmark is included. 201 203 Here, the throughput is dominated by the mechanism used to handle the @yield@ function. 202 204 Figure~\ref{fig:yield:code} shows pseudo code for this benchmark, where the cycle @wait/next.wake@ is replaced by @yield@. … … 216 218 } 217 219 \end{cfa} 218 \caption[Yield Benchmark : Pseudo Code]{Yield Benchmark: Pseudo Code}220 \caption[Yield Benchmark: Pseudo Code]{Yield Benchmark: Pseudo Code} 219 221 \label{fig:yield:code} 220 222 %\end{figure} … … 227 229 \label{fig:yield:jax:ops} 228 230 } 229 \subfloat[][Throughput, 1 \at sper \proc]{231 \subfloat[][Throughput, 1 \at per \proc]{ 230 232 \resizebox{0.5\linewidth}{!}{ 231 233 \input{result.yield.low.jax.ops.pstex_t} … … 240 242 \label{fig:yield:jax:ns} 241 243 } 242 \subfloat[][Scalability, 1 \at sper \proc]{244 \subfloat[][Scalability, 1 \at per \proc]{ 243 245 \resizebox{0.5\linewidth}{!}{ 244 246 \input{result.yield.low.jax.ns.pstex_t} … … 246 248 \label{fig:yield:jax:low:ns} 247 249 } 248 \caption[Yield Benchmark on Intel]{Yield Benchmark on Intel\smallskip\newline Throughput and scalability as a function of \proc count, using 1 \ats per \proc. For throughput, higher is better, for scalability, lower is better. Each series represent 15 independent runs, the dotted lines are extremes while the solid line is the medium.} 250 \caption[Yield Benchmark on Intel]{Yield Benchmark on Intel\smallskip\newline Throughput and scalability as a function of \proc count. 251 For throughput, higher is better, for scalability, lower is better. 252 Each series represent 15 independent runs, the dashed lines are the maximums of each series while the solid lines are the median and the dotted lines are the minimums.} 249 253 \label{fig:yield:jax} 250 254 \end{figure} … … 252 256 \subsection{Results} 253 257 254 Figures~\ref{fig:yield:jax} and~\ref{fig:yield:nasus} show the same throughput graphs as @cycle@ on Intel and AMD, respectively. 255 Note, the Y-axis on the yield graph for Intel is twice as large as the Intel cycle-graph. 256 A visual glance between the cycle and yield graphs confirms my claim that the yield benchmark is unreliable. 257 258 For the Intel architecture, Figure~\ref{fig:yield:jax}: 259 \begin{itemize} 260 \item 258 Figures~\ref{fig:yield:jax} and \ref{fig:yield:nasus} show the results for the yield experiment on Intel and AMD, respectively. 259 Looking at the left column on Intel, Figures~\ref{fig:yield:jax:ops} and \ref{fig:yield:jax:ns} show the results for 100 \ats for each \proc. 260 Note that the Y-axis on this graph is twice as large as the Intel cycle graph. 261 A visual glance between the left columns of the cycle and yield graphs confirms my claim that the yield benchmark is unreliable. 261 262 \CFA has no special handling for @yield@, but this experiment requires less synchronization than the @cycle@ experiment. 262 Hence, the @yield@ throughput and scalability graphs for both 100 and 1 cycles/tasks per processor have similar shapes to the corresponding @cycle@ graphs. 263 The only difference is sightly better performance for @yield@ because of less synchronization. 264 As for @cycle@, the cost of idle sleep also comes into play in a very significant way in Figure~\ref{fig:yield:jax:low:ns}, where the scaling is not flat. 
265 \item 266 libfibre has special handling for @yield@ using the fact that the number of ready fibres does not change, and therefore, by-passing the idle-sleep mechanism entirely. 267 Additionally, when only running 1 \at per \proc, libfibre optimizes further, and forgoes the context-switch entirely. 268 Hence, libfibre behaves very differently in the cycle and yield benchmarks, with a 4 times increase in performance for 100 cycles/tasks and an 8 times increase for 1 cycle/task. 269 \item 270 Go has special handling for @yield@ by putting a yielding goroutine on a secondary global ready-queue, giving it lower priority. 263 Hence, the @yield@ throughput and scalability graphs have similar shapes to the corresponding @cycle@ graphs. 264 The only difference is slightly better performance for @yield@ because of less synchronization. 265 Libfibre has special handling for @yield@ using the fact that the number of ready fibres does not change, and therefore, bypassing the idle-sleep mechanism entirely. 266 Hence, libfibre behaves very differently in the cycle and yield benchmarks, with a 4 times increase in performance on the left column. 267 Go has special handling for @yield@ by putting a yielding goroutine on a secondary global ready-queue, giving it a lower priority. 271 268 The result is that multiple \glspl{hthrd} contend for the global queue and performance suffers drastically. 272 Hence, Go behaves very differently in the cycle and yield benchmarks, with a complete performance collapse in @yield@ for both 100 and 1 cycles/tasks. 273 \item 269 Hence, Go behaves very differently in the cycle and yield benchmarks, with a complete performance collapse in @yield@. 274 270 Tokio has a similar performance collapse after 16 processors, and therefore, its special @yield@ handling is probably related to a Go-like scheduler problem and/or a \CFA idle-sleep problem. 275 271 (I did not dig through the Rust code to ascertain the exact reason for the collapse.) 276 \end{itemize} 272 Note that since there is no communication among \ats, locality problems are much less likely than for the cycle benchmark. 273 This lack of communication is probably why the plateaus due to topology are not present. 274 275 Looking next at the right column on Intel, Figures~\ref{fig:yield:jax:low:ops} and \ref{fig:yield:jax:low:ns} show the results for 1 \at for each \proc. 276 As for @cycle@, \CFA's cost of idle sleep comes into play in a very significant way in Figure~\ref{fig:yield:jax:low:ns}, where the scaling is not flat. 277 This result is to be expected since fewer \ats mean \procs are more likely to run out of work. 278 On the other hand, when only running 1 \at per \proc, libfibre optimizes further and forgoes the context switch entirely. 279 This results in libfibre outperforming other runtimes, even more, achieving 8 times more throughput than for @cycle@. 280 Finally, Go and Tokio's performance collapse is still the same with fewer \ats. 281 The only exception is Tokio running on 24 \procs, deepening the mystery of its yielding mechanism further. 277 282 278 283 \begin{figure} … … 302 307 \label{fig:yield:nasus:low:ns} 303 308 } 304 \caption[Yield Benchmark on AMD]{Yield Benchmark on AMD\smallskip\newline Throughput and scalability as a function of \proc count, using 1 \ats per \proc. For throughput, higher is better, for scalability, lower is better. 
Each series represent 15 independent runs, the dotted lines are extremes while the solid line is the medium.} 309 \caption[Yield Benchmark on AMD]{Yield Benchmark on AMD\smallskip\newline Throughput and scalability as a function of \proc count. 310 For throughput, higher is better, for scalability, lower is better. 311 Each series represent 15 independent runs, the dashed lines are the maximums of each series while the solid lines are the median and the dotted lines are the minimums.} 305 312 \label{fig:yield:nasus} 306 313 \end{figure} 307 314 308 For the AMD architecture, Figure~\ref{fig:yield:nasus}, the results show the same story as on the Intel, with slightly increased variations. 309 Also, some transition points on the X-axis differ because of the architectures, like at 16 versus 24 processors. 310 311 It is difficult to draw conclusions for this benchmark when runtime system treat @yield@ so differently. 315 Looking now at the results for the AMD architecture, Figure~\ref{fig:yield:nasus}, the results again show a story that is overall similar to the results on the Intel, with increased variation and some differences in the details. 316 Note that the maximum of the Y-axis on Intel and AMD differ less in @yield@ than @cycle@. 317 Looking at the left column first, Figures~\ref{fig:yield:nasus:ops} and \ref{fig:yield:nasus:ns}, \CFA achieves very similar throughput and scaling. 318 Libfibre still outpaces all other runtimes, but it encounters a performance hit at 64 \procs. 319 This anomaly suggests some amount of communication between the \procs that the Intel machine is able to mask where the AMD is not once hyperthreading is needed. 320 Go and Tokio still display the same performance collapse as on Intel. 321 Looking next at the right column on AMD, Figures~\ref{fig:yield:nasus:low:ops} and \ref{fig:yield:nasus:low:ns}, all runtime systems effectively behave the same as they did on the Intel machine. 322 At the high \ats count, the only difference is Libfibre's scaling and this difference disappears on the right column. 323 This behaviour suggests whatever communication issue it encountered on the left is completely circumvented on the right. 324 325 It is difficult to draw conclusions for this benchmark when runtime systems treat @yield@ so differently. 312 326 The win for \CFA is its consistency between the cycle and yield benchmarks making it simpler for programmers to use and understand, \ie the \CFA semantics match with programmer intuition. 313 327 … … 315 329 \section{Churn} 316 330 317 The Cycle and Yield benchmark represent an \emph{easy} scenario for a scheduler, \eg an embarrassingly parallel application.318 In these benchmarks ,\ats can be easily partitioned over the different \procs upfront and none of the \ats communicate with each other.319 320 The Churn benchmark represents more chaotic executions, where there is more communication among \ats but no relationship between the last \proc on which a \at ran and blocked and the \proc that subsequently unblocks it.331 The Cycle and Yield benchmarks represent an \emph{easy} scenario for a scheduler, \eg an embarrassingly parallel application. 332 In these benchmarks \ats can be easily partitioned over the different \procs upfront and none of the \ats communicate with each other. 333 334 The Churn benchmark represents more chaotic executions, where there is more communication among \ats but no relationship between the last \proc on which a \at ran and blocked, and the \proc that subsequently unblocks it. 
321 335 With processor-specific ready-queues, when a \at is unblocked by a different \proc that means the unblocking \proc must either ``steal'' the \at from another processor or find it on a remote queue. 322 336 This dequeuing results in either contention on the remote queue and/or \glspl{rmr} on the \at data structure. 323 Hence, this benchmark has performance dominated by the cache traffic as \proc are constantly accessing theeach other's data.324 In either case, this benchmark aims to measure how well a scheduler handles these cases ,since both cases can lead to performance degradation if not handled correctly.337 Hence, this benchmark has performance dominated by the cache traffic as \procs are constantly accessing each other's data. 338 In either case, this benchmark aims to measure how well a scheduler handles these cases since both cases can lead to performance degradation if not handled correctly. 325 339 326 340 This benchmark uses a fixed-size array of counting semaphores. 327 Each \at picks a random semaphore, @V@s it to unblock any waiting \at, and then @P@s (maybe blocks) the \at son the semaphore.341 Each \at picks a random semaphore, @V@s it to unblock any waiting \at, and then @P@s (maybe blocks) the \at on the semaphore. 328 342 This creates a flow where \ats push each other out of the semaphores before being pushed out themselves. 329 For this benchmark to work, the number of \ats must be equal or greater than the number of semaphores plus the number of \procs;330 \eg if there are 10 semaphores and 5 \procs, but only 3 \ats, all 3 \ats can block (P) on a random semaphore and now there isno \ats to unblock (V) them.331 Note , the nature of these semaphores meanthe counter can go beyond 1, which can lead to nonblocking calls to @P@.343 For this benchmark to work, the number of \ats must be equal to or greater than the number of semaphores plus the number of \procs; 344 \eg if there are 10 semaphores and 5 \procs, but only 3 \ats, all 3 \ats can block (P) on a random semaphore and now there are no \ats to unblock (V) them. 345 Note that the nature of these semaphores means the counter can go beyond 1, which can lead to nonblocking calls to @P@. 332 346 Figure~\ref{fig:churn:code} shows pseudo code for this benchmark, where the @yield@ is replaced by @V@ and @P@. 333 347 … … 346 360 } 347 361 \end{cfa} 348 \caption[Churn Benchmark : Pseudo Code]{Churn Benchmark: Pseudo Code}362 \caption[Churn Benchmark: Pseudo Code]{Churn Benchmark: Pseudo Code} 349 363 \label{fig:churn:code} 350 364 %\end{figure} … … 364 378 } 365 379 366 \subfloat[][ Latency, 100 \ats per \proc]{380 \subfloat[][Scalability, 100 \ats per \proc]{ 367 381 \resizebox{0.5\linewidth}{!}{ 368 382 \input{result.churn.jax.ns.pstex_t} … … 370 384 \label{fig:churn:jax:ns} 371 385 } 372 \subfloat[][ Latency, 2 \ats per \proc]{386 \subfloat[][Scalability, 2 \ats per \proc]{ 373 387 \resizebox{0.5\linewidth}{!}{ 374 388 \input{result.churn.low.jax.ns.pstex_t} … … 376 390 \label{fig:churn:jax:low:ns} 377 391 } 378 \caption[Churn Benchmark on Intel]{\centering Churn Benchmark on Intel\smallskip\newline Throughput and latency of the Churn on the benchmark on the Intel machine. For throughput, higher is better, for scalability, lower is better. Each series represent 15 independent runs, the dotted lines are extremes while the solid line is the medium.} 392 \caption[Churn Benchmark on Intel]{Churn Benchmark on Intel\smallskip\newline Throughput and scalability as a function of \proc count. 
393 For throughput, higher is better, for scalability, lower is better. 394 Each series represent 15 independent runs, the dashed lines are the maximums of each series while the solid lines are the median and the dotted lines are the minimums.} 379 395 \label{fig:churn:jax} 380 396 \end{figure} … … 382 398 \subsection{Results} 383 399 384 Figures~\ref{fig:churn:jax} and Figure~\ref{fig:churn:nasus} show the throughput on Intel and AMD respectively. 385 386 The performance cost of crossing the cache boundaries is still visible at the same \proc count. 387 388 Scalability is notably worst than the previous benchmarks since there is inherently more communication between processors. 389 Indeed, once the number of \glspl{hthrd} goes beyond a single socket, performance ceases to improve. 400 Figures~\ref{fig:churn:jax} and \ref{fig:churn:nasus} show the results for the churn experiment on Intel and AMD, respectively. 401 Looking at the left column on Intel, Figures~\ref{fig:churn:jax:ops} and \ref{fig:churn:jax:ns} show the results for 100 \ats for each \proc, and all runtimes obtain fairly similar throughput for most \proc counts. 402 \CFA does very well on a single \proc but quickly loses its advantage over the other runtimes. 403 As expected, it scales decently up to 48 \procs, drops from 48 to 72 \procs, and then plateaus. 404 Tokio achieves very similar performance to \CFA, with the starting boost, scaling decently until 48 \procs, drops from 48 to 72 \procs, and starts increasing again to 192 \procs. 405 Libfibre obtains effectively the same results as Tokio with slightly less scaling, \ie the scaling curve is the same but with slightly lower values. 406 Finally, Go gets the most peculiar results, scaling worse than the other runtimes until 48 \procs. 407 At 72 \procs, the results of the Go runtime vary significantly, sometimes scaling, sometimes plateauing. 408 However, beyond this point Go keeps this level of variation but does not scale further in any of the runs. 409 410 Throughput and scalability are notably worse for all runtimes than in the previous benchmarks since there is inherently more communication between processors. 411 Indeed, none of the runtimes reach 40 million operations per second while in the cycle benchmark all but libfibre reached 400 million operations per second. 412 Figures~\ref{fig:churn:jax:ns} and \ref{fig:churn:jax:low:ns} show that for all \proc counts, all runtimes produce poor scaling. 413 However, once the number of \glspl{hthrd} goes beyond a single socket, at 48 \procs, scaling goes from bad to worse and performance completely ceases to improve. 414 At this point, the benchmark is dominated by inter-socket communication costs for all runtimes. 415 390 416 An interesting aspect to note here is that the runtimes differ in how they handle this situation. 391 Indeed, when a \proc unparks a \at that was last run on a different \proc, the \at could be appended to the ready-queue local \proc or to the ready-queue of the remote \proc, which previously ran the \at. 392 \CFA, Tokio and Go all use the approach of unparking to the local \proc while Libfibre unparks to the remote \proc. 393 In this particular benchmark, the inherent chaos of the benchmark in addition to small memory footprint means neither approach wins over the other. 417 Indeed, when a \proc unparks a \at that was last run on a different \proc, the \at could be appended to the ready queue of the local \proc or to the ready queue of the remote \proc, which previously ran the \at.
418 \CFA, Tokio and Go all use the approach of \glslink{atsched}{unparking} to the local \proc, while Libfibre unparks to the remote \proc. 419 In this particular benchmark, the inherent chaos of the benchmark, in addition to the small memory footprint, means neither approach wins over the other. 420 421 Looking next at the right column on Intel, Figures~\ref{fig:churn:jax:low:ops} and \ref{fig:churn:jax:low:ns} show the results for 1 \at for each \proc, and many of the differences between the runtimes disappear. 422 \CFA outperforms other runtimes by a minuscule margin. 423 Libfibre follows very closely behind with basically the same performance and scaling. 424 Tokio maintains effectively the same curve shapes as \CFA and libfibre, but it incurs extra costs for all \proc counts. 425 While Go maintains overall similar results to the others, it again encounters significant variation at high \proc counts. 426 Inexplicably resulting in super-linear scaling for some runs, \ie the scalability curves display a negative slope. 427 428 Interestingly, unlike the cycle benchmark, running with fewer \ats does not produce drastically different results. 429 In fact, the overall throughput stays almost exactly the same on the left and right columns. 394 430 395 431 \begin{figure} … … 407 443 } 408 444 409 \subfloat[][ Latency, 100 \ats per \proc]{445 \subfloat[][Scalability, 100 \ats per \proc]{ 410 446 \resizebox{0.5\linewidth}{!}{ 411 447 \input{result.churn.nasus.ns.pstex_t} … … 413 449 \label{fig:churn:nasus:ns} 414 450 } 415 \subfloat[][ Latency, 2 \ats per \proc]{451 \subfloat[][Scalability, 2 \ats per \proc]{ 416 452 \resizebox{0.5\linewidth}{!}{ 417 453 \input{result.churn.low.nasus.ns.pstex_t} … … 419 455 \label{fig:churn:nasus:low:ns} 420 456 } 421 \caption[Churn Benchmark on AMD]{\centering Churn Benchmark on AMD\smallskip\newline Throughput and latency of the Churn on the benchmark on the AMD machine. 422 For throughput, higher is better, for scalability, lower is better. Each series represent 15 independent runs, the dotted lines are extremes while the solid line is the medium.} 457 \caption[Churn Benchmark on AMD]{Churn Benchmark on AMD\smallskip\newline Throughput and scalability as a function of \proc count. 458 For throughput, higher is better, for scalability, lower is better. 459 Each series represent 15 independent runs, the dashed lines are the maximums of each series while the solid lines are the median and the dotted lines are the minimums.} 423 460 \label{fig:churn:nasus} 424 461 \end{figure} 425 462 426 Like for the cycle benchmark, here all runtimes achieve fairly similar performance. 427 Performance improves as long as all \procs fit on a single socket. 428 Beyond that performance starts to suffer from increased caching costs. 429 430 Indeed on Figures~\ref{fig:churn:jax:ops} and \ref{fig:churn:jax:ns} show that with 1 and 100 \ats per \proc, \CFA, libfibre, Go and Tokio achieve effectively equivalent performance for most \proc count. 431 432 However, Figure~\ref{fig:churn:nasus} again shows a somewhat different story on AMD. 433 While \CFA, libfibre, and Tokio achieve effectively equivalent performance for most \proc count, Go starts with better scaling at very low \proc counts but then performance quickly plateaus, resulting in worse performance at higher \proc counts. 434 This performance difference is visible at both high and low \at counts. 
435 436 One possible explanation for this difference is that since Go has very few available concurrent primitives, a channel was used instead of a semaphore. 437 On paper a semaphore can be replaced by a channel and with zero-sized objects passed along equivalent performance could be expected. 438 However, in practice there can be implementation difference between the two. 439 This is especially true if the semaphore count can get somewhat high. 440 Note that this replacement is also made in the cycle benchmark, however in that context it did not seem to have a notable impact. 441 442 As second possible explanation is that Go may sometimes use the heap when allocating variables based on the result of escape analysis of the code. 443 It is possible that variables that should be placed on the stack are placed on the heap. 444 This could cause extra pointer chasing in the benchmark, heightening locality effects. 445 Depending on how the heap is structure, this could also lead to false sharing. 446 447 The objective of this benchmark is to demonstrate that unparking \ats from remote \procs do not cause too much contention on the local queues. 448 Indeed, the fact all runtimes achieve some scaling at lower \proc count demonstrate that migrations do not need to be serialized. 449 Again these result demonstrate \CFA achieves satisfactory performance. 463 464 Looking now at the results for the AMD architecture, Figure~\ref{fig:churn:nasus}, the results show a somewhat different story. 465 Looking at the left column first, Figures~\ref{fig:churn:nasus:ops} and \ref{fig:churn:nasus:ns}, \CFA, Libfibre and Tokio all produce decent scalability. 466 \CFA suffers particularly from larger variations at higher \proc counts, but largely outperforms the other runtimes. 467 Go still produces intriguing results in this case and even more intriguingly, the results have fairly low variation. 468 469 One possible explanation for Go's difference is that it has very few available concurrent primitives, so a channel is substituted for a semaphore. 470 On paper, a semaphore can be replaced by a channel, and with zero-sized objects passed through the channel, equivalent performance could be expected. 471 However, in practice, there are implementation differences between the two, \eg if the semaphore count can get somewhat high so objects accumulate in the channel. 472 Note that this substitution is also made in the cycle benchmark; 473 however, in that context, it did not have a notable impact. 474 475 A second possible explanation is that Go may use the heap when allocating variables based on the result of the escape analysis of the code. 476 It is possible for variables that could be placed on the stack to instead be placed on the heap. 477 This placement could cause extra pointer chasing in the benchmark, heightening locality effects. 478 Depending on how the heap is structured, this could also lead to false sharing. 479 I did not investigate what causes these unusual results. 480 481 Looking next at the right column, Figures~\ref{fig:churn:nasus:low:ops} and \ref{fig:churn:nasus:low:ns}, as for Intel, all runtimes obtain overall similar throughput between the left and right column. 482 \CFA, Libfibre and Tokio all have very close results. 483 Go still suffers from poor scalability but is now unusual in a different way. 484 While it obtains effectively constant performance regardless of \proc count, this ``sequential'' performance is higher than the other runtimes for low \proc count. 
485 This advantage holds up to 32 \procs, after which the other runtimes manage to outscale Go. 486 487 In conclusion, the objective of this benchmark is to demonstrate that \glslink{atsched}{unparking} \ats from remote \procs does not cause too much contention on the local queues. 488 Indeed, the fact that most runtimes achieve some scaling between various \proc counts demonstrates migrations do not need to be serialized. 489 Again these results demonstrate that \CFA achieves satisfactory performance compared to the other runtimes. 450 490 451 491 \section{Locality} 492 493 As mentioned in the churn benchmark, when \glslink{atsched}{unparking} a \at, it is possible to either \unpark to the local or remote ready-queue.\footnote{ 494 It is also possible to \unpark to a third unrelated ready-queue, but without additional knowledge about the situation, it is likely to degrade performance.} 495 The locality experiment includes two variations of the churn benchmark, where a data array is added. 496 In both variations, before @V@ing the semaphore, each \at calls a @work@ function which increments random cells inside the data array. 497 In the noshare variation, the array is not passed on and each thread continuously accesses its private array. 498 In the share variation, the array is passed to another thread via the semaphore's shadow queue (each blocking thread can save a word of user data in its blocking node), transferring ownership of the array to the woken thread. 499 Figure~\ref{fig:locality:code} shows the pseudo code for this benchmark. 500 501 The objective here is to highlight the different decisions made by the runtime when \glslink{atsched}{unparking}. 502 Since each thread unparks a random semaphore, it is unlikely that a \at is unparked from the last \proc it ran on. 503 In the noshare variation, \glslink{atsched}{unparking} the \at on the local \proc is an appropriate choice since the data was last modified on that \proc. 504 In the share variation, \glslink{atsched}{unparking} the \at on a remote \proc is an appropriate choice. 505 506 The expectation for this benchmark is to see a performance inversion, where runtimes fare notably better in the variation which matches their \glslink{atsched}{unparking} policy. 507 This decision should lead to \CFA, Go and Tokio achieving better performance in the share variation while libfibre achieves better performance in noshare. 508 Indeed, \CFA, Go and Tokio have the default policy of \glslink{atsched}{unparking} \ats on the local \proc, whereas libfibre has the default policy of \glslink{atsched}{unparking} \ats wherever they last ran.
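The shadow queue mentioned above is the only non-standard piece of this setup, so a rough sketch of the idea is given below; this is not the \CFA library implementation, all internal synchronization is elided, and every identifier is hypothetical: the point is simply that each blocked thread's node carries one word of user data, which @V@ fills in before waking the thread.
\begin{cfa}
// Hypothetical sketch of a counting semaphore with a shadow queue;
// synchronization elided for brevity.
struct waiter { struct waiter * next; void * payload; struct thread * owner; };
struct shadow_sem { int count; struct waiter * queue; };

void * P( struct shadow_sem * s, void * mine ) {   // returns the array this thread now owns
	if ( s->count > 0 ) { s->count -= 1; return mine; }  // nonblocking case: keep the private array
	struct waiter w = { 0, mine, self() };
	enqueue( &s->queue, &w );                 // save one word of user data in the blocking node
	park();                                   // block until a matching V
	return w.payload;                         // array handed over by the waker
}

void V( struct shadow_sem * s, void * give ) {
	struct waiter * w = dequeue( &s->queue );
	if ( ! w ) { s->count += 1; return; }     // no waiter: just bump the count
	w->payload = give;                        // transfer ownership of the array
	unpark( w->owner );                       // wake the blocked thread
}
\end{cfa}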
452 509 453 510 \begin{figure} … … 493 550 \end{lrbox} 494 551 495 \subfloat[ Thread$_1$]{\label{f:CFibonacci}\usebox\myboxA}552 \subfloat[Noshare]{\label{fig:locality:code:T1}\usebox\myboxA} 496 553 \hspace{3pt} 497 554 \vrule 498 555 \hspace{3pt} 499 \subfloat[ Thread$_2$]{\label{f:CFAFibonacciGen}\usebox\myboxB}500 501 \caption[Locality Benchmark : Pseudo Code]{Locality Benchmark: Pseudo Code}556 \subfloat[Share]{\label{fig:locality:code:T2}\usebox\myboxB} 557 558 \caption[Locality Benchmark: Pseudo Code]{Locality Benchmark: Pseudo Code} 502 559 \label{fig:locality:code} 503 560 \end{figure} 504 561 505 As mentioned in the churn benchmark, when unparking a \at, it is possible to either unpark to the local or remote ready-queue.506 \footnote{It is also possible to unpark to a third unrelated ready-queue, but without additional knowledge about the situation, there is little to suggest this would not degrade performance.}507 The locality experiment includes two variations of the churn benchmark, where an array of data is added.508 In both variations, before @V@ing the semaphore, each \at increment random cells inside the array.509 The @share@ variation then passes the array to the shadow-queue of the semaphore, transferring ownership of the array to the woken thread.510 In the @noshare@ variation the array is not passed on and each thread continuously accesses its private array.511 512 The objective here is to highlight the different decision made by the runtime when unparking.513 Since each thread unparks a random semaphore, it means that it is unlikely that a \at will be unparked from the last \proc it ran on.514 In the @share@ version, this means that unparking the \at on the local \proc is appropriate since the data was last modified on that \proc.515 In the @noshare@ version, the unparking the \at on the remote \proc is the appropriate approach.516 517 The expectation for this benchmark is to see a performance inversion, where runtimes will fare notably better in the variation which matches their unparking policy.518 This should lead to \CFA, Go and Tokio achieving better performance in @share@ while libfibre achieves better performance in @noshare@.519 Indeed, \CFA, Go and Tokio have the default policy of unpark \ats on the local \proc, where as libfibre has the default policy of unparks \ats wherever they last ran.520 521 562 \subsection{Results} 522 563 564 Figures~\ref{fig:locality:jax} and \ref{fig:locality:nasus} show the results for the locality experiment on Intel and AMD, respectively. 565 In both cases, the graphs on the left column show the results for the share variation and the graphs on the right column show the results for the noshare. 566 Looking at the left column on Intel, Figures~\ref{fig:locality:jax:share:ops} and \ref{fig:locality:jax:share:ns} show the results for the share variation. 567 \CFA and Tokio slightly outperform libfibre, as expected, based on their \ats placement approach. 568 \CFA and Tokio both \unpark locally and do not suffer cache misses on the transferred array. 569 Libfibre, on the other hand, unparks remotely, and as such the unparked \at is likely to miss on the shared data. 570 Go trails behind in this experiment, presumably for the same reasons that were observable in the churn benchmark. 571 Otherwise, the results are similar to the churn benchmark, with lower throughput due to the array processing. 
572 As for most previous results, all runtimes suffer a performance hit after 48 \procs, which is the socket boundary, and climb again from 96 to 192 \procs. 573 523 574 \begin{figure} 524 575 \subfloat[][Throughput share]{ … … 547 598 \label{fig:locality:jax:noshare:ns} 548 599 } 549 \caption[Locality Benchmark on Intel]{Locality Benchmark on Intel\smallskip\newline Throughput and scalability as a function of \proc count. For throughput, higher is better, for scalability, lower is better. Each series represent 15 independent runs, the dotted lines are extremes while the solid line is the medium.} 600 \caption[Locality Benchmark on Intel]{Locality Benchmark on Intel\smallskip\newline Throughput and scalability as a function of \proc count. 601 For throughput, higher is better, for scalability, lower is better. 602 Each series represent 15 independent runs, the dashed lines are the maximums of each series while the solid lines are the median and the dotted lines are the minimums.} 550 603 \label{fig:locality:jax} 551 604 \end{figure} 605 552 606 \begin{figure} 553 607 \subfloat[][Throughput share]{ … … 576 630 \label{fig:locality:nasus:noshare:ns} 577 631 } 578 \caption[Locality Benchmark on AMD]{Locality Benchmark on AMD\smallskip\newline Throughput and scalability as a function of \proc count. For throughput, higher is better, for scalability, lower is better. Each series represent 15 independent runs, the dotted lines are extremes while the solid line is the medium.} 632 \caption[Locality Benchmark on AMD]{Locality Benchmark on AMD\smallskip\newline Throughput and scalability as a function of \proc count. 633 For throughput, higher is better, for scalability, lower is better. 634 Each series represent 15 independent runs, the dashed lines are the maximums of each series while the solid lines are the median and the dotted lines are the minimums.} 579 635 \label{fig:locality:nasus} 580 636 \end{figure} 581 637 582 Figures~\ref{fig:locality:jax} and \ref{fig:locality:nasus} shows the results on Intel and AMD respectively. 583 In both cases, the graphs on the left column show the results for the @share@ variation and the graphs on the right column show the results for the @noshare@. 584 585 On Intel, Figure~\ref{fig:locality:jax} shows Go trailing behind the 3 other runtimes. 586 On the left of the figure showing the results for the shared variation, where \CFA and Tokio slightly outperform libfibre as expected. 587 And correspondingly on the right, we see the expected performance inversion where libfibre now outperforms \CFA and Tokio. 588 Otherwise the results are similar to the churn benchmark, with lower throughput due to the array processing. 589 Presumably the reason why Go trails behind are the same as in Figure~\ref{fig:churn:nasus}. 590 591 Figure~\ref{fig:locality:nasus} shows the same experiment on AMD. 592 \todo{why is cfa slower?} 593 Again, we see the same story, where Tokio and libfibre swap places and Go trails behind. 638 Looking at the right column on Intel, Figures~\ref{fig:locality:jax:noshare:ops} and \ref{fig:locality:jax:noshare:ns} show the results for the noshare variation. 639 The graphs show the expected performance inversion where libfibre now outperforms \CFA and Tokio. 640 Indeed, in this case, unparking remotely means the unparked \at is less likely to suffer a cache miss on the array, which leaves the \at data structure and the remote queue as the only source of likely cache misses. 641 Results show both are amortized fairly well in this case. 
642 \CFA and Tokio both \unpark locally and, as a result, suffer a marginal performance degradation from the cache miss on the array. 643 644 Looking at the results for the AMD architecture, Figure~\ref{fig:locality:nasus} shows results similar to those on Intel. 645 Again, the overall performance is higher and slightly more variation is visible. 646 Looking at the left column first, Figures~\ref{fig:locality:nasus:share:ops} and \ref{fig:locality:nasus:share:ns}, \CFA and Tokio still outperform libfibre, this time more significantly. 647 This advantage is expected from the AMD server with its smaller and narrower caches that magnify the costs of processing the array. 648 Go still has the same poor performance as on Intel. 649 650 Finally, looking at the right column, Figures~\ref{fig:locality:nasus:noshare:ops} and \ref{fig:locality:nasus:noshare:ns}, like on Intel, the same performance inversion is present between libfibre and \CFA/Tokio. 651 Go still has the same poor performance. 652 653 Overall, this benchmark mostly demonstrates the two options available when \glslink{atsched}{unparking} a \at. 654 Depending on the workload, either of these options can be the appropriate one. 655 Since it is prohibitively difficult to dynamically detect which approach is appropriate, all runtimes must choose one of the two and live with the consequences. 656 657 Once again, these experiments demonstrate that \CFA achieves equivalent performance to the other runtimes, in this case matching the faster Tokio rather than Go, which is trailing behind. 594 658 595 659 \section{Transfer} 596 660 The last benchmark is more of an experiment than a benchmark. 597 661 It tests the behaviour of the schedulers for a misbehaved workload. 598 In this workload, one of the\at is selected at random to be the leader. 662 In this workload, one \at is selected at random to be the leader. 599 663 The leader then spins in a tight loop until it has observed that all other \ats have acknowledged its leadership. 600 664 The leader \at then picks a new \at to be the next leader and the cycle repeats. 601 The benchmark comes in two flavours for the non-leader \ats: 665 The benchmark comes in two variations for the non-leader \ats: 602 666 once they acknowledged the leader, they either block on a semaphore or spin yielding. 603 604 The experiment is designed to evaluate the short-term load-balancing of a scheduler. 605 Indeed, schedulers where the runnable \ats are partitioned on the \procs may need to balance the \ats for this experiment to terminate. 606 This problem occurs because the spinning \at is effectively preventing the \proc from running any other \at. 607 In the semaphore flavour, the number of runnable \ats eventually dwindles down to only the leader. 608 This scenario is a simpler case to handle for schedulers since \procs eventually run out of work. 609 In the yielding flavour, the number of runnable \ats stays constant. 610 This scenario is a harder case to handle because corrective measures must be taken even when work is available. 611 Note, runtime systems with preemption circumvent this problem by forcing the spinner to yield. 612 613 In both flavours, the experiment effectively measures how long it takes for all \ats to run once after a given synchronization point.
614 In an ideal scenario where the scheduler is strictly FIFO, every thread would run once after the synchronization and therefore the delay between leaders would be given by: 615 $ \frac{CSL + SL}{NP - 1}$, where $CSL$ is the context switch latency, $SL$ is the cost for enqueueing and dequeuing a \at and $NP$ is the number of \procs. 616 However, if the scheduler allows \ats to run many times before other \ats are able to run once, this delay will increase. 617 The semaphore version is an approximation of the strictly FIFO scheduling, where none of the \ats \emph{attempt} to run more than once. 618 The benchmark effectively provides the fairness guarantee in this case. 619 In the yielding version however, the benchmark provides no such guarantee, which means the scheduler has full responsibility and any unfairness will be measurable. 620 621 While this is a fairly artificial scenario, it requires only a few simple pieces. 622 The yielding version of this simply creates a scenario where a \at runs uninterrupted in a saturated system, and starvation has an easily measured impact. 623 However, \emph{any} \at that runs uninterrupted for a significant period of time in a saturated system could lead to this kind of starvation. 667 Figure~\ref{fig:transfer:code} shows pseudo code for this benchmark. 624 668 625 669 \begin{figure} … … 641 685 // pick next leader 642 686 leader := threads[ prng() % len(threads) ] 643 // wake every 687 // wake everyone 644 688 if ! exhaust { 645 689 for t in threads { … … 660 704 } 661 705 \end{cfa} 662 \caption[Transfer Benchmark : Pseudo Code]{Transfer Benchmark: Pseudo Code}706 \caption[Transfer Benchmark: Pseudo Code]{Transfer Benchmark: Pseudo Code} 663 707 \label{fig:transfer:code} 664 708 \end{figure} 665 709 710 The experiment is designed to evaluate the short-term load balancing of a scheduler. 711 Indeed, schedulers where the runnable \ats are partitioned on the \procs may need to balance the \ats for this experiment to terminate. 712 This problem occurs because the spinning \at is effectively preventing the \proc from running any other \at. 713 In the semaphore variation, the number of runnable \ats eventually dwindles to only the leader. 714 This scenario is a simpler case to handle for schedulers since \procs eventually run out of work. 715 In the yielding variation, the number of runnable \ats stays constant. 716 This scenario is a harder case to handle because corrective measures must be taken even when work is available. 717 Note that runtimes with preemption circumvent this problem by forcing the spinner to yield. 718 In \CFA preemption was disabled as it only obfuscates the results. 719 I am not aware of a method to disable preemption in Go. 720 721 In both variations, the experiment effectively measures how long it takes for all \ats to run once after a given synchronization point. 722 In an ideal scenario where the scheduler is strictly FIFO, every thread would run once after the synchronization and therefore the delay between leaders would be given by, $(CSL + SL) / (NP - 1)$, 723 where $CSL$ is the context-switch latency, $SL$ is the cost for enqueueing and dequeuing a \at, and $NP$ is the number of \procs. 724 However, if the scheduler allows \ats to run many times before other \ats can run once, this delay increases. 725 The semaphore version is an approximation of strictly FIFO scheduling, where none of the \ats \emph{attempt} to run more than once. 726 The benchmark effectively provides the fairness guarantee in this case. 
727 In the yielding version, however, the benchmark provides no such guarantee, which means the scheduler has full responsibility and any unfairness is measurable. 728 729 While this is an artificial scenario, in real life it requires only a few simple pieces. 730 The yielding version simply creates a scenario where a \at runs uninterrupted in a saturated system and the starvation has an easily measured impact. 731 Hence, \emph{any} \at that runs uninterrupted for a significant time in a saturated system could lead to this kind of starvation. 732 666 733 \subsection{Results} 667 \begin{figure} 734 735 \begin{table} 736 \caption[Transfer Benchmark on Intel and AMD]{Transfer Benchmark on Intel and AMD\smallskip\newline Average measurement of how long it takes for all \ats to acknowledge the leader \at. 737 DNC stands for ``did not complete'', meaning that after 5 seconds of a new leader being decided, some \ats still had not acknowledged the new leader.} 738 \label{fig:transfer:res} 739 \setlength{\extrarowheight}{2pt} 740 \setlength{\tabcolsep}{5pt} 668 741 \begin{centering} 669 \begin{tabular}{r | c c c c | c c c c } 670 Machine & \multicolumn{4}{c |}{Intel} & \multicolumn{4}{c}{AMD} \\ 671 Variation & \multicolumn{2}{c}{Park} & \multicolumn{2}{c |}{Yield} & \multicolumn{2}{c}{Park} & \multicolumn{2}{c}{Yield} \\ 742 \begin{tabular}{r | c | c | c | c | c | c | c | c} 743 Machine & \multicolumn{4}{c |}{Intel} & \multicolumn{4}{c}{AMD} \\ 744 \cline{2-9} 745 Variation & \multicolumn{2}{c|}{Park} & \multicolumn{2}{c |}{Yield} & \multicolumn{2}{c|}{Park} & \multicolumn{2}{c}{Yield} \\ 746 \cline{2-9} 672 747 \procs & 2 & 192 & 2 & 192 & 2 & 256 & 2 & 256 \\ 673 748 \hline … … 678 753 \end{tabular} 679 754 \end{centering} 680 \caption[Transfer Benchmark on Intel and AMD]{Transfer Benchmark on Intel and AMD\smallskip\newline Average measurement of how long it takes for all \ats to acknowledge the leader \at. DNC stands for ``did not complete'', meaning that after 5 seconds of a new leader being decided, some \ats still had not acknowledged the new leader.} 681 \label{fig:transfer:res} 682 \end{figure} 683 684 Figure~\ref{fig:transfer:res} shows the result for the transfer benchmark with 2 \procs and all \procs, where each experiment runs 100 \at per \proc. 755 \end{table} 756 757 Table~\ref{fig:transfer:res} shows the result for the transfer benchmark with 2 \procs and all \procs on the computer, where each experiment runs 100 \ats per \proc. 685 758 Note that the results here are only meaningful as a coarse measurement of fairness, beyond which small cost differences in the runtime and concurrent primitives begin to matter. 686 As such, data points that are the on the same order of magnitude as each other should be basicallyconsidered equal.687 Th e takeaway of this experiment is the presence of very large differences.688 The semaphore variation is denoted ``Park'', where the number of \ats dwindles downas the new leader is acknowledged.759 As such, data points within the same order of magnitude are considered equal. 760 That is, the takeaway of this experiment is the presence of very large differences. 761 The semaphore variation is denoted ``Park'', where the number of \ats dwindles as the new leader is acknowledged. 689 762 The yielding variation is denoted ``Yield''. 690 The experiment was only run for the extremes of the number of cores since the scaling per core behaves like previous experiments. 
691 This experiments clearly demonstrate that while the other runtimes achieve similar performance in previous benchmarks, here \CFA achieves significantly better fairness. 692 The semaphore variation serves as a control group, where all runtimes are expected to transfer leadership fairly quickly. 693 Since \ats block after acknowledging the leader, this experiment effectively measures how quickly \procs can steal \ats from the \proc running leader. 694 Figure~\ref{fig:transfer:res} shows that while Go and Tokio are slower, all runtime achieve decent latency. 763 The experiment is only run with a few and with many \procs, since scaling is not the focus of this experiment. 764 765 The first two columns show the results for the semaphore variation on Intel. 766 While there are some differences in latencies, with \CFA consistently the fastest and Tokio the slowest, all runtimes achieve fairly close results. 767 Again, this experiment is meant to highlight major differences, so latencies within $10\times$ of each other are considered equal. 768 769 Looking at the next two columns, the results for the yield variation on Intel, the story is very different. 770 \CFA achieves better latencies, presumably because the yield requires no synchronization. 771 Go does complete the experiment, but with drastically higher latency: 772 latency at 2 \procs is $350\times$ higher than \CFA and $70\times$ higher at 192 \procs. 773 This difference is because Go has a classic work-stealing scheduler, but it adds coarse-grained preemption, 774 which interrupts the spinning leader after a period. 775 Neither Libfibre nor Tokio complete the experiment. 776 Both runtimes also use classical work-stealing scheduling without preemption, and therefore, none of the work queues are ever emptied, so no load balancing occurs. 777 778 Looking now at the AMD architecture, the results show effectively the same story. 779 The first two columns show all runtimes obtaining results well within $10\times$ of each other. 780 The next two columns again show \CFA producing low latencies, while Go still has notably higher latency, but the difference is less drastic at 2 \procs, where it produces a $15\times$ difference, as opposed to a $100\times$ difference at 256 \procs. 781 Neither Libfibre nor Tokio complete the experiment. 782 783 This experiment clearly demonstrates that \CFA achieves significantly better fairness. 784 The semaphore variation serves as a control, where all runtimes are expected to transfer leadership fairly quickly. 785 Since \ats block after acknowledging the leader, this experiment effectively measures how quickly \procs can steal \ats from the \proc running the leader. 786 Table~\ref{fig:transfer:res} shows that while Go and Tokio are slower using the semaphore, all runtimes achieve decent latency. 787 695 788 However, the yielding variation shows an entirely different picture. 696 Since libfibre and Tokio have a traditional work-stealing scheduler, \procs that have \ats on their local queues willnever steal from other \procs. 697 The result is that the experiment simply does not complete for these runtime . 698 Without \procs stealing from the \proc running the leader, the experiment will simply neverterminate. 789 Since libfibre and Tokio have a traditional work-stealing scheduler, \procs that have \ats on their local queues never steal from other \procs. 790 The result is that the experiment simply does not complete for these runtimes.
791 Without \procs stealing from the \proc running the leader, the experiment cannot terminate. 699 792 Go manages to complete the experiment because it adds preemption on top of classic work-stealing. 700 However, since preemption is fairly costlyit achieves significantly worst performance. 793 However, since preemption is fairly infrequent, it achieves significantly worse performance. 701 794 In contrast, \CFA achieves equivalent performance in both variations, demonstrating very good fairness. 702 Interestingly \CFA achieves better delays in the yielding version than the semaphore version, however, that is likely due to fairness being equivalent but removing the cost of the semaphores and idle -sleep. 795 Interestingly, \CFA achieves better delays in the yielding version than in the semaphore version; however, that is likely due to fairness being equivalent while removing the cost of the semaphores and idle sleep. -
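To make the ``Park'' and ``Yield'' variations in Table~\ref{fig:transfer:res} concrete, the following is a minimal C sketch of the non-leader side of the benchmark, assuming C11 atomics and POSIX semaphores; @leader@, @acks@, @sleep_sem@ and @follower@ are illustrative stand-ins rather than the benchmark's actual code, and the case where a \at is itself chosen as the next leader is omitted.
\begin{cfa}
#include <stdatomic.h>
#include <sched.h>
#include <semaphore.h>

extern _Atomic int leader;        // index of the current leader
extern _Atomic int acks;          // how many threads have acknowledged it
extern sem_t sleep_sem[];         // one semaphore per thread, posted by the leader

static void follower( int me, int park ) {
	for ( ;; ) {
		int lead = atomic_load( &leader );
		atomic_fetch_add( &acks, 1 );             // acknowledge the new leader
		if ( park ) {                             // Park: block until the leader wakes everyone
			sem_wait( &sleep_sem[ me ] );
		} else {                                  // Yield: spin, yielding, until leadership moves
			while ( atomic_load( &leader ) == lead ) {
				sched_yield();
			}
		}
	}
}
\end{cfa}
In the Park variation, every \proc except the leader's eventually runs out of work, whereas in the Yield variation every \proc always has a runnable \at, so any rebalancing must happen while local work is still available.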
doc/theses/thierry_delisle_PhD/thesis/text/existing.tex
rebf8ca5 r23a08aa0 5 5 6 6 In general, \emph{selecting} a scheduling algorithm depends on how much information is available to the scheduler. 7 Workloads that are well -known, consistent, and homogeneous can benefit from a scheduler that is optimized to use this information, while ill-defined, inconsistent, heterogeneous workloads require general non-optimal algorithms.7 Workloads that are well known, consistent, and homogeneous can benefit from a scheduler that is optimized to use this information, while ill-defined, inconsistent, heterogeneous workloads require general non-optimal algorithms. 8 8 A secondary aspect is how much information can be gathered versus how much information must be given as part of the scheduler input. 9 9 This information adds to the spectrum of scheduling algorithms, going from static schedulers that are well informed from the start, to schedulers that gather most of the information needed, to schedulers that can only rely on very limited information. 10 Note, this description includes both information about each request s, \eg time to complete or resources needed, and information about the relationships among request, \eg whether or not some requestmust be completed before another request starts.11 12 Scheduling physical resources, \eg in an assembly line, is generally amenable to using well-informed scheduling , since information can be gathered much faster than the physical resources can be assigned and workloads are likely to stay stable for long periods of time.10 Note, this description includes both information about each request, \eg time to complete or resources needed, and information about the relationships among requests, \eg whether some requests must be completed before another request starts. 11 12 Scheduling physical resources, \eg in an assembly line, is generally amenable to using well-informed scheduling since information can be gathered much faster than the physical resources can be assigned and workloads are likely to stay stable for long periods. 13 13 When a faster pace is needed and changes are much more frequent gathering information on workloads, up-front or live, can become much more limiting and more general schedulers are needed. 14 14 15 15 \section{Naming Convention} 16 Scheduling has been studied by various communities concentrating on different incarnation of the same problems.17 As a result, there are no standard naming conventions for scheduling that isrespected across these communities.16 Scheduling has been studied by various communities concentrating on different incarnations of the same problems. 17 As a result, there are no standard naming conventions for scheduling that are respected across these communities. 18 18 This document uses the term \newterm{\Gls{at}} to refer to the abstract objects being scheduled and the term \newterm{\Gls{proc}} to refer to the concrete objects executing these \ats. 19 19 20 20 \section{Static Scheduling} 21 \newterm{Static schedulers} require \ats dependencies and costs be explicitly and exhaustively specified prior to scheduling.21 \newterm{Static schedulers} require \ats dependencies and costs to be explicitly and exhaustively specified prior to scheduling. 22 22 The scheduler then processes this input ahead of time and produces a \newterm{schedule} the system follows during execution. 23 23 This approach is popular in real-time systems since the need for strong guarantees justifies the cost of determining and supplying this information. 
24 In general, static schedulers are less relevant to this project because they require input from the programmers that the programming language does not have as part of its concurrency semantic.24 In general, static schedulers are less relevant to this project because they require input from the programmers that the \CFA programming language does not have as part of its concurrency semantics. 25 25 Specifying this information explicitly adds a significant burden to the programmer and reduces flexibility. 26 26 For this reason, the \CFA scheduler does not require this information. 27 27 28 28 \section{Dynamic Scheduling} 29 \newterm{Dynamic schedulers} determine \at sdependencies and costs during scheduling, if at all.30 Hence, unlike static scheduling, \at sdependencies are conditional and detected at runtime.31 This detection takes the form of observing new \ats (s) in the system and determining dependencies from their behaviour, including suspending or halting a \atsthat dynamically detects unfulfilled dependencies.32 Furthermore, each \at shas the responsibility of adding dependent \ats back into the system once dependencies are fulfilled.29 \newterm{Dynamic schedulers} determine \at dependencies and costs during scheduling, if at all. 30 Hence, unlike static scheduling, \at dependencies are conditional and detected at runtime. 31 This detection takes the form of observing new \ats in the system and determining dependencies from their behaviour, including suspending or halting a \at that dynamically detects unfulfilled dependencies. 32 Furthermore, each \at has the responsibility of adding dependent \ats back into the system once dependencies are fulfilled. 33 33 As a consequence, the scheduler often has an incomplete view of the system, seeing only \ats with no pending dependencies. 34 34 35 35 \subsection{Explicitly Informed Dynamic Schedulers} 36 While dynamic schedulers may not have an exhaustive list of dependencies for a \ats, some information may be available about each \ats, \eg expected duration, required resources, relative importance, \etc. 37 When available, a scheduler can then use this information to direct the scheduling decisions. \cit{Examples of schedulers with more information} 38 However, most programmers do not determine or even \emph{predict} this information; 39 at best, the scheduler has only some imprecise information provided by the programmer, \eg, indicating a \ats takes approximately 3--7 seconds to complete, rather than exactly 5 seconds. 40 Providing this kind of information is a significant programmer burden especially if the information does not scale with the number of \ats and their complexity. 41 For example, providing an exhaustive list of files read by 5 \ats is an easier requirement then providing an exhaustive list of memory addresses accessed by 10,000 independent \ats. 36 While dynamic schedulers may not have an exhaustive list of dependencies for a \at, some information may be available about each \at, \eg expected duration, required resources, relative importance, \etc. 37 When available, a scheduler can then use this information to direct the scheduling decisions. 38 For example, when scheduling in a cloud computing context, \ats will commonly have extra information that was manually entered, \eg caps on compute time or \io usage. 
39 However, in the context of user-level threading, most programmers do not determine or even \emph{predict} this information; 40 at best, the scheduler has only some imprecise information provided by the programmer, \eg, indicating a \at takes approximately 3--7 seconds to complete, rather than exactly 5 seconds. 41 Providing this kind of information is a significant programmer burden, especially if the information does not scale with the number of \ats and their complexity. 42 For example, providing an exhaustive list of files read by 5 \ats is an easier requirement than providing an exhaustive list of memory addresses accessed by 10,000 independent \ats. 42 43 43 44 Since the goal of this thesis is to provide a scheduler as a replacement for \CFA's existing \emph{uninformed} scheduler, explicitly informed schedulers are less relevant to this project. Nevertheless, some strategies are worth mentioning. … … 45 46 \subsubsection{Priority Scheduling} 46 47 Common information used by schedulers to direct their algorithm is priorities. 47 Each \at s is given a priorityand higher-priority \ats are preferred to lower-priority ones.48 The simplest priority scheduling algorithm is to require that every \at shave a distinct pre-established priority and always run the available \ats with the highest priority.48 Each \at is given a priority, and higher-priority \ats are preferred to lower-priority ones. 49 The simplest priority scheduling algorithm is to require that every \at have a distinct pre-established priority and always run the available \ats with the highest priority. 49 50 Asking programmers to provide an exhaustive set of unique priorities can be prohibitive when the system has a large number of \ats. 50 It can therefore be desirable for schedulers to support \ats with identical priorities and/or automatically set ting and adjustingpriorities for \ats.51 It can therefore be desirable for schedulers to support \ats with identical priorities and/or automatically set and adjust priorities for \ats. 51 52 Most common operating systems use some variant on priorities with overlaps and dynamic priority adjustments. 52 53 For example, Microsoft Windows uses a pair of priorities~\cite{win:priority}, one specified by users out of ten possible options and one adjusted by the system. 53 54 54 55 \subsection{Uninformed and Self-Informed Dynamic Schedulers} 55 Several scheduling algorithms do not require programmers to provide additional information on each \at s, and insteadmake scheduling decisions based solely on internal state and/or information implicitly gathered by the scheduler.56 Several scheduling algorithms do not require programmers to provide additional information on each \at, and instead, make scheduling decisions based solely on internal state and/or information implicitly gathered by the scheduler. 56 57 57 58 58 59 \subsubsection{Feedback Scheduling} 59 As mentioned, schedulers may also gather information about each \at sto direct their decisions.60 As mentioned, schedulers may also gather information about each \at to direct their decisions. 60 61 This design effectively moves the scheduler into the realm of \newterm{Control Theory}~\cite{wiki:controltheory}. 61 62 This information gathering does not generally involve programmers, and as such, does not increase programmer burden the same way explicitly provided information may. 
62 However, some feedback schedulers do allow programmers to offer additional information on certain \ats, in orderto direct scheduling decisions.63 The important distinction being whether or notthe scheduler can function without this additional information.63 However, some feedback schedulers do allow programmers to offer additional information on certain \ats, to direct scheduling decisions. 64 The important distinction is whether the scheduler can function without this additional information. 64 65 65 66 66 67 \section{Work Stealing}\label{existing:workstealing} 67 One of the most popular scheduling algorithm in practice (see~\ref{existing:prod}) is work stealing.68 This idea, introduce by \cite{DBLP:conf/fpca/BurtonS81}, effectively has each worker process its local \ats first,but allows the possibility for other workers to steal local \ats if they run out of \ats.69 \cite{DBLP:conf/focs/Blumofe94} introduced the more familiar incarnation of this, where each worker s has a queue of \ats and workers without \ats steal \ats from random workers\footnote{The Burton and Sleep algorithm had trees of \ats and stealonly among neighbours.}.70 Blumofe and Leiserson also prove worst 68 One of the most popular scheduling algorithms in practice (see~\ref{existing:prod}) is work stealing. 69 This idea, introduced by \cite{DBLP:conf/fpca/BurtonS81}, effectively has each worker process its local \ats first but allows the possibility for other workers to steal local \ats if they run out of \ats. 70 \cite{DBLP:conf/focs/Blumofe94} introduced the more familiar incarnation of this, where each worker has a queue of \ats and workers without \ats steal \ats from random workers\footnote{The Burton and Sleep algorithm has trees of \ats and steals only among neighbours.}. 71 Blumofe and Leiserson also prove worst-case space and time requirements for well-structured computations. 71 72 72 73 Many variations of this algorithm have been proposed over the years~\cite{DBLP:journals/ijpp/YangH18}, both optimizations of existing implementations and approaches that account for new metrics. … … 76 77 In general, fine granularity is better for load balancing and coarse granularity reduces communication overhead. 77 78 The best performance generally means finding a middle ground between the two. 78 Several methods can be employed, but I believe these are less relevant for threads, which are generally explicit and more coarse grained. 79 80 \paragraph{Task Placement} Since modern computers rely heavily on cache hierarchies\cit{Do I need a citation for this}, migrating \ats from one core to another can be . \cite{DBLP:journals/tpds/SquillanteL93} 81 82 \todo{The survey is not great on this subject} 83 84 \paragraph{Complex Machine Architecture} Another aspect that has been examined is how well work stealing is applicable to different machine architectures. 79 Several methods can be employed, but I believe these are less relevant for threads, which are generally explicit and more coarse-grained. 80 81 \paragraph{Task Placement} Another aspect of work stealing that has been studied extensively is the mapping between \at and \proc. 82 In its simplest form, work stealing assumes that all \procs are interchangeable and therefore the mapping between \at and \proc is not interesting. 83 However, in real-life architectures there are contexts where different \procs can have different characteristics, which makes some mapping more interesting than others. 
84 A common example where this is statically true is architectures with \glsxtrshort{numa}. 85 In these cases, it can be relevant to change the scheduler to be cognizant of the topology~\cite{vikranth2013topology,min2011hierarchical}. 86 Another example is energy usage, where the scheduler is modified to optimize for energy efficiency in addition/instead of performance~\cite{ribic2014energy,torng2016asymmetry}. 87 88 \paragraph{Complex Machine Architecture} Another aspect that has been examined is how applicable work stealing is to different machine architectures. 89 This is arguably strongly related to Task Placement but extends into more heterogeneous architectures. 90 As \CFA offers no particular support for heterogeneous architecture, this is also an area that is less relevant to this thesis. 91 Although it could be an interesting avenue for future work. 85 92 86 93 \subsection{Theoretical Results} 87 There is also a large body of research on the theoretical aspects of work stealing. These evaluate, for example, the cost of migration~\cite{DBLP:conf/sigmetrics/SquillanteN91,DBLP:journals/pe/EagerLZ86}, how affinity affects performance~\cite{DBLP:journals/tpds/SquillanteL93,DBLP:journals/mst/AcarBB02,DBLP:journals/ipl/SuksompongLS16} and theoretical models for heterogeneous systems~\cite{DBLP:journals/jpdc/MirchandaneyTS90,DBLP:journals/mst/BenderR02,DBLP:conf/sigmetrics/GastG10}.94 There is also a large body of research on the theoretical aspects of work stealing. These evaluate, for example, the cost of \glslink{atmig}{migration}~\cite{DBLP:conf/sigmetrics/SquillanteN91,DBLP:journals/pe/EagerLZ86}, how affinity affects performance~\cite{DBLP:journals/tpds/SquillanteL93,DBLP:journals/mst/AcarBB02,DBLP:journals/ipl/SuksompongLS16} and theoretical models for heterogeneous systems~\cite{DBLP:journals/jpdc/MirchandaneyTS90,DBLP:journals/mst/BenderR02,DBLP:conf/sigmetrics/GastG10}. 88 95 \cite{DBLP:journals/jacm/BlellochGM99} examines the space bounds of work stealing and \cite{DBLP:journals/siamcomp/BerenbrinkFG03} shows that for under-loaded systems, the scheduler completes its computations in finite time, \ie is \newterm{stable}. 89 Others show that work stealing is applicableto various scheduling contexts~\cite{DBLP:journals/mst/AroraBP01,DBLP:journals/anor/TchiboukdjianGT13,DBLP:conf/isaac/TchiboukdjianGTRB10,DBLP:conf/ppopp/AgrawalLS10,DBLP:conf/spaa/AgrawalFLSSU14}.96 Others show that work stealing applies to various scheduling contexts~\cite{DBLP:journals/mst/AroraBP01,DBLP:journals/anor/TchiboukdjianGT13,DBLP:conf/isaac/TchiboukdjianGTRB10,DBLP:conf/ppopp/AgrawalLS10,DBLP:conf/spaa/AgrawalFLSSU14}. 90 97 \cite{DBLP:conf/ipps/ColeR13} also studied how randomized work-stealing affects false sharing among \ats. 91 98 92 However, as \cite{DBLP:journals/ijpp/YangH18} highlights, it is worth mentioning that this theoretical research has mainly focused on ``fully -strict'' computations, \ie workloads that can be fully represented with a direct acyclic graph.93 It is unclear how well these distributions represent workloads in real 99 However, as \cite{DBLP:journals/ijpp/YangH18} highlights, it is worth mentioning that this theoretical research has mainly focused on ``fully strict'' computations, \ie workloads that can be fully represented with a direct acyclic graph. 100 It is unclear how well these distributions represent workloads in real-world scenarios. 
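As a reference point for the work-stealing discussion above, the following is a minimal C sketch of the basic randomized work-stealing loop; @Deque@, @deque_pop_bottom@, @deque_steal_top@ and the other names are illustrative placeholders rather than any existing library's API.
\begin{cfa}
#include <stdlib.h>

typedef struct Task Task;
typedef struct Deque Deque;                    // one double-ended queue per worker

extern Task * deque_pop_bottom( Deque * );     // owner end, LIFO
extern Task * deque_steal_top( Deque * );      // thief end, FIFO
extern void run( Task * );
extern Deque * queues[];                       // queues[w] belongs to worker w
extern int nworkers;

static void worker_loop( int me, unsigned int seed ) {
	for ( ;; ) {
		Task * t = deque_pop_bottom( queues[ me ] );     // local work first
		while ( t == NULL ) {                            // out of local work: become a thief
			int victim = rand_r( &seed ) % nworkers;
			if ( victim == me ) continue;                // do not steal from ourselves
			t = deque_steal_top( queues[ victim ] );     // steal from a random victim
		}
		run( t );
	}
}
\end{cfa}
Most of the variations cited above keep this basic loop and adjust the queue implementation, the victim-selection policy, or where newly created \ats are pushed.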
94 101 95 102 \section{Preemption} 96 103 One last aspect of scheduling is preemption since many schedulers rely on it for some of their guarantees. 97 104 Preemption is the idea of interrupting \ats that have been running too long, effectively injecting suspend points into the application. 98 There are multiple techniques to achieve this effect but they all aim to guarantee that the suspend points in a \atsare never further apart than some fixed duration.99 While this helps schedulers guarantee that no \ats unfairly monopolize sa worker, preemption can effectively be added to any scheduler.100 Therefore, the only interesting aspect of preemption for the design of scheduling is whether or notto require it.105 There are multiple techniques to achieve this effect, but they all aim to guarantee that the suspend points in a \at are never further apart than some fixed duration. 106 While this helps schedulers guarantee that no \ats unfairly monopolize a worker, preemption can effectively be added to any scheduler. 107 Therefore, the only interesting aspect of preemption for the design of scheduling is whether to require it. 101 108 102 109 \section{Production Schedulers}\label{existing:prod} … … 104 111 While these schedulers do not necessarily represent the most recent advances in scheduling, they are what is generally accessible to programmers. 105 112 As such, I believe these schedulers are at least as relevant as those presented in published work. 106 Schedulers that operate in kernel space and inuser space are considered, as both can offer relevant insight for this project.113 Both Schedulers that operate in kernel space and user space are considered, as both can offer relevant insight for this project. 107 114 However, real-time schedulers are not considered, as these have constraints that are much stricter than what is needed for this project. 108 115 109 116 \subsection{Operating System Schedulers} 110 Operating System Schedulers tend to be fairly complex as they generally support some amount of real -time, aim to balance interactive and non-interactive \ats and support multiple users sharing hardware without requiring these users to cooperate.117 Operating System Schedulers tend to be fairly complex as they generally support some amount of real time, aim to balance interactive and non-interactive \ats and support multiple users sharing hardware without requiring these users to cooperate. 111 118 Here are more details on a few schedulers used in the common operating systems: Linux, FreeBSD, Microsoft Windows and Apple's OS X. 112 The information is less complete for operating systems with closed source.119 The information is less complete for closed source operating systems. 113 120 114 121 \paragraph{Linux's CFS} 115 122 The default scheduler used by Linux, the Completely Fair Scheduler~\cite{MAN:linux/cfs,MAN:linux/cfs2}, is a feedback scheduler based on CPU time. 116 123 For each processor, it constructs a Red-Black tree of \ats waiting to run, ordering them by the amount of CPU time used. 117 The \at sthat has used the least CPU time is scheduled.124 The \at that has used the least CPU time is scheduled. 118 125 It also supports the concept of \newterm{Nice values}, which are effectively multiplicative factors on the CPU time used. 
119 The ordering of \ats is also affected by a group 120 Linux achieves load-balancing by regularly monitoring the system state~\cite{MAN:linux/cfs/balancing} and using some heuristic on the load, currently CPU time used in the last millisecond plus a decayed version of the previous time slots~\cite{MAN:linux/cfs/pelt}.121 122 \cite{DBLP:conf/eurosys/LoziLFGQF16} shows that Linux's CFS also does work stealing to balance the workload of each processors, but the paper argues this aspect can be improved significantly.123 The issues highlighted stem from Linux's need to support fairness across \ats \emph{and} across users\footnote{Enforcing fairness across users means that given two users, one with a single \at s and the other with one thousand \ats, the user with a single \ats does not receive onethousandth of the CPU time.}, increasing the complexity.124 125 Linux also offers a FIFO scheduler, a real-time scheduler, which runs the highest-priority \ats, and a round-robin scheduler, which is an extension of the FIFO -scheduler that adds fixed time slices. \cite{MAN:linux/sched}126 The ordering of \ats is also affected by a group-based notion of fairness, where \ats belonging to groups having used less CPU time are preferred to \ats belonging to groups having used more CPU time. 127 Linux achieves load-balancing by regularly monitoring the system state~\cite{MAN:linux/cfs/balancing} and using some heuristic on the \gls{load}, currently CPU time used in the last millisecond plus a decayed version of the previous time slots~\cite{MAN:linux/cfs/pelt}. 128 129 \cite{DBLP:conf/eurosys/LoziLFGQF16} shows that Linux's CFS also does work stealing to balance the workload of each \proc, but the paper argues this aspect can be improved significantly. 130 The issues highlighted stem from Linux's need to support fairness across \ats \emph{and} across users\footnote{Enforcing fairness across users means that given two users, one with a single \at and the other with one thousand \ats, the user with a single \at does not receive one-thousandth of the CPU time.}, increasing the complexity. 131 132 Linux also offers a FIFO scheduler, a real-time scheduler, which runs the highest-priority \ats, and a round-robin scheduler, which is an extension of the FIFO scheduler that adds fixed time slices. \cite{MAN:linux/sched} 126 133 127 134 \paragraph{FreeBSD} 128 135 The ULE scheduler used in FreeBSD\cite{DBLP:conf/bsdcon/Roberson03} is a feedback scheduler similar to Linux's CFS. 129 136 It uses different data structures and heuristics but also schedules according to some combination of CPU time used and niceness values. 130 It also periodically balances the load of the system (according to a different heuristic) ,but uses a simpler work stealing approach.137 It also periodically balances the load of the system (according to a different heuristic) but uses a simpler work stealing approach. 131 138 132 139 \paragraph{Windows(OS)} 133 140 Microsoft's Operating System's Scheduler~\cite{MAN:windows/scheduler} is a feedback scheduler with priorities. 134 141 It supports 32 levels of priorities, some of which are reserved for real-time and privileged applications. 135 It schedules \ats based on the highest priorities (lowest number) and how much CPU time each \at shas used.142 It schedules \ats based on the highest priorities (lowest number) and how much CPU time each \at has used. 136 143 The scheduler may also temporarily adjust priorities after certain effects like the completion of I/O requests. 
137 144 138 In~\cite{russinovich2009windows}, Chapter 1 section ``Processes, Threads, and Jobs''\todo{Look up section number.} discusses the scheduling policy more indepth.139 Multicore scheduling is based on a combination of priorities and preferred \proc.145 In~\cite{russinovich2009windows}, Chapter 1 section 2.3 ``Processes, Threads, and Jobs'' discusses the scheduling policy more in-depth. 146 Multicore scheduling is based on a combination of priorities and \proc preference. 140 147 Each \at is assigned an initial processor using a round-robin policy, called the \at's \newterm{ideal} \proc. 141 148 \Glspl{at} are distributed among the \procs according to their priority, preferring to match \ats to their ideal \proc and then to the last \proc they ran on. 142 This approach is a variation of work stealing, where the stealing \proc restore the \at to its original \proc after running it, but mixed with priorities.149 This approach is a variation of work stealing, where the stealing \proc restores the \at to its original \proc after running it, but mixed with priorities. 143 150 144 151 \paragraph{Apple OS X} … … 152 159 \end{displayquote} 153 160 154 \todo{load balancing} 161 There is very little documentation on the internals of this scheduler. 162 However, the documentation does describe a feature set that is very similar to the Windows and Linux OS schedulers. 163 Presumably, this means that the internals are also fairly similar overall. 155 164 156 165 \subsection{User-Level Schedulers} 157 By comparison, user level schedulers tend to be simpler, gatheringfewer metrics and avoid complex notions of fairness. Part of the simplicity is due to the fact that all \ats have the same user, and therefore cooperation is both feasible and probable.166 By comparison, user-level schedulers tend to be simpler, gather fewer metrics and avoid complex notions of fairness. Part of the simplicity is due to the fact that all \ats have the same user, and therefore cooperation is both feasible and probable. 158 167 159 168 \paragraph{Go}\label{GoSafePoint} 160 169 Go's scheduler uses a randomized work-stealing algorithm that has a global run-queue (\emph{GRQ}) and each processor (\emph{P}) has both a fixed-size run-queue (\emph{LRQ}) and a high-priority next ``chair'' holding a single element~\cite{GITHUB:go,YTUBE:go}. 161 Preemption is present, but only at safe -points,~\cite{go:safepoints} which are inserted detection codeat various frequent access boundaries.170 Preemption is present, but only at safe points,~\cite{go:safepoints} which are detection code inserted at various frequent access boundaries. 162 171 163 172 The algorithm is as follows : … … 175 184 Erlang is a functional language that supports concurrency in the form of processes: threads that share no data. 176 185 It uses a kind of round-robin scheduler, with a mix of work sharing and stealing to achieve load balancing~\cite{:erlang}, where under-loaded workers steal from other workers, but overloaded workers also push work to other workers. 177 This migration logic is directed by monitoring logic that evaluates the load a few times per seconds.186 This \glslink{atmig}{migration} logic is directed by monitoring logic that evaluates the load a few times per second. 178 187 179 188 \paragraph{Intel\textregistered ~Threading Building Blocks} 180 189 \newterm{Thread Building Blocks} (TBB) is Intel's task parallelism \cite{wiki:taskparallel} framework. 
181 It runs \newterm{jobs}, which are uninterrupt able \ats that must always run to completion, on a pool of worker threads.190 It runs \newterm{jobs}, which are uninterruptible \ats that must always run to completion, on a pool of worker threads. 182 191 TBB's scheduler is a variation of randomized work-stealing that also supports higher-priority graph-like dependencies~\cite{MAN:tbb/scheduler}. 183 It schedules \ats as follows (where \textit{t} is the last \at scompleted):192 It schedules \ats as follows (where \textit{t} is the last \at completed): 184 193 \begin{displayquote} 185 194 \begin{enumerate} 186 195 \item The task returned by \textit{t}@.execute()@ 187 196 \item The successor of t if \textit{t} was its last completed predecessor. 188 \item A task popped from the end of the thread's own deque.189 \item A task with a ffinity for the thread.197 \item A task popped from the end of the thread's own queue. 198 \item A task with an affinity for the thread. 190 199 \item A task popped from approximately the beginning of the shared queue. 191 \item A task popped from the beginning of another randomly chosen thread's deque.200 \item A task popped from the beginning of another randomly chosen thread's queue. 192 201 \end{enumerate} 193 202 … … 208 217 While the documentation only gives limited insight into the scheduling and load balancing approach, \cite{apple:gcd2} suggests a fairly classic approach. 209 218 Each \proc has a queue of \ats to run, called \newterm{blocks}, which are drained in \glsxtrshort{fifo}. 210 \todo{update: They seem to add the concept of dependent queues with clear ordering, where executing a block ends-up scheduling more blocks. 211 In terms of semantics, these Dispatch Queues seem to be very similar to Intel\textregistered ~TBB \lstinline{execute()} and predecessor semantics.} 212 219 GCD also has secondary queues, called \newterm{Dispatch Queues}, with clear ordering, where executing a block ends up scheduling more blocks. 220 In terms of semantics, these Dispatch Queues seem to be very similar to Intel\textregistered ~TBB \lstinline{execute()} and predecessor semantics. 221 222 The similarity of API and semantics between GCD and Intel\textregistered ~TBB suggest the underlying scheduling algorithms are similar. 213 223 214 224 \paragraph{LibFibre} 215 LibFibre~\cite{DBLP:journals/pomacs/KarstenB20} is a light -weight user-level threading framework developed at the University of Waterloo.216 Similarly to Go, it uses a variation of work stealing with a global queue that ishigher priority than stealing.225 LibFibre~\cite{DBLP:journals/pomacs/KarstenB20} is a lightweight user-level threading framework developed at the University of Waterloo. 226 Similarly to Go, it uses a variation of work stealing with a global queue that has a higher priority than stealing. 217 227 Unlike Go, it does not have the high-priority next ``chair'' and does not use randomized work-stealing. -
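As a rough illustration of the kind of dequeue order these user-level runtimes use, the following C sketch encodes the TBB-style selection order quoted above; every helper (@next_from_execute@, @successor_of@, \etc) is an illustrative stand-in, not the actual TBB API.
\begin{cfa}
typedef struct Task Task;

extern Task * next_from_execute( Task * last );   // 1. task returned by t.execute()
extern Task * successor_of( Task * last );        // 2. successor of t, if t was its last predecessor
extern Task * pop_local_back( int worker );       // 3. back of the worker's own queue
extern Task * pop_affinity( int worker );         // 4. a task with affinity for this worker
extern Task * pop_shared_front( void );           // 5. front of the shared queue
extern Task * steal_random_front( int worker );   // 6. front of a random victim's queue

static Task * next_task( int worker, Task * last ) {
	Task * t;
	if ( ( t = next_from_execute( last ) ) ) return t;
	if ( ( t = successor_of( last ) ) ) return t;
	if ( ( t = pop_local_back( worker ) ) ) return t;
	if ( ( t = pop_affinity( worker ) ) ) return t;
	if ( ( t = pop_shared_front() ) ) return t;
	return steal_random_front( worker );
}
\end{cfa}
The Go and libfibre descriptions above follow the same shape, differing mainly in which queues exist and in what order they are polled.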
doc/theses/thierry_delisle_PhD/thesis/text/front.tex
rebf8ca5 r23a08aa0 39 39 \vspace*{2.0cm} 40 40 41 Waterloo, Ontario, Canada, 202 1\\42 43 \vspace*{1.0cm} 44 45 \copyright\ Thierry Delisle 202 1\\41 Waterloo, Ontario, Canada, 2022 \\ 42 43 \vspace*{1.0cm} 44 45 \copyright\ Thierry Delisle 2022 \\ 46 46 \end{center} 47 47 \end{titlepage} … … 60 60 \noindent 61 61 The following served on the Examining Committee for this thesis. The decision of the Examining Committee is by majority vote. 62 \todo{External Examiners} 63 \bigskip 64 65 \ noindent66 \begin{tabbing} 67 Internal-External Member: \= \kill % using longest text to define tab length68 External Examiner: \> TBD\\69 \> TBD\\62 \bigskip 63 64 \noindent 65 \begin{tabbing} 66 Internal-External Member: \= \kill % using longest text to define tab length 67 External Examiner: \> Doug Lea \\ 68 \> Professor, Computer Science Department \\ 69 \> State University of New York at Oswego \\ 70 70 \end{tabbing} 71 71 \bigskip … … 96 96 \begin{tabbing} 97 97 Internal-External Member: \= \kill % using longest text to define tab length 98 Internal-External Member: \> TBD\\99 \> TBD\\98 Internal-External Member: \> Patrick Lam \\ 99 \> Associate Professor, Department of Electrical and Computer Engineering \\ 100 100 \> University of Waterloo \\ 101 101 \end{tabbing} … … 124 124 125 125 User-Level threading (M:N) is gaining popularity over kernel-level threading (1:1) in many programming languages. 126 The user threading approach is often a better mechanism to express complex concurrent applications by efficiently running 10,000+ threads on multi -core systems.126 The user threading approach is often a better mechanism to express complex concurrent applications by efficiently running 10,000+ threads on multicore systems. 127 127 Indeed, over-partitioning into small work-units with user threading significantly eases load bal\-ancing, while simultaneously providing advanced synchronization and mutual exclusion capabilities. 128 128 To manage these high levels of concurrency, the underlying runtime must efficiently schedule many user threads across a few kernel threads; 129 which begs ofthe question of how many kernel threads are needed and should the number be dynamically reevaluated.129 which begs the question of how many kernel threads are needed and should the number be dynamically reevaluated. 130 130 Furthermore, scheduling must prevent kernel threads from blocking, otherwise user-thread parallelism drops. 131 When user-threading parallelism does drop, how and when should idle kernel-threadsbe put to sleep to avoid wasting CPU resources.131 When user-threading parallelism does drop, how and when should idle \glspl{kthrd} be put to sleep to avoid wasting CPU resources. 
132 132 Finally, the scheduling system must provide fairness to prevent a user thread from monopolizing a kernel thread; 133 otherwise other user threads can experience short/long term starvation or kernel threads can deadlock waiting for events to occur on busy kernel threads.134 135 This thesis analyses multiple scheduler systems, where each system attempts to fulfill the necessaryrequirements for user-level threading.136 The predominant technique for managing high levels of concurrency is sharding the ready -queue with one queue per kernel-threadand using some form of work stealing/sharing to dynamically rebalance workload shifts.137 Preventing kernel blocking is accomplish by transforming kernel locks and I/O operations into user-level operations that do not block the kernel thread or spin up new kernel threads to manage the blocking.133 otherwise, other user threads can experience short/long term starvation or kernel threads can deadlock waiting for events to occur on busy kernel threads. 134 135 This thesis analyses multiple scheduler systems, where each system attempts to fulfill the requirements for user-level threading. 136 The predominant technique for managing high levels of concurrency is sharding the ready queue with one queue per \gls{kthrd} and using some form of work stealing/sharing to dynamically rebalance workload shifts. 137 Preventing kernel blocking is accomplished by transforming kernel locks and I/O operations into user-level operations that do not block the kernel thread or spin up new kernel threads to manage the blocking. 138 138 Fairness is handled through preemption and/or ad-hoc solutions, which leads to coarse-grained fairness with some pathological cases. 139 139 … … 146 146 The new scheduler also includes support for implicit nonblocking \io, allowing applications to have more user-threads blocking on \io operations than there are \glspl{kthrd}. 147 147 The implementation is based on @io_uring@, a recent addition to the Linux kernel, and achieves the same performance and fairness as systems using @select@, @epoll@, \etc. 148 To complete the scheduler, an idle sleep mechanism is implemented that significantly reduces wasted CPU cycles, which are then available outside ofthe application.148 To complete the scheduler, an idle sleep mechanism is implemented that significantly reduces wasted CPU cycles, which are then available outside the application. 149 149 150 150 \cleardoublepage … … 179 179 \phantomsection % allows hyperref to link to the correct page 180 180 181 % L I S T O F F I G U R E S 182 % ----------------------------- 183 \addcontentsline{toc}{chapter}{List of Figures} 184 \listoffigures 185 \cleardoublepage 186 \phantomsection % allows hyperref to link to the correct page 187 181 188 % L I S T O F T A B L E S 182 189 % --------------------------- … … 186 193 \phantomsection % allows hyperref to link to the correct page 187 194 188 % L I S T O F F I G U R E S189 % -----------------------------190 \addcontentsline{toc}{chapter}{List of Figures}191 \listoffigures192 \cleardoublepage193 \phantomsection % allows hyperref to link to the correct page194 195 195 % GLOSSARIES (Lists of definitions, abbreviations, symbols, etc. 
provided by the glossaries-extra package) 196 196 % ----------------------------- … … 199 199 \phantomsection % allows hyperref to link to the correct page 200 200 201 % TODOs and missing citations202 % -----------------------------203 \listofcits204 \listoftodos205 \cleardoublepage206 \phantomsection % allows hyperref to link to the correct page207 208 209 201 % Change page numbering back to Arabic numerals 210 202 \pagenumbering{arabic} -
doc/theses/thierry_delisle_PhD/thesis/text/intro.tex
rebf8ca5 r23a08aa0 2 2 3 3 \Gls{uthrding} (M:N) is gaining popularity over kernel-level threading (1:1) in many programming languages. 4 The user threading approach is often a better mechanism to express complex concurrent applications by efficiently running 10,000+ threads on multi -core systems.5 Indeed, over-partitioning into small work -units with user threading significantly eases load bal\-ancing, while simultaneously providing advanced synchronization and mutual exclusion capabilities.4 The user threading approach is often a better mechanism to express complex concurrent applications by efficiently running 10,000+ threads on multicore systems. 5 Indeed, over-partitioning into small work units with user threading significantly eases load bal\-ancing, while simultaneously providing advanced synchronization and mutual exclusion capabilities. 6 6 To manage these high levels of concurrency, the underlying runtime must efficiently schedule many user threads across a few kernel threads; 7 which begs ofthe question of how many kernel threads are needed and should the number be dynamically reevaluated.7 which begs the question of how many kernel threads are needed and should the number be dynamically reevaluated. 8 8 Furthermore, scheduling must prevent kernel threads from blocking, otherwise user-thread parallelism drops. 9 9 When user-threading parallelism does drop, how and when should idle kernel-threads be put to sleep to avoid wasting CPU resources. 10 10 Finally, the scheduling system must provide fairness to prevent a user thread from monopolizing a kernel thread; 11 otherwise other user threads can experience short/long term starvation or kernel threads can deadlock waiting for events to occur on busy kernel threads.11 otherwise, other user threads can experience short/long term starvation or kernel threads can deadlock waiting for events to occur on busy kernel threads. 12 12 13 This thesis analy ses multiple scheduler systems, where each system attempts to fulfill the necessaryrequirements for \gls{uthrding}.14 The predominant technique for managing high levels of concurrency is sharding the ready -queue with one queue per kernel-thread and using some form of work stealing/sharing to dynamically rebalance workload shifts.15 Preventing kernel blocking is accomplish by transforming kernel locks and I/O operations into user-level operations that do not block the kernel thread or spin up new kernel threads to manage the blocking.16 Fairness is handled through preemption and/or ad -hoc solutions, which leads to coarse-grained fairness with some pathological cases.13 This thesis analyzes multiple scheduler systems, where each system attempts to fulfill the requirements for \gls{uthrding}. 14 The predominant technique for managing high levels of concurrency is sharding the ready queue with one queue per kernel thread and using some form of work stealing/sharing to dynamically rebalance workload shifts. 15 Preventing kernel blocking is accomplished by transforming kernel locks and I/O operations into user-level operations that do not block the kernel thread or spin up new kernel threads to manage the blocking. 16 Fairness is handled through preemption and/or ad hoc solutions, which leads to coarse-grained fairness with some pathological cases. 
17 17 18 After examining, testing and selecting specific approaches to these scheduling issues, a completely new scheduler was created and tested in the \CFA (C-for-all) user-threading runtime -system.18 After examining, testing and selecting specific approaches to these scheduling issues, a completely new scheduler was created and tested in the \CFA (C-for-all) user-threading runtime system. 19 19 The goal of the new scheduler is to offer increased safety and productivity without sacrificing performance. 20 The quality of the new scheduler is demonstrated by comparing it with other user-threading work-stealing schedulers with the aim of showing equivalent or better performance while offering better fairness.20 The quality of the new scheduler is demonstrated by comparing it with other user-threading work-stealing schedulers, with the aim of showing equivalent or better performance while offering better fairness. 21 21 22 22 Chapter~\ref{intro} defines scheduling and its general goals. 23 23 Chapter~\ref{existing} discusses how scheduler implementations attempt to achieve these goals, but all implementations optimize some workloads better than others. 24 Chapter~\ref{cfaruntime} presents the relevant aspects of the \CFA runtime system that have a significant affect on the new scheduler design and implementation.25 Chapter~\ref{core} analyses different scheduler approaches ,while looking for scheduler mechanisms that provide both performance and fairness.24 Chapter~\ref{cfaruntime} presents the relevant aspects of the \CFA runtime system that have a significant effect on the new scheduler design and implementation. 25 Chapter~\ref{core} analyzes different scheduler approaches while looking for scheduler mechanisms that provide both performance and fairness. 26 26 Chapter~\ref{userio} covers the complex mechanisms that must be used to achieve nonblocking I/O to prevent the blocking of \glspl{kthrd}. 27 27 Chapter~\ref{practice} presents the mechanisms needed to adjust the amount of parallelism, both manually and automatically. … … 29 29 30 30 31 \section{Scheduling} 31 \section{Scheduling}\label{sched} 32 32 Computer systems share multiple resources across many threads of execution, even on single-user computers like laptops or smartphones. 33 On a computer system with multiple processors and work units (routines, coroutines, threads, programs, \etc), there exists the problem of mapping many different kinds of work units onto many different kinds of processors in an efficient manner, called \newterm{scheduling}.33 On a computer system with multiple processors and work units (routines, coroutines, threads, programs, \etc), there exists the problem of mapping many different kinds of work units onto many different kinds of processors efficiently, called \newterm{scheduling}. 34 34 Scheduling systems are normally \newterm{open}, meaning new work arrives from an external source or is randomly spawned from an existing work unit. 35 35 In general, work units without threads, like routines and coroutines, are self-scheduling, while work units with threads, like tasks and programs, are scheduled. 36 36 For scheduled work-units, a scheduler takes a sequence of threads and attempts to run them to completion, subject to shared resource restrictions and utilization. 
37 A general-purpose dynamic-scheduler for an open systemcannot anticipate work requests, so its performance is rarely optimal.38 Even with complete knowledge of arriv eorder and work, creating an optimal solution is a bin packing problem~\cite{wiki:binpak}.37 In an open system, a general-purpose dynamic scheduler cannot anticipate work requests, so its performance is rarely optimal. 38 Even with complete knowledge of arrival order and work, creating an optimal solution is a bin packing problem~\cite{wiki:binpak}. 39 39 However, optimal solutions are often not required: schedulers often produce excellent solutions, without needing optimality, by taking advantage of regularities in work patterns. 40 40 41 Scheduling occurs at discre etpoints when there are transitions in a system.42 For example, a threadcycles through the following transitions during its execution.41 Scheduling occurs at discrete points when there are transitions in a system. 42 For example, a \at cycles through the following transitions during its execution. 43 43 \begin{center} 44 44 \input{executionStates.pstex_t} … … 49 49 entering the system (new $\rightarrow$ ready) 50 50 \item 51 scheduler assigns a threadto a computing resource, \eg CPU (ready $\rightarrow$ running)51 scheduler assigns a \at to a computing resource, \eg CPU (ready $\rightarrow$ running) 52 52 \item 53 53 timer alarm for preemption (running $\rightarrow$ ready) 54 54 \item 55 long 55 long-term delay versus spinning (running $\rightarrow$ blocked) 56 56 \item 57 57 completion of delay, \eg network or I/O completion (blocked $\rightarrow$ ready) … … 59 59 normal completion or error, \eg segment fault (running $\rightarrow$ halted) 60 60 \end{itemize} 61 Key to scheduling is that a threadcannot bypass the ``ready'' state during a transition so the scheduler maintains complete control of the system, \ie no self-scheduling among threads.61 Key to scheduling is that a \at cannot bypass the ``ready'' state during a transition so the scheduler maintains complete control of the system, \ie no self-scheduling among threads. 62 62 63 63 When the workload exceeds the capacity of the processors, \ie work cannot be executed immediately, it is placed on a queue for subsequent service, called a \newterm{ready queue}. … … 71 71 \end{tabular} 72 72 \end{center} 73 Beyond these two schedulers are a host of options, \eg adding a n global shared queue to MQMS or adding multiple private queues with distinccharacteristics.73 Beyond these two schedulers are a host of options, \eg adding a global shared queue to MQMS or adding multiple private queues with distinct characteristics. 74 74 75 75 Once there are multiple resources and ready queues, a scheduler is faced with three major optimization criteria: … … 84 84 85 85 \noindent 86 Essentially, all multi -processor computers have non-uniform memory access (NUMA), with one or more quantized steps to access data at different levels in the memory hierarchy.86 Essentially, all multiprocessor computers have non-uniform memory access (NUMA), with one or more quantized steps to access data at different levels in the memory hierarchy. 87 87 When a system has a large number of independently executing threads, affinity becomes difficult because of \newterm{thread churn}. 
88 That is, threads must be scheduled on different processors to obtain high processor sutilization because the number of threads $\ggg$ processors.88 That is, threads must be scheduled on different processors to obtain high processor utilization because the number of threads $\ggg$ processors. 89 89 90 90 \item 91 91 \newterm{contention}: safe access of shared objects by multiple processors requires mutual exclusion in some form, generally locking.\footnote{ 92 92 Lock-free data-structures do not involve locking but incur similar costs to achieve mutual exclusion.} 93 Mutual exclusion cost and latency increase ssignificantly with the number of processors access\-ing a shared object.93 Mutual exclusion cost and latency increase significantly with the number of processors access\-ing a shared object. 94 94 \end{enumerate} 95 95 … … 116 116 117 117 Since \CFA attempts to improve the safety and productivity of C, the new scheduler presented in this thesis attempts to achieve the same goals. 118 More specifically, safety and productivity for scheduling mean ssupporting a wide range of workloads so that programmers can rely on progress guarantees (safety) and more easily achieve acceptable performance (productivity).118 More specifically, safety and productivity for scheduling mean supporting a wide range of workloads so that programmers can rely on progress guarantees (safety) and more easily achieve acceptable performance (productivity). 119 119 The new scheduler also includes support for implicit nonblocking \io, allowing applications to have more user-threads blocking on \io operations than there are \glspl{kthrd}. 120 To complete the scheduler, an idle sleep mechanism is implemented that significantly reduces wasted CPU cycles, which are then available outside ofthe application.120 To complete the scheduler, an idle sleep mechanism is implemented that significantly reduces wasted CPU cycles, which are then available outside the application. 121 121 122 As a research project, this work builds exclusively on newer versions of the Linux operating -system and gcc/clang compilers.122 As a research project, this work builds exclusively on newer versions of the Linux operating system and gcc/clang compilers. 123 123 The new scheduler implementation uses several optimizations to successfully balance the cost of fairness against performance; 124 124 some of these optimizations rely on interesting hardware optimizations only present on modern CPUs. 125 The \io implementation is based on the @io_uring@ kernel -interface, a recent addition to the Linux kernel, because it purports to handle nonblocking \emph{file} and network \io.125 The \io implementation is based on the @io_uring@ kernel interface, a recent addition to the Linux kernel, because it purports to handle nonblocking \emph{file} and network \io. 126 126 This decision allowed an interesting performance and fairness comparison with other threading systems using @select@, @epoll@, \etc. 127 127 While the current \CFA release supports older versions of Linux ($\ge$~Ubuntu 16.04) and gcc/clang compilers ($\ge$~gcc 6.0), it is not the purpose of this project to find workarounds in these older systems to provide backwards compatibility. 
… … 129 129 130 130 \section{Contributions}\label{s:Contributions} 131 This work provides the following scheduling contributions for advanced \gls{uthrding} runtime -systems:131 This work provides the following scheduling contributions for advanced \gls{uthrding} runtime systems: 132 132 \begin{enumerate}[leftmargin=*] 133 133 \item … … 140 140 A mechanism for adding fairness on top of MQMS algorithm through helping, used both for scalable scheduling algorithm and the user-level \glsxtrshort{io}. 141 141 \item 142 An optimization of the helping -mechanism for load balancing to reduce scheduling costs.142 An optimization of the helping mechanism for load balancing to reduce scheduling costs. 143 143 \item 144 144 An optimization for the alternative relaxed-list for load balancing to reduce scheduling costs in embarrassingly parallel cases. -
doc/theses/thierry_delisle_PhD/thesis/text/io.tex
rebf8ca5 r23a08aa0 1 1 \chapter{User Level \io}\label{userio} 2 As mentioned in Section~\ref{prev:io}, user-level \io requires multiplexing the \io operations of many \ glspl{thrd}onto fewer \glspl{proc} using asynchronous \io operations.3 Different operating systems offer various forms of asynchronous operations and, as mentioned in Chapter~\ref{intro}, this work is exclusively focused on the Linux operating -system.2 As mentioned in Section~\ref{prev:io}, user-level \io requires multiplexing the \io operations of many \ats onto fewer \glspl{proc} using asynchronous \io operations. 3 Different operating systems offer various forms of asynchronous operations and, as mentioned in Chapter~\ref{intro}, this work is exclusively focused on the Linux operating system. 4 4 5 5 \section{Kernel Interface} … … 13 13 In this context, ready means \emph{some} operation can be performed without blocking. 14 14 It does not mean an operation returning \lstinline{EAGAIN} succeeds on the next try. 15 For example, a ready read may only return a subset of requested bytes and the read must be issue sagain for the remaining bytes, at which point it may return \lstinline{EAGAIN}.}16 This mechanism is also crucial in determining when all \ glspl{thrd}are blocked and the application \glspl{kthrd} can now block.15 For example, a ready read may only return a subset of requested bytes and the read must be issued again for the remaining bytes, at which point it may return \lstinline{EAGAIN}.} 16 This mechanism is also crucial in determining when all \ats are blocked and the application \glspl{kthrd} can now block. 17 17 18 18 There are three options to monitor file descriptors in Linux:\footnote{ 19 19 For simplicity, this section omits \lstinline{pselect} and \lstinline{ppoll}. 20 The difference between these system calls and \lstinline{select} and \lstinline{poll}, respectively, is not relevant for this discussion.} ,20 The difference between these system calls and \lstinline{select} and \lstinline{poll}, respectively, is not relevant for this discussion.} 21 21 @select@~\cite{MAN:select}, @poll@~\cite{MAN:poll} and @epoll@~\cite{MAN:epoll}. 22 22 All three of these options offer a system call that blocks a \gls{kthrd} until at least one of many file descriptors becomes ready. … … 30 30 Hence, if one \gls{kthrd} is managing the select calls, other threads can only add/remove to/from the manager's interest set through synchronized calls to update the interest set. 31 31 However, these changes are only reflected when the manager makes its next call to @select@. 32 Note, it is possible for the manager thread to never unblock if its current interest set never changes, \eg the sockets/pipes/ ttys it is waiting on never get data again.32 Note, it is possible for the manager thread to never unblock if its current interest set never changes, \eg the sockets/pipes/TTYs it is waiting on never get data again. 33 33 Often the I/O manager has a timeout, polls, or is sent a signal on changes to mitigate this problem. 
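To make the manager-thread pattern concrete, the following is a minimal sketch of a @select@-based \io manager; the helper routines @handle_ready@ and @apply_pending_updates@ are illustrative assumptions rather than part of any real runtime, and the timeout bounds how long changes to the interest set can go unnoticed.
\begin{cfa}
#include <sys/select.h>

extern fd_set interest;	// interest set, updated by other threads under a lock
extern int max_fd;
extern void handle_ready( fd_set * ready );	// reschedule threads whose FD is ready
extern void apply_pending_updates( fd_set * set, int * max );	// fold in queued add/remove requests

void manager_loop( void ) {
	for ( ;; ) {
		fd_set ready = interest;	// select is destructive, so work on a copy
		struct timeval timeout = { .tv_sec = 0, .tv_usec = 100000 };	// bound on staleness
		int n = select( max_fd + 1, &ready, 0, 0, &timeout );
		if ( n > 0 ) handle_ready( &ready );
		apply_pending_updates( &interest, &max_fd );	// updates are only observed here
	}
}
\end{cfa}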
34 35 \begin{comment}36 From: Tim Brecht <brecht@uwaterloo.ca>37 Subject: Re: FD sets38 Date: Wed, 6 Jul 2022 00:29:41 +000039 40 Large number of open files41 --------------------------42 43 In order to be able to use more than the default number of open file44 descriptors you may need to:45 46 o increase the limit on the total number of open files /proc/sys/fs/file-max47 (on Linux systems)48 49 o increase the size of FD_SETSIZE50 - the way I often do this is to figure out which include file __FD_SETSIZE51 is defined in, copy that file into an appropriate directory in ./include,52 and then modify it so that if you use -DBIGGER_FD_SETSIZE the larger size53 gets used54 55 For example on a RH 9.0 distribution I've copied56 /usr/include/bits/typesizes.h into ./include/i386-linux/bits/typesizes.h57 58 Then I modify typesizes.h to look something like:59 60 #ifdef BIGGER_FD_SETSIZE61 #define __FD_SETSIZE 3276762 #else63 #define __FD_SETSIZE 102464 #endif65 66 Note that the since I'm moving and testing the userver on may different67 machines the Makefiles are set up to use -I ./include/$(HOSTTYPE)68 69 This way if you redefine the FD_SETSIZE it will get used instead of the70 default original file.71 \end{comment}72 34 73 35 \paragraph{\lstinline{poll}} is the next oldest option, and takes as input an array of structures containing the FD numbers rather than their position in an array of bits, allowing a more compact input for interest sets that contain widely spaced FDs. 74 36 For small interest sets with densely packed FDs, the @select@ bit mask can take less storage, and hence, copy less information into the kernel. 75 Furthermore, @poll@ is non-destructive, so the array of structures does not have to be re-initialize on every call.76 Like @select@, @poll@ suffers from the limitation that the interest set cannot be changed by other \gls {kthrd}, while a manager thread is blocked in @poll@.37 Furthermore, @poll@ is non-destructive, so the array of structures does not have to be re-initialized on every call. 38 Like @select@, @poll@ suffers from the limitation that the interest set cannot be changed by other \glspl{kthrd}, while a manager thread is blocked in @poll@. 77 39 78 40 \paragraph{\lstinline{epoll}} follows after @poll@, and places the interest set in the kernel rather than the application, where it is managed by an internal \gls{kthrd}. … … 84 46 However, all three of these I/O systems have limitations. 85 47 The @man@ page for @O_NONBLOCK@ mentions that ``[@O_NONBLOCK@] has no effect for regular files and block devices'', which means none of these three system calls are viable multiplexing strategies for these types of \io operations. 86 Furthermore, @epoll@ has been shown to have problems with pipes and ttys~\cit{Peter's examples in some fashion}. 48 Furthermore, TTYs can also be tricky to use since they can take different forms based on how the command is executed. 49 For example, @epoll@ rejects FDs pointing to regular files or block devices, which includes @stdin@ when using shell redirections~\cite[\S~3.6]{MAN:bash}, but does not reject shell pipelines~\cite[\S~3.2.3]{MAN:bash}, which includes pipelines into @stdin@. 87 50 Finally, none of these are useful solutions for multiplexing \io operations that do not have a corresponding file descriptor and can be awkward for operations using multiple file descriptors. 88 51 … … 90 53 An alternative to @O_NONBLOCK@ is the AIO interface. 91 54 Its interface lets programmers enqueue operations to be performed asynchronously by the kernel. 
92 Completions of these operations can be communicated in various ways: either by spawning a new \gls{kthrd}, sending a Linux signal, or by polling for completion of one or more operation.93 For this work, spawning a new \gls{kthrd} is counter -productive but a related solution is discussed in Section~\ref{io:morethreads}.94 Using interrupt s handlers can also lead to fairly complicated interactions between subsystems and hasnon-trivial cost.55 Completions of these operations can be communicated in various ways: either by spawning a new \gls{kthrd}, sending a Linux signal, or polling for completion of one or more operations. 56 For this work, spawning a new \gls{kthrd} is counterproductive but a related solution is discussed in Section~\ref{io:morethreads}. 57 Using interrupt handlers can also lead to fairly complicated interactions between subsystems and has a non-trivial cost. 95 58 Leaving polling for completion, which is similar to the previous system calls. 96 59 AIO only supports read and write operations to file descriptors, it does not have the same limitation as @O_NONBLOCK@, \ie, the file descriptors can be regular files and blocked devices. 97 60 It also supports batching multiple operations in a single system call. 98 61 99 AIO offers two different approaches to polling: @aio_error@ can be used as a spinning form of polling, returning @EINPROGRESS@ until the operation is completed, and @aio_suspend@ can be used similarly to @select@, @poll@ or @epoll@, to wait until one or more requests have completed.100 For the purpose of\io multiplexing, @aio_suspend@ is the best interface.62 AIO offers two different approaches to polling: @aio_error@ can be used as a spinning form of polling, returning @EINPROGRESS@ until the operation is completed, and @aio_suspend@ can be used similarly to @select@, @poll@ or @epoll@, to wait until one or more requests have been completed. 63 For \io multiplexing, @aio_suspend@ is the best interface. 101 64 However, even if AIO requests can be submitted concurrently, @aio_suspend@ suffers from the same limitation as @select@ and @poll@, \ie, the interest set cannot be dynamically changed while a call to @aio_suspend@ is in progress. 102 AIO also suffers from the limitation of specifying which requests have completed, \ie programmers have to poll each request in the interest set using @aio_error@ to identify the completed requests.65 AIO also suffers from the limitation of specifying which requests have been completed, \ie programmers have to poll each request in the interest set using @aio_error@ to identify the completed requests. 103 66 This limitation means that, like @select@ and @poll@ but not @epoll@, the time needed to examine polling results increases based on the total number of requests monitored, not the number of completed requests. 104 67 Finally, AIO does not seem to be a popular interface, which I believe is due in part to this poor polling interface. … … 124 87 in 125 88 ``some kind of arbitrary \textit{queue up asynchronous system call} model''. 126 This description is actuallyquite close to the interface described in the next section.89 This description is quite close to the interface described in the next section. 127 90 128 91 \subsection{\lstinline{io_uring}} … … 135 98 In addition to supporting reads and writes to any file descriptor like AIO, it supports other operations like @open@, @close@, @fsync@, @accept@, @connect@, @send@, @recv@, @splice@, \etc. 
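For contrast, a minimal sketch of the AIO polling pattern criticized above, reduced to a single outstanding request, might look as follows.
\begin{cfa}
#include <aio.h>
#include <errno.h>
#include <string.h>

ssize_t aio_read_wait( int fd, void * buf, size_t count, off_t offset ) {
	struct aiocb cb;
	memset( &cb, 0, sizeof(cb) );	// unused fields must be zero
	cb.aio_fildes = fd;
	cb.aio_buf    = buf;
	cb.aio_nbytes = count;
	cb.aio_offset = offset;
	if ( aio_read( &cb ) != 0 ) return -1;	// enqueue the asynchronous read
	const struct aiocb * list[1] = { &cb };
	while ( aio_error( &cb ) == EINPROGRESS ) {	// poll this specific request
		aio_suspend( list, 1, 0 );	// block until some enqueued request completes
	}
	return aio_return( &cb );	// collect the final result exactly once
}
\end{cfa}
With many outstanding requests, the loop over @aio_error@ grows with the size of the interest set rather than with the number of completions, which is exactly the cost a completion queue avoids.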
136 99 137 On top of these, @io_uring@ adds many extras like avoiding copies between the kernel and user -space using shared memory, allowing different mechanisms to communicate with device drivers, and supporting chains of requests, \ie, requests that automatically trigger followup requests on completion.100 On top of these, @io_uring@ adds many extras like avoiding copies between the kernel and user space using shared memory, allowing different mechanisms to communicate with device drivers, and supporting chains of requests, \ie, requests that automatically trigger follow-up requests on completion. 138 101 139 102 \subsection{Extra Kernel Threads}\label{io:morethreads} 140 Finally, if the operating system does not offer a satisfactory form of asynchronous \io operations, an ad -hoc solution is to create a pool of \glspl{kthrd} and delegate operations to it to avoid blocking \glspl{proc}, which is a compromise for multiplexing.141 In the worst case, where all \ glspl{thrd}are consistently blocking on \io, it devolves into 1-to-1 threading.142 However, regardless of the frequency of \io operations, it achieves the fundamental goal of not blocking \glspl{proc} when \ glspl{thrd}are ready to run.103 Finally, if the operating system does not offer a satisfactory form of asynchronous \io operations, an ad hoc solution is to create a pool of \glspl{kthrd} and delegate operations to it to avoid blocking \glspl{proc}, which is a compromise for multiplexing. 104 In the worst case, where all \ats are consistently blocking on \io, it devolves into 1-to-1 threading. 105 However, regardless of the frequency of \io operations, it achieves the fundamental goal of not blocking \glspl{proc} when \ats are ready to run. 143 106 This approach is used by languages like Go~\cite{GITHUB:go}, frameworks like libuv~\cite{libuv}, and web servers like Apache~\cite{apache} and NGINX~\cite{nginx}, since it has the advantage that it can easily be used across multiple operating systems. 144 107 This advantage is especially relevant for languages like Go, which offer a homogeneous \glsxtrshort{api} across all platforms. 145 As opposed to C, which has a very limited standard apifor \io, \eg, the C standard library has no networking.108 As opposed to C, which has a very limited standard \glsxtrshort{api} for \io, \eg, the C standard library has no networking. 146 109 147 110 \subsection{Discussion} … … 155 118 \section{Event-Engine} 156 119 An event engine's responsibility is to use the kernel interface to multiplex many \io operations onto few \glspl{kthrd}. 157 In concrete terms, this means \ glspl{thrd} enter the engine through an interface, the event engine then starts an operation and parks the calling \glspl{thrd}, returning control to the \gls{proc}.158 The parked \ glspl{thrd} are then rescheduled by the event engine once the desired operation hascompleted.120 In concrete terms, this means \ats enter the engine through an interface, the event engine then starts an operation and parks the calling \ats, returning control to the \gls{proc}. 121 The parked \ats are then rescheduled by the event engine once the desired operation has been completed. 159 122 160 123 \subsection{\lstinline{io_uring} in depth}\label{iouring} … … 171 134 \centering 172 135 \input{io_uring.pstex_t} 173 \caption[Overview of \lstinline{io_uring}]{Overview of \lstinline{io_uring} \smallskip\newline Two ring buffer are used to communicate with the kernel, one for completions~(right) and one for submissions~(left). 
The submission ring indexes into a pre-allocated array (denoted \emph{S}) instead.}136 \caption[Overview of \lstinline{io_uring}]{Overview of \lstinline{io_uring} \smallskip\newline Two ring buffers are used to communicate with the kernel, one for completions~(right) and one for submissions~(left). The submission ring indexes into a pre-allocated array (denoted \emph{S}) instead.} 174 137 \label{fig:iouring} 175 138 \end{figure} … … 184 147 \item 185 148 The SQE is filled according to the desired operation. 186 This step is straight 187 The only detail worth mentioning is that SQEs have a @user_data@ field that must be filled in orderto match submission and completion entries.149 This step is straightforward. 150 The only detail worth mentioning is that SQEs have a @user_data@ field that must be filled to match submission and completion entries. 188 151 \item 189 152 The SQE is submitted to the submission ring by appending the index of the SQE to the ring following regular ring buffer steps: \lstinline{buffer[head] = item; head++}. … … 207 170 208 171 The @io_uring_enter@ system call is protected by a lock inside the kernel. 209 This protection means that concurrent call to @io_uring_enter@ using the same instance are possible, but there is no performance gained from parallel calls to @io_uring_enter@.172 This protection means that concurrent calls to @io_uring_enter@ using the same instance are possible, but there is no performance gained from parallel calls to @io_uring_enter@. 210 173 It is possible to do the first three submission steps in parallel; 211 174 however, doing so requires careful synchronization. … … 216 179 This restriction means \io request bursts may have to be subdivided and submitted in chunks at a later time. 217 180 218 An important detail to keep in mind is that just like ``The cloud is just someone else's computer''\cite{xkcd:cloud}, asynchronous operations are just operation using someone else's threads.219 Indeed, asynchronous operation can require computation time to complete, which means that if this time is not taken from the thread that triggered the asynchronous operation, it must be taken from some other threads.181 An important detail to keep in mind is that just like ``The cloud is just someone else's computer''\cite{xkcd:cloud}, asynchronous operations are just operations using someone else's threads. 182 Indeed, asynchronous operations can require computation time to complete, which means that if this time is not taken from the thread that triggered the asynchronous operation, it must be taken from some other threads. 220 183 In this case, the @io_uring@ operations that cannot be handled directly in the system call must be delegated to some other \gls{kthrd}. 221 184 To this end, @io_uring@ maintains multiple \glspl{kthrd} inside the kernel that are not exposed to the user. 222 Th ere are three kindof operations that can need the \glspl{kthrd}:185 Three kinds of operations that can need the \glspl{kthrd}: 223 186 224 187 \paragraph{Operations using} @IOSQE_ASYNC@. … … 228 191 This is also a fairly simple case. As mentioned earlier in this chapter, [@O_NONBLOCK@] has no effect for regular files and block devices. 229 192 @io_uring@ must also take this reality into account by delegating operations on regular files and block devices. 
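The submission and completion steps described above can be seen end-to-end in the following sketch, which uses the liburing helper library purely for brevity; the \CFA runtime manipulates the raw rings directly and parks the calling \at instead of blocking in @io_uring_wait_cqe@.
\begin{cfa}
#include <liburing.h>

ssize_t uring_read( struct io_uring * ring, int fd, void * buf, unsigned len ) {
	struct io_uring_sqe * sqe = io_uring_get_sqe( ring );	// 1. allocate an SQE
	if ( sqe == 0 ) return -1;	// ring full, caller must retry later
	io_uring_prep_read( sqe, fd, buf, len, 0 );	// 2. fill in the operation
	io_uring_sqe_set_data( sqe, buf );	// user_data links this SQE to its CQE
	io_uring_submit( ring );	// 3./4. append to the submission ring and enter the kernel
	struct io_uring_cqe * cqe;
	io_uring_wait_cqe( ring, &cqe );	// wait for a completion
	ssize_t res = cqe->res;	// signed 32-bit result
	io_uring_cqe_seen( ring, cqe );	// release the CQE back to the kernel
	return res;
}
\end{cfa}
The delegation of blocking operations to kernel workers, described next, happens entirely behind this interface.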
230 In fact @io_uring@ maintains a pool of \glspl{kthrd} dedicated to these operations, which are referred to as \newterm{bounded workers}.193 In fact, @io_uring@ maintains a pool of \glspl{kthrd} dedicated to these operations, which are referred to as \newterm{bounded workers}. 231 194 232 195 \paragraph{Unbounded operations that must be retried.} … … 235 198 @io_uring@ maintains a separate pool for these operations. 236 199 The \glspl{kthrd} in this pool are referred to as \newterm{unbounded workers}. 237 Unbounded workers are also responsible of handling operations using @IOSQE_ASYNC@.200 Unbounded workers are also responsible for handling operations using @IOSQE_ASYNC@. 238 201 239 202 @io_uring@ implicitly spawns and joins both the bounded and unbounded workers based on its evaluation of the needs of the workload. 240 203 This effectively encapsulates the work that is needed when using @epoll@. 241 Indeed, @io_uring@ does not change Linux's underlying handling of \io opeartions, it simply offers an asynchronous \glsxtrshort{api} on top of the existing system.204 Indeed, @io_uring@ does not change Linux's underlying handling of \io operations, it simply offers an asynchronous \glsxtrshort{api} on top of the existing system. 242 205 243 206 244 207 \subsection{Multiplexing \io: Submission} 245 208 246 The submission side is the most complicated aspect of @io_uring@ and the completion side effectively follows from the design decisions made in the submission side.209 The submission side is the most complicated aspect of @io_uring@ and the completion side effectively follows from the design decisions made on the submission side. 247 210 While there is freedom in designing the submission side, there are some realities of @io_uring@ that must be taken into account. 248 211 It is possible to do the first steps of submission in parallel; … … 255 218 As described in Chapter~\ref{practice}, this does not translate into constant CPU usage.}. 256 219 Note that once an operation completes, there is nothing that ties it to the @io_uring@ instance that handled it. 257 There is nothing preventing a new operation with, \eg the same file descriptors to a different @io_uring@ instance.220 Nothing prevents a new operation, with for example the same file descriptor, from using a different @io_uring@ instance. 258 221 259 222 A complicating aspect of submission is @io_uring@'s support for chains of operations, where the completion of an operation triggers the submission of the next operation on the link. … … 263 226 Support for this feature can be fulfilled simply by supporting arbitrary user code between allocation and submission. 264 227 265 Similar to scheduling, sharding @io_uring@ instances can be done privately, \ie, one instance per \glspl{proc}, in decoupled pools, \ie, a pool of \glspl{proc} use a pool of @io_uring@ instances without one-to-one coupling between any given instance and any given \gls{proc}, or some mix of the two.228 Similar to scheduling, sharding @io_uring@ instances can be done privately, \ie, one instance per \proc, in decoupled pools, \ie, a pool of \procs using a pool of @io_uring@ instances without one-to-one coupling between any given instance and any given \gls{proc}, or some mix of the two. 266 229 These three sharding approaches are analyzed. 267 230 268 231 \subsubsection{Private Instances} 269 232 The private approach creates one ring instance per \gls{proc}, \ie one-to-one coupling. 
270 This alleviates the need for synchronization on the submissions, requiring only that \ glspl{thrd}are not time-sliced during submission steps.271 This requirement is the same as accessing @thread_local@ variables, where a \ gls{thrd}is accessing kernel-thread data, is time-sliced, and continues execution on another kernel thread but is now accessing the wrong data.272 This failure is the serially reusable problem~\cite{SeriallyReusable}.273 Hence, allocated SQEs must be submitted to the same ring on the same \gls{proc}, which effectively forces the application to submit SQEs in allocation order.\footnote{274 To remove this requirement, a \ gls{thrd} needs the ability to ``yield to a specific \gls{proc}'', \ie,park with the guarantee it unparks on a specific \gls{proc}, \ie the \gls{proc} attached to the correct ring.}233 This alleviates the need for synchronization on the submissions, requiring only that \ats are not time-sliced during submission steps. 234 This requirement is the same as accessing @thread_local@ variables, where a \at is accessing kernel-thread data, is time-sliced, and continues execution on another kernel thread but is now accessing the wrong data. 235 This failure is the \newterm{serially reusable problem}~\cite{SeriallyReusable}. 236 Hence, allocated SQEs must be submitted to the same ring on the same \gls{proc}, which effectively forces the application to submit SQEs in order of allocation.\footnote{ 237 To remove this requirement, a \at needs the ability to ``yield to a specific \gls{proc}'', \ie, \park with the guarantee it unparks on a specific \gls{proc}, \ie the \gls{proc} attached to the correct ring.} 275 238 From the subsystem's point of view, the allocation and submission are sequential, greatly simplifying both. 276 239 In this design, allocation and submission form a partitioned ring buffer as shown in Figure~\ref{fig:pring}. 277 Once added to the ring buffer, the attached \gls{proc} has a significant amount of flexibility with regard sto when to perform the system call.278 Possible options are: when the \gls{proc} runs out of \ glspl{thrd} to run, after running a given number of \glspl{thrd}, \etc.240 Once added to the ring buffer, the attached \gls{proc} has a significant amount of flexibility with regard to when to perform the system call. 241 Possible options are: when the \gls{proc} runs out of \ats to run, after running a given number of \ats, \etc. 279 242 280 243 \begin{figure} 281 244 \centering 282 245 \input{pivot_ring.pstex_t} 283 \caption[Partitioned ring buffer]{Partitioned ring buffer \smallskip\newline Allocated sqes are append ingto the first partition.246 \caption[Partitioned ring buffer]{Partitioned ring buffer \smallskip\newline Allocated sqes are appended to the first partition. 284 247 When submitting, the partition is advanced. 285 248 The kernel considers the partition as the head of the ring.} … … 288 251 289 252 This approach has the advantage that it does not require much of the synchronization needed in a shared approach. 
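As an illustration of this flexibility in choosing when to perform the system call, a per-\gls{proc} flush policy can be as simple as the following hedged sketch, where @proc_ring@ and @flush_ring@ are illustrative names rather than the actual \CFA runtime interface.
\begin{cfa}
#include <stdbool.h>

struct proc_ring {
	unsigned to_submit;	// SQEs appended since the last system call
	unsigned batch;	// flush after this many submissions
};
extern void flush_ring( struct proc_ring * );	// performs the io_uring_enter system call

static inline void maybe_flush( struct proc_ring * r, bool proc_idle ) {
	// flush when the processor has nothing left to run, or when the batch is full
	if ( proc_idle || r->to_submit >= r->batch ) {
		flush_ring( r );
		r->to_submit = 0;
	}
}
\end{cfa}
Because the instance is private, neither the counter nor the flush requires any synchronization.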
290 However, this benefit means \ glspl{thrd} submitting \io operations have less flexibility: they cannotpark or yield, and several exceptional cases are handled poorly.291 Instances running out of SQEs cannot run \ glspl{thrd}wanting to do \io operations.292 In this case, the \io \ gls{thrd}needs to be moved to a different \gls{proc}, and the only current way of achieving this is to @yield()@ hoping to be scheduled on a different \gls{proc} with free SQEs, which is not guaranteed.253 However, this benefit means \ats submitting \io operations have less flexibility: they cannot \park or yield, and several exceptional cases are handled poorly. 254 Instances running out of SQEs cannot run \ats wanting to do \io operations. 255 In this case, the \io \at needs to be moved to a different \gls{proc}, and the only current way of achieving this is to @yield()@ hoping to be scheduled on a different \gls{proc} with free SQEs, which is not guaranteed. 293 256 294 257 A more involved version of this approach tries to solve these problems using a pattern called \newterm{helping}. 295 \ Glspl{thrd} that cannot submit \io operations, either because of an allocation failure or migrationto a different \gls{proc} between allocation and submission, create an \io object and add it to a list of pending submissions per \gls{proc} and a list of pending allocations, probably per cluster.296 While there is still the strong coupling between \glspl{proc} and @io_uring@ instances, these data structures allow moving \glspl{thrd}to a specific \gls{proc}, when the current \gls{proc} cannot fulfill the \io request.297 298 Imagine a simple scenario with two \ glspl{thrd} on two \glspl{proc}, where one \gls{thrd} submits an \io operation and then sets a flag, while the other \gls{thrd}spins until the flag is set.299 Assume both \ glspl{thrd} are running on the same \gls{proc}, and the \io \gls{thrd} is preempted between allocation and submission, moved to the second \gls{proc}, and the original \gls{proc} starts running the spinning \gls{thrd}.300 In this case, the helping solution has the \io \ gls{thrd}append an \io object to the submission list of the first \gls{proc}, where the allocation was made.301 No other \gls{proc} can help the \ gls{thrd}since @io_uring@ instances are strongly coupled to \glspl{proc}.302 However, the \io \gls{proc} is unable to help because it is executing the spinning \ gls{thrd}resulting in a deadlock.303 While this example is artificial, in the presence of many \ glspl{thrd}, it is possible for this problem toarise ``in the wild''.258 \ats that cannot submit \io operations, either because of an allocation failure or \glslink{atmig}{migration} to a different \gls{proc} between allocation and submission, create an \io object and add it to a list of pending submissions per \gls{proc} and a list of pending allocations, probably per cluster. 259 While there is still a strong coupling between \glspl{proc} and @io_uring@ instances, these data structures allow moving \ats to a specific \gls{proc}, when the current \gls{proc} cannot fulfill the \io request. 260 261 Imagine a simple scenario with two \ats on two \glspl{proc}, where one \at submits an \io operation and then sets a flag, while the other \at spins until the flag is set. 262 Assume both \ats are running on the same \gls{proc}, and the \io \at is preempted between allocation and submission, moved to the second \gls{proc}, and the original \gls{proc} starts running the spinning \at. 
263 In this case, the helping solution has the \io \at append an \io object to the submission list of the first \gls{proc}, where the allocation was made. 264 No other \gls{proc} can help the \at since @io_uring@ instances are strongly coupled to \glspl{proc}. 265 However, the \io \gls{proc} is unable to help because it is executing the spinning \at resulting in a deadlock. 266 While this example is artificial, in the presence of many \ats, this problem can arise ``in the wild''. 304 267 Furthermore, this pattern is difficult to reliably detect and avoid. 305 Once in this situation, the only escape is to interrupt ed the spinning \gls{thrd}, either directly or via some regular preemption, \eg time slicing.306 Having to interrupt \ glspl{thrd}for this purpose is costly, the latency can be large between interrupts, and the situation may be hard to detect.268 Once in this situation, the only escape is to interrupt the spinning \at, either directly or via some regular preemption, \eg time slicing. 269 Having to interrupt \ats for this purpose is costly, the latency can be large between interrupts, and the situation may be hard to detect. 307 270 Interrupts are needed here entirely because the \gls{proc} is tied to an instance it is not using. 308 Therefore, a more satisfying solution is for the \ gls{thrd}submitting the operation to notice that the instance is unused and simply go ahead and use it.271 Therefore, a more satisfying solution is for the \at submitting the operation to notice that the instance is unused and simply go ahead and use it. 309 272 This approach is presented shortly. 310 273 311 274 \subsubsection{Public Instances} 312 275 The public approach creates decoupled pools of @io_uring@ instances and processors, \ie without one-to-one coupling. 313 \ Glspl{thrd}attempting an \io operation pick one of the available instances and submit the operation to that instance.314 Since there is no coupling between @io_uring@ instances and \glspl{proc} in this approach, \ glspl{thrd}running on more than one \gls{proc} can attempt to submit to the same instance concurrently.276 \ats attempting an \io operation pick one of the available instances and submit the operation to that instance. 277 Since there is no coupling between @io_uring@ instances and \glspl{proc} in this approach, \ats running on more than one \gls{proc} can attempt to submit to the same instance concurrently. 315 278 Because @io_uring@ effectively sets the amount of sharding needed to avoid contention on its internal locks, performance in this approach is based on two aspects: 316 279 \begin{itemize} … … 319 282 \item 320 283 The scheme to route \io requests to specific @io_uring@ instances does not introduce contention. 321 This aspect has anoversized importance because it comes into play before the sharding of instances, and as such, all \glspl{hthrd} can contend on the routing algorithm.284 This aspect has oversized importance because it comes into play before the sharding of instances, and as such, all \glspl{hthrd} can contend on the routing algorithm. 322 285 \end{itemize} 323 286 324 287 Allocation in this scheme is fairly easy. 
325 Free SQEs, \ie, SQEs that are not currently being used to represent a request, can be written to safely and have a field called @user_data@ that the kernel only reads to copy to @cqe@s.326 Allocation also requires no ordering guaranteeas all free SQEs are interchangeable.288 Free SQEs, \ie, SQEs that are not currently being used to represent a request, can be written-to safely and have a field called @user_data@ that the kernel only reads to copy to CQEs. 289 Allocation also does not require ordering guarantees as all free SQEs are interchangeable. 327 290 The only added complexity is that the number of SQEs is fixed, which means allocation can fail. 328 291 329 Allocation failures need to be pushed to a routing algorithm: \ glspl{thrd}attempting \io operations must not be directed to @io_uring@ instances without sufficient SQEs available.330 Furthermore, the routing algorithm should block operations up -front,if none of the instances have available SQEs.331 332 Once an SQE is allocated, \ glspl{thrd} insert the \io request information,and keep track of the SQE index and the instance it belongs to.333 334 Once an SQE is filled in, it is added to the submission ring buffer, an operation that is not thread -safe, and then the kernel must be notified using the @io_uring_enter@ system call.335 The submission ring buffer is the same size as the pre-allocated SQE buffer, therefore pushing to the ring buffer cannot fail because it would mean a \lstinline{sqe}multiple times in the ring buffer, which is undefined behaviour.336 However, as mentioned, the system call itself can fail with the expectation that it can be retried once some submitted operations complete.292 Allocation failures need to be pushed to a routing algorithm: \ats attempting \io operations must not be directed to @io_uring@ instances without sufficient SQEs available. 293 Furthermore, the routing algorithm should block operations upfront if none of the instances have available SQEs. 294 295 Once an SQE is allocated, \ats insert the \io request information and keep track of the SQE index and the instance it belongs to. 296 297 Once an SQE is filled in, it is added to the submission ring buffer, an operation that is not thread safe, and then the kernel must be notified using the @io_uring_enter@ system call. 298 The submission ring buffer is the same size as the pre-allocated SQE buffer, therefore pushing to the ring buffer cannot fail because it would mean an SQE multiple times in the ring buffer, which is undefined behaviour. 299 However, as mentioned, the system call itself can fail with the expectation that it can be retried once some submitted operations are complete. 337 300 338 301 Since multiple SQEs can be submitted to the kernel at once, it is important to strike a balance between batching and latency. 
339 Operations that are ready to be submitted should be batched together in few system calls, but at the same time, operations should not be left pending for long period of times before being submitted.340 Balancing submission can be handled by either designating one of the submitting \ glspl{thrd} as the being responsible for the system call for the current batch of SQEs or by having some other party regularly submitting all ready SQEs, \eg, the poller \gls{thrd}mentioned later in this section.341 342 Ideally, when multiple \ glspl{thrd} attempt to submit operations to the same @io_uring@ instance, all requests should be batched together and one of the \glspl{thrd}is designated to do the system call on behalf of the others, called the \newterm{submitter}.302 Operations that are ready to be submitted should be batched together in few system calls, but at the same time, operations should not be left pending for long periods before being submitted. 303 Balancing submission can be handled by either designating one of the submitting \ats as the \at responsible for the system call for the current batch of SQEs or by having some other party regularly submit all ready SQEs, \eg, the poller \at mentioned later in this section. 304 305 Ideally, when multiple \ats attempt to submit operations to the same @io_uring@ instance, all requests should be batched together and one of the \ats is designated to do the system call on behalf of the others, called the \newterm{submitter}. 343 306 However, in practice, \io requests must be handed promptly so there is a need to guarantee everything missed by the current submitter is seen by the next one. 344 Indeed, as long as there is a ``next'' submitter, \ glspl{thrd}submitting new \io requests can move on, knowing that some future system call includes their request.345 Once the system call is done, the submitter must also free SQEs so that the allocator can reuse dthem.346 347 Finally, the completion side is much simpler since the @io_uring@ system -call enforces a natural synchronization point.348 Polling simply needs to regularly do the system call, go through the produced CQEs and communicate the result back to the originating \ glspl{thrd}.349 Since CQEs only own a signed 32 307 Indeed, as long as there is a ``next'' submitter, \ats submitting new \io requests can move on, knowing that some future system call includes their request. 308 Once the system call is done, the submitter must also free SQEs so that the allocator can reuse them. 309 310 Finally, the completion side is much simpler since the @io_uring@ system call enforces a natural synchronization point. 311 Polling simply needs to regularly do the system call, go through the produced CQEs and communicate the result back to the originating \ats. 312 Since CQEs only own a signed 32-bit result, in addition to the copy of the @user_data@ field, all that is needed to communicate the result is a simple future~\cite{wiki:future}. 350 313 If the submission side does not designate submitters, polling can also submit all SQEs as it is polling events. 351 A simple approach to polling is to allocate a \ gls{thrd} per @io_uring@ instance and simply let the poller \glspl{thrd}poll their respective instances when scheduled.314 A simple approach to polling is to allocate a \at per @io_uring@ instance and simply let the poller \ats poll their respective instances when scheduled. 352 315 353 316 With the pool of SEQ instances approach, the big advantage is that it is fairly flexible. 
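A hedged sketch of the per-instance poller just described, using liburing for brevity and a hypothetical @fulfil@ routine for the future stored in the @user_data@ field:
\begin{cfa}
#include <liburing.h>

struct future_t;	// holds the signed 32-bit result (illustrative)
extern void fulfil( struct future_t *, int result );	// unblocks the originating thread

void poller_main( struct io_uring * ring ) {
	for ( ;; ) {
		io_uring_submit( ring );	// also submits any SQEs missed by the submitters
		struct io_uring_cqe * cqe;
		if ( io_uring_wait_cqe( ring, &cqe ) != 0 ) continue;	// wait for at least one completion
		do {
			fulfil( io_uring_cqe_get_data( cqe ), cqe->res );	// hand the result back
			io_uring_cqe_seen( ring, cqe );	// consume the CQE
		} while ( io_uring_peek_cqe( ring, &cqe ) == 0 );	// drain the rest without blocking
	}
}
\end{cfa}
In the \CFA runtime, the blocking wait is of course replaced by scheduling the poller like any other \at, so the shared pool itself remains the flexible part of the design.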
354 It does not impose restrictions on what \glspl{thrd} submitting \io operations can and cannot do between allocations and submissions.317 It does not impose restrictions on what \ats submitting \io operations can and cannot do between allocations and submissions. 355 318 It also can gracefully handle running out of resources, SQEs or the kernel returning @EBUSY@. 356 The down side to this approach is that many of the steps used for submitting need complex synchronization to work properly. 357 The routing and allocation algorithm needs to keep track of which ring instances have available SQEs, block incoming requests if no instance is available, prevent barging if \glspl{thrd} are already queued up waiting for SQEs and handle SQEs being freed. 319 The downside to this approach is that many of the steps used for submitting need complex synchronization to work properly. 320 The routing and allocation algorithm needs to keep track of which ring instances have available SQEs, block incoming requests if no instance is available, prevent barging if \ats are already queued up waiting for SQEs and handle SQEs being freed. 358 321 The submission side needs to safely append SQEs to the ring buffer, correctly handle chains, make sure no SQE is dropped or left pending forever, notify the allocation side when SQEs can be reused, and handle the kernel returning @EBUSY@. 359 All this synchronization has a significant cost, and compared to the private-instance approach, this synchronization is entirely overhead.322 Compared to the private-instance approach, all this synchronization has a significant cost and is entirely overhead. 360 323 361 324 \subsubsection{Instance borrowing} 362 325 Both of the prior approaches have undesirable aspects that stem from tight or loose coupling between @io_uring@ and \glspl{proc}. 363 326 The first approach suffers from tight coupling causing problems when a \gls{proc} does not benefit from the coupling. 364 The second approach suffers from loose coupling causing operations to have synchronization overhead, which tighter coupling avoids.327 The second approach suffers from loose coupling causing operations to have synchronization overhead, which tighter coupling avoids. 365 328 When \glspl{proc} are continuously issuing \io operations, tight coupling is valuable since it avoids synchronization costs. 366 329 However, in unlikely failure cases or when \glspl{proc} are not using their instances, tight coupling is no longer advantageous. … … 370 333 371 334 In this approach, each cluster, see Figure~\ref{fig:system}, owns a pool of @io_uring@ instances managed by an \newterm{arbiter}. 372 When a \gls{thrd} attempts to issue an \io operation, it ask for an instance from the arbiter and issues requests to that instance.373 This instance is now bound to the \gls{proc} the \gls{thrd} is running on.374 This binding is kept until the arbiter decides to revoke it, taking back the instance and reverting the \gls{proc} to its initial state with respect to \io. 335 When a \at attempts to issue an \io operation, it asks for an instance from the arbiter and issues requests to that instance. 336 This instance is now bound to the \gls{proc} the \at is running on. 337 This binding is kept until the arbiter decides to revoke it, taking back the instance and reverting the \gls{proc} to its initial \io state. 375 338 This tight coupling means that synchronization can be minimal since only one \gls{proc} can use the instance at a time, akin to the private instances approach. 
376 339 However, it differs in that revocation by the arbiter means this approach does not suffer from the deadlock scenario described above. … … 380 343 \item The current \gls{proc} does not hold an instance. 381 344 \item The current instance does not have sufficient SQEs to satisfy the request. 382 \item The current \gls{proc} has a wrong instance, this happens if the submitting \ gls{thrd}context-switched between allocation and submission, called \newterm{external submissions}.345 \item The current \gls{proc} has a wrong instance, this happens if the submitting \at context-switched between allocation and submission, called \newterm{external submissions}. 383 346 \end{enumerate} 384 347 However, even when the arbiter is not directly needed, \glspl{proc} need to make sure that their instance ownership is not being revoked, which is accomplished by a lock-\emph{less} handshake.\footnote{ 385 Note the handshake is not lock 348 Note the handshake is not lock-\emph{free} since it lacks the proper progress guarantee.} 386 349 A \gls{proc} raises a local flag before using its borrowed instance and checks if the instance is marked as revoked or if the arbiter has raised its flag. 387 350 If not, it proceeds, otherwise it delegates the operation to the arbiter. … … 389 352 390 353 Correspondingly, before revoking an instance, the arbiter marks the instance and then waits for the \gls{proc} using it to lower its local flag. 391 Only then does it reclaim the instance and potentially assign it to an 354 Only then does it reclaim the instance and potentially assign it to another \gls{proc}. 392 355 393 356 The arbiter maintains four lists around which it makes its decisions: … … 406 369 407 370 \paragraph{Pending Allocations} are handled by the arbiter when it has available instances and can directly hand over the instance and satisfy the request. 408 Otherwise, it must hold on to the list of threads until SQEs are made available again.371 Otherwise, it must hold on to the list of threads until SQEs are made available again. 409 372 This handling is more complex when an allocation requires multiple SQEs, since the arbiter must make a decision between satisfying requests in FIFO ordering or for fewer SQEs. 410 373 411 374 While an arbiter has the potential to solve many of the problems mentioned above, it also introduces a significant amount of complexity. 412 Tracking which processors are borrowing which instances and which instances have SQEs available ends -up adding a significant synchronization prelude to any I/O operation.375 Tracking which processors are borrowing which instances and which instances have SQEs available ends up adding a significant synchronization prelude to any I/O operation. 413 376 Any submission must start with a handshake that pins the currently borrowed instance, if available. 414 377 An attempt to allocate is then made, but the arbiter can concurrently be attempting to allocate from the same instance from a different \gls{hthrd}. 415 378 Once the allocation is completed, the submission must check that the instance is still burrowed before attempting to flush. 416 379 These synchronization steps turn out to have a similar cost to the multiple shared-instances approach. 
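A hedged sketch of this handshake using C11 atomics, with illustrative field names, is shown below; the store to the local flag must be ordered before the load of the revocation mark, hence sequential consistency on both sides.
\begin{cfa}
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct instance { atomic_bool revoked; /* ... rings ... */ };
struct proc_io  { atomic_bool in_use; struct instance * inst; };

// processor side: pin the borrowed instance for the duration of one operation
static inline bool try_pin( struct proc_io * p ) {
	atomic_store( &p->in_use, true );	// raise the local flag (sequentially consistent)
	if ( p->inst == NULL || atomic_load( &p->inst->revoked ) ) {
		atomic_store( &p->in_use, false );
		return false;	// delegate the operation to the arbiter instead
	}
	return true;	// safe to use the instance until unpin
}
static inline void unpin( struct proc_io * p ) { atomic_store( &p->in_use, false ); }

// arbiter side: mark the instance, then wait for the processor to finish its operation
static inline void revoke( struct proc_io * p ) {
	atomic_store( &p->inst->revoked, true );
	while ( atomic_load( &p->in_use ) ) {}	// not lock-free: waits on the processor
	// the instance can now be reassigned to another processor
}
\end{cfa}
Even when no revocation occurs, this prelude runs on every operation, which is where the extra cost comes from.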
417 Furthermore, if the number of instances does not match the number of processors actively submitting I/O, the system can fall into a state where instances are constantly being revoked and end -up cycling the processors, which leads to significant cache deterioration.380 Furthermore, if the number of instances does not match the number of processors actively submitting I/O, the system can fall into a state where instances are constantly being revoked and end up cycling the processors, which leads to significant cache deterioration. 418 381 For these reasons, this approach, which sounds promising on paper, does not improve on the private instance approach in practice. 419 420 \subsubsection{Private Instances V2}421 422 % Verbs of this design423 424 % Allocation: obtaining an sqe from which to fill in the io request, enforces the io instance to use since it must be the one which provided the sqe. Must interact with the arbiter if the instance does not have enough sqe for the allocation. (Typical allocation will ask for only one sqe, but chained sqe must be allocated from the same context so chains of sqe must be allocated in bulks)425 426 % Submission: simply adds the sqe(s) to some data structure to communicate that they are ready to go. This operation can't fail because there are as many spots in the submit buffer than there are sqes. Must interact with the arbiter only if the thread was moved between the allocation and the submission.427 428 % Flushing: Taking all the sqes that were submitted and making them visible to the kernel, also counting them in order to figure out what to_submit should be. Must be thread-safe with submission. Has to interact with the Arbiter if there are external submissions. Can't simply use a protected queue because adding to the array is not safe if the ring is still available for submitters. Flushing must therefore: check if there are external pending requests if so, ask the arbiter to flush otherwise use the fast flush operation.429 430 % Collect: Once the system call is done, it returns how many sqes were consumed by the system. These must be freed for allocation. Must interact with the arbiter to notify that things are now ready.431 432 % Handle: process all the produced cqe. No need to interact with any of the submission operations or the arbiter.433 434 435 % alloc():436 % proc.io->in_use = true, __ATOMIC_ACQUIRE437 % if cltr.io.flag || !proc.io || proc.io->flag:438 % return alloc_slow(cltr.io, proc.io)439 440 % a = alloc_fast(proc.io)441 % if a:442 % proc.io->in_use = false, __ATOMIC_RELEASE443 % return a444 445 % return alloc_slow(cltr.io)446 447 % alloc_fast()448 % left = proc.io->submit_q.free.tail - proc.io->submit_q.free.head449 % if num_entries - left < want:450 % return None451 452 % a = ready[head]453 % head = head + 1, __ATOMIC_RELEASE454 455 % alloc_slow()456 % cltr.io.flag = true, __ATOMIC_ACQUIRE457 % while(proc.io && proc.io->in_use) pause;458 459 460 461 % submit(a):462 % proc.io->in_use = true, __ATOMIC_ACQUIRE463 % if cltr.io.flag || proc.io != alloc.io || proc.io->flag:464 % return submit_slow(cltr.io)465 466 % submit_fast(proc.io, a)467 % proc.io->in_use = false, __ATOMIC_RELEASE468 469 % polling()470 % loop:471 % yield472 % flush()473 % io_uring_enter474 % collect475 % handle()476 382 477 383 \section{Interface} 478 384 The last important part of the \io subsystem is its interface. 
479 There are multiple approaches thatcan be offered to programmers, each with advantages and disadvantages.480 The new \io subsystem can replace the C runtime API or extend it, and in the lat er case, the interface can go from very similar to vastly different.385 Multiple approaches can be offered to programmers, each with advantages and disadvantages. 386 The new \io subsystem can replace the C runtime API or extend it, and in the latter case, the interface can go from very similar to vastly different. 481 387 The following sections discuss some useful options using @read@ as an example. 482 The standard Linux interface for C is 388 The standard Linux interface for C is: 483 389 \begin{cfa} 484 390 ssize_t read(int fd, void *buf, size_t count); … … 492 398 However, this approach also entails a plethora of subtle technical challenges, which generally boils down to making a perfect replacement. 493 399 If the \CFA interface replaces only \emph{some} of the calls to glibc, then this can easily lead to esoteric concurrency bugs. 494 Since the gcc ecosystem sdoes not offer a scheme for perfect replacement, this approach was rejected as being laudable but infeasible.400 Since the gcc ecosystem does not offer a scheme for perfect replacement, this approach was rejected as being laudable but infeasible. 495 401 496 402 \subsection{Synchronous Extension} … … 503 409 It comes with the caveat that any code attempting to use it must be recompiled, which is a problem considering the amount of existing legacy C binaries. 504 410 However, it has the advantage of implementation simplicity. 505 Finally, there is a certain irony to using a blocking synchronous interface sfor a feature often referred to as ``non-blocking'' \io.411 Finally, there is a certain irony to using a blocking synchronous interface for a feature often referred to as ``non-blocking'' \io. 506 412 507 413 \subsection{Asynchronous Extension} … … 531 437 This offers more flexibility to users wanting to fully utilize all of the @io_uring@ features. 532 438 However, it is not the most user-friendly option. 533 It obviously imposes a strong dependency between user code and @io_uring@ but at the same time restrict ingusers to usages that are compatible with how \CFA internally uses @io_uring@.439 It obviously imposes a strong dependency between user code and @io_uring@ but at the same time restricts users to usages that are compatible with how \CFA internally uses @io_uring@. -
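As a concrete illustration of the asynchronous-extension option above, an interface of roughly the following shape could be offered; the names @async_read@, @await@, and @io_future_t@ are invented for this sketch, and the eager implementation merely stands in for a real submission to the \io subsystem.
\begin{cfa}
#include <stdlib.h>
#include <unistd.h>

// Sketch of a future-returning read; names and layout are illustrative only.
typedef struct { ssize_t result; } io_future_t;

static io_future_t * async_read( int fd, void * buf, size_t count ) {
	io_future_t * f = (io_future_t *)malloc( sizeof(*f) );
	f->result = read( fd, buf, count );   // placeholder: a real runtime submits and returns immediately
	return f;
}
static ssize_t await( io_future_t * f ) { // a real runtime parks the calling user thread here
	ssize_t r = f->result;
	free( f );
	return r;
}

static void example( int fd ) {
	char b1[4096], b2[4096];
	io_future_t * f1 = async_read( fd, b1, sizeof(b1) );  // both operations can be in flight
	io_future_t * f2 = async_read( fd, b2, sizeof(b2) );  // before either result is needed
	ssize_t r1 = await( f1 ), r2 = await( f2 );
	(void)r1; (void)r2;
}
\end{cfa}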
doc/theses/thierry_delisle_PhD/thesis/text/practice.tex
rebf8ca5 r23a08aa0 16 16 } // delete 4 kernel threads 17 17 \end{cfa} 18 Dynamically allocated processors can be deleted a nany time, \ie their lifetime exceeds the block of creation.18 Dynamically allocated processors can be deleted at any time, \ie their lifetime exceeds the block of creation. 19 19 The consequence is that the scheduler and \io subsystems must know when these \procs come in and out of existence and roll them into the appropriate scheduling algorithms. 20 20 21 21 \section{Manual Resizing} 22 22 Manual resizing is expected to be a rare operation. 23 Programmers normally create/delete processors on a cluster sat startup/teardown.23 Programmers normally create/delete processors on a cluster at startup/teardown. 24 24 Therefore, dynamically changing the number of \procs is an appropriate moment to allocate or free resources to match the new state. 25 25 As such, all internal scheduling arrays that are sized based on the number of \procs need to be @realloc@ed. 26 This requirement also means any references into these arrays, \eg pointers or indexes, may need to be updated if elements are moved for compaction or forany other reason.26 This requirement also means any references into these arrays, \eg pointers or indexes, may need to be updated if elements are moved for compaction or any other reason. 27 27 28 28 There are no performance requirements, within reason, for resizing since it is expected to be rare. 29 29 However, this operation has strict correctness requirements since updating and idle sleep can easily lead to deadlocks. 30 It should also avoid as much as possible any effect on performance when the number of \procs remain constant.30 It should also avoid as much as possible any effect on performance when the number of \procs remains constant. 31 31 This later requirement prohibits naive solutions, like simply adding a global lock to the ready-queue arrays. 32 32 33 33 \subsection{Read-Copy-Update} 34 34 One solution is to use the Read-Copy-Update pattern~\cite{wiki:rcu}. 35 In this pattern, resizing is done by creating a copy of the internal data structures, \eg see Figure~\ref{fig:base-ts2}, updating the copy with the desired changes, and then attempt an Indiana Jones Switch to replace the original with the copy.35 In this pattern, resizing is done by creating a copy of the internal data structures, \eg see Figure~\ref{fig:base-ts2}, updating the copy with the desired changes, and then attempting an Indiana Jones Switch to replace the original with the copy. 36 36 This approach has the advantage that it may not need any synchronization to do the switch. 37 37 However, there is a race where \procs still use the original data structure after the copy is switched. 38 This race not only requires adding a memory-reclamation scheme, it also requires that operations made on the stale original version are eventually moved to the copy.38 This race not only requires adding a memory-reclamation scheme, but it also requires that operations made on the stale original version are eventually moved to the copy. 39 39 40 40 Specifically, the original data structure must be kept until all \procs have witnessed the change. … … 42 42 If all operations need synchronization, then the overall cost of this technique is likely to be similar to an uncontended lock approach. 43 43 In addition to the classic challenge of memory reclamation, transferring the original data to the copy before reclaiming it poses additional challenges. 
44 Especially merging sub queues while having a minimal impact on fairness and locality.45 46 For example, given a linked -list, having a node enqueued onto the original and new list is not necessarily a problem depending on the chosen list structure.44 Especially merging sub-queues while having a minimal impact on fairness and locality. 45 46 For example, given a linked list, having a node enqueued onto the original and new list is not necessarily a problem depending on the chosen list structure. 47 47 If the list supports arbitrary insertions, then inconsistencies in the tail pointer do not break the list; 48 48 however, ordering may not be preserved. … … 55 55 A simpler approach is to use a \newterm{Readers-Writer Lock}~\cite{wiki:rwlock}, where the resizing requires acquiring the lock as a writer while simply enqueueing/dequeuing \ats requires acquiring the lock as a reader. 56 56 Using a Readers-Writer lock solves the problem of dynamically resizing and leaves the challenge of finding or building a lock with sufficient good read-side performance. 57 Since this approach is not a very complex challenge and an ad -hoc solution is perfectly acceptable, building a Readers-Writer lock was the path taken.57 Since this approach is not a very complex challenge and an ad hoc solution is perfectly acceptable, building a Readers-Writer lock was the path taken. 58 58 59 59 To maximize reader scalability, readers should not contend with each other when attempting to acquire and release a critical section. 60 To achieve this goal requires each reader tohave its own memory to mark as locked and unlocked.61 The read 62 The write acquires the global lock, guaranteeing mutual exclusion among writers, and then acquires each of the local reader locks.63 Acquiring all the local read 60 Achieving this goal requires that each reader have its own memory to mark as locked and unlocked. 61 The read-acquire possibly waits for a writer to finish the critical section and then acquires a reader's local spinlock. 62 The writer acquires the global lock, guaranteeing mutual exclusion among writers, and then acquires each of the local reader locks. 63 Acquiring all the local read-locks guarantees mutual exclusion among the readers and the writer, while the wait on the read side prevents readers from continuously starving the writer. 64 64 Figure~\ref{f:SpecializedReadersWriterLock} shows the outline for this specialized readers-writer lock. 65 65 The lock in nonblocking, so both readers and writers spin while the lock is held. 66 This very wide sharding strategy means that readers have very good locality , since they only ever need to access two memory location.66 This very wide sharding strategy means that readers have very good locality since they only ever need to access two memory locations. 67 67 68 68 \begin{figure} … … 98 98 \section{Idle-Sleep}\label{idlesleep} 99 99 While manual resizing of \procs is expected to be rare, the number of \ats can vary significantly over an application's lifetime, which means there are times when there are too few or too many \procs. 100 For this work, it is the program er's responsibility to manually create \procs, so if there are too few \procs, the application must address this issue.100 For this work, it is the programmer's responsibility to manually create \procs, so if there are too few \procs, the application must address this issue. 101 101 This leaves too many \procs when there are not enough \ats for all the \procs to be useful. 
102 102 These idle \procs cannot be removed because their lifetime is controlled by the application, and only the application knows when the number of \ats may increase or decrease. … … 108 108 Because idle sleep is spurious, this data structure has strict performance requirements, in addition to strict correctness requirements. 109 109 Next, some mechanism is needed to block \glspl{kthrd}, \eg @pthread_cond_wait@ on a pthread semaphore. 110 The complexity here is to support \at parking and unparking, user-level locking, timers, \io operations, and all other \CFA features with minimal complexity.110 The complexity here is to support \at \glslink{atblock}{parking} and \glslink{atsched}{unparking}, user-level locking, timers, \io operations, and all other \CFA features with minimal complexity. 111 111 Finally, the scheduler needs a heuristic to determine when to block and unblock an appropriate number of \procs. 112 112 However, this third challenge is outside the scope of this thesis because developing a general heuristic is complex enough to justify its own work. 113 113 Therefore, the \CFA scheduler simply follows the ``Race-to-Idle''~\cite{Albers12} approach where a sleeping \proc is woken any time a \at becomes ready and \procs go to idle sleep anytime they run out of work. 114 114 115 An interesting sub -part of this heuristic is what to do with bursts of \ats that become ready.116 Since waking up a sleeping \proc can have notable latency, it is possible multiple \atsbecome ready while a single \proc is waking up.117 This fact sbegs the question, if many \procs are available, how many should be woken?118 If the ready \ats will run longer than the wake-up latency, waking one \proc per \at will offer maximum paralleli sation.119 If the ready \ats will run for a shortvery short time, waking many \procs may be wasteful.115 An interesting subpart of this heuristic is what to do with bursts of \ats that become ready. 116 Since waking up a sleeping \proc can have notable latency, multiple \ats may become ready while a single \proc is waking up. 117 This fact begs the question, if many \procs are available, how many should be woken? 118 If the ready \ats will run longer than the wake-up latency, waking one \proc per \at will offer maximum parallelization. 119 If the ready \ats will run for a very short time, waking many \procs may be wasteful. 120 120 As mentioned, a heuristic to handle these complex cases is outside the scope of this thesis, the behaviour of the scheduler in this particular case is left unspecified. 121 121 122 122 \section{Sleeping} 123 As usual, the corner -stone of any feature related to the kernel is the choice of system call.123 As usual, the cornerstone of any feature related to the kernel is the choice of system call. 124 124 In terms of blocking a \gls{kthrd} until some event occurs, the Linux kernel has many available options. 125 125 126 126 \subsection{\lstinline{pthread_mutex}/\lstinline{pthread_cond}} 127 The classic option is to use some combination of the pthread mutual exclusion and synchronization locks, allowing a safe park/unpark of a \gls{kthrd} to/from a @pthread_cond@.127 The classic option is to use some combination of the pthread mutual exclusion and synchronization locks, allowing a safe \park/\unpark of a \gls{kthrd} to/from a @pthread_cond@. 128 128 While this approach works for \glspl{kthrd} waiting among themselves, \io operations do not provide a mechanism to signal @pthread_cond@s. 
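A minimal, generic sketch of this classic approach, using hypothetical names (@kt_park@, @kt_unpark@) rather than the \CFA runtime's code, is:
\begin{cfa}
#include <pthread.h>
#include <stdbool.h>

// Generic park/unpark over a pthread mutex and condition;
// the flag prevents a lost wake-up when kt_unpark runs before kt_park.
struct parker {
	pthread_mutex_t lock;
	pthread_cond_t  cond;
	bool            notified;
};

static void kt_park( struct parker * p ) {
	pthread_mutex_lock( &p->lock );
	while ( ! p->notified ) pthread_cond_wait( &p->cond, &p->lock );
	p->notified = false;
	pthread_mutex_unlock( &p->lock );
}

static void kt_unpark( struct parker * p ) {
	pthread_mutex_lock( &p->lock );
	p->notified = true;
	pthread_cond_signal( &p->cond );
	pthread_mutex_unlock( &p->lock );
}
\end{cfa}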
129 For \io results to wake a \proc waiting on a @pthread_cond@ means a different \gls pl{kthrd} must be woken up first, which then signals the \proc.129 For \io results to wake a \proc waiting on a @pthread_cond@ means a different \gls{kthrd} must be woken up first, which then signals the \proc. 130 130 131 131 \subsection{\lstinline{io_uring} and Epoll} … … 137 137 138 138 \subsection{Event FDs} 139 Another interesting approach is to use an event file descriptor\cite{ eventfd}.139 Another interesting approach is to use an event file descriptor\cite{MAN:eventfd}. 140 140 This Linux feature is a file descriptor that behaves like \io, \ie, uses @read@ and @write@, but also behaves like a semaphore. 141 Indeed, all reads and writes must use aword-sized values, \ie 64 or 32 bits.142 Writes \emph{add} their values to a buffer using arithmetic addition versus buffer append, and reads zero 141 Indeed, all reads and writes must use word-sized values, \ie 64 or 32 bits. 142 Writes \emph{add} their values to a buffer using arithmetic addition versus buffer append, and reads zero-out the buffer and return the buffer values so far.\footnote{ 143 143 This behaviour is without the \lstinline{EFD_SEMAPHORE} flag, which changes the behaviour of \lstinline{read} but is not needed for this work.} 144 144 If a read is made while the buffer is already 0, the read blocks until a non-0 value is added. … … 148 148 149 149 \section{Tracking Sleepers} 150 Tracking which \procs are in idle sleep requires a data structure holding all the sleeping \procs, but more importantly it requires a concurrent \emph{handshake} so that no \at is stranded on a ready-queue with no active \proc.150 Tracking which \procs are in idle sleep requires a data structure holding all the sleeping \procs, but more importantly, it requires a concurrent \emph{handshake} so that no \at is stranded on a ready queue with no active \proc. 151 151 The classic challenge occurs when a \at is made ready while a \proc is going to sleep: there is a race where the new \at may not see the sleeping \proc and the sleeping \proc may not see the ready \at. 152 152 Since \ats can be made ready by timers, \io operations, or other events outside a cluster, this race can occur even if the \proc going to sleep is the only \proc awake. … … 154 154 155 155 The handshake closing the race is done with both the notifier and the idle \proc executing two ordered steps. 156 The notifier first make sure the newly ready \at is visible to \procs searching for \ats, and then attemptto notify an idle \proc.156 The notifier first makes sure the newly ready \at is visible to \procs searching for \ats, and then attempts to notify an idle \proc. 157 157 On the other side, \procs make themselves visible as idle \procs and then search for any \ats they may have missed. 158 158 Unlike regular work-stealing, this search must be exhaustive to make sure that pre-existing \at is missed. 159 159 These steps from both sides guarantee that if the search misses a newly ready \at, then the notifier is guaranteed to see at least one idle \proc. 160 Convers ly, if the notifier does not see any idle \proc, then a \proc is guaranteed to find the new \at in its exhaustive search.160 Conversely, if the notifier does not see any idle \proc, then a \proc is guaranteed to find the new \at in its exhaustive search. 161 161 162 162 Furthermore, the ``Race-to-Idle'' approach means that there may be contention on the data structure tracking sleepers. 
163 Contention can be tolerated for \procs attempting to sleep or wake -up because these \procs are not doing useful work, and therefore, not contributing to overall performance.163 Contention can be tolerated for \procs attempting to sleep or wake up because these \procs are not doing useful work, and therefore, not contributing to overall performance. 164 164 However, notifying, checking if a \proc must be woken-up, and doing so if needed, can significantly affect overall performance and must be low cost. 165 165 166 166 \subsection{Sleepers List} 167 167 Each cluster maintains a list of idle \procs, organized as a stack. 168 This ordering allows \procs at the tail to stay in idle sleep for extended period of times while those at the head of the list wake-up for bursts of activity.168 This ordering allows \procs at the tail to stay in idle sleep for extended periods while those at the head of the list wake up for bursts of activity. 169 169 Because of unbalanced performance requirements, the algorithm tracking sleepers is designed to have idle \procs handle as much of the work as possible. 170 170 The idle \procs maintain the stack of sleepers among themselves and notifying a sleeping \proc takes as little work as possible. … … 173 173 174 174 This approach also simplifies notification. 175 Indeed, \procs not only need to be notif y when a new \at is readied, but also mustbe notified during manual resizing, so the \gls{kthrd} can be joined.175 Indeed, \procs not only need to be notified when a new \at is readied, but must also be notified during manual resizing, so the \gls{kthrd} can be joined. 176 176 These requirements mean whichever entity removes idle \procs from the sleeper list must be able to do so in any order. 177 177 Using a simple lock over this data structure makes the removal much simpler than using a lock-free data structure. 178 The single lock also means the notification process simply needs to wake -up the desired idle \proc, using @pthread_cond_signal@, @write@ on an @fd@, \etc, and the \proc handles the rest.178 The single lock also means the notification process simply needs to wake up the desired idle \proc, using @pthread_cond_signal@, @write@ on an @fd@, \etc, and the \proc handles the rest. 179 179 180 180 \subsection{Reducing Latency} 181 As mentioned in this section, \procs going to sleep for extremely short periods of timeis likely in certain scenarios.182 Therefore, the latency of doing a system call to read from and writ ing to an event @fd@ can negatively affect overall performance in a notable way.181 As mentioned in this section, \procs going to sleep for extremely short periods is likely in certain scenarios. 182 Therefore, the latency of doing a system call to read from and write to an event @fd@ can negatively affect overall performance notably. 183 183 Hence, it is important to reduce latency and contention of the notification as much as possible. 184 184 Figure~\ref{fig:idle1} shows the basic idle-sleep data structure. … … 205 205 The woken \proc then updates the atomic pointer, while it is updating the head of the list, as it removes itself from the list. 206 206 Notifiers that obtained a @NULL@ in the exchange simply move on knowing that another notifier is already waking a \proc. 
207 This behaviour is equivalent to having multiple notifier write to the @fd@ since reads consume all previous writes.208 Note that with and without this atomic pointer, bursts of notification can lead to an unspecified number of \procs being woken up, depending on how the arrival notification compares with tthe latency of \procs waking up.207 This behaviour is equivalent to having multiple notifiers write to the @fd@ since reads consume all previous writes. 208 Note that with and without this atomic pointer, bursts of notification can lead to an unspecified number of \procs being woken up, depending on how the arrival notification compares with the latency of \procs waking up. 209 209 As mentioned in section~\ref{idlesleep}, there is no optimal approach to handle these bursts. 210 210 It is therefore difficult to justify the cost of any extra synchronization here. … … 218 218 219 219 The next optimization is to avoid the latency of the event @fd@, which can be done by adding what is effectively a binary benaphore\cite{schillings1996engineering} in front of the event @fd@. 220 The benaphore over the event @fd@ logically provides a three state flag to avoid unnecessary system calls, where the states are expressed explicitin Figure~\ref{fig:idle:state}.221 A \proc begins its idle sleep by adding itself to the idle list before searching for a n\at.220 The benaphore over the event @fd@ logically provides a three-state flag to avoid unnecessary system calls, where the states are expressed explicitly in Figure~\ref{fig:idle:state}. 221 A \proc begins its idle sleep by adding itself to the idle list before searching for a \at. 222 222 In the process of adding itself to the idle list, it sets the state flag to @SEARCH@. 223 223 If no \ats can be found during the search, the \proc then confirms it is going to sleep by atomically swapping the state to @SLEEP@. 224 224 If the previous state is still @SEARCH@, then the \proc does read the event @fd@. 225 Meanwhile, notifiers atomically exchange the state to @AWAKE@ state.225 Meanwhile, notifiers atomically exchange the state to the @AWAKE@ state. 226 226 If the previous state is @SLEEP@, then the notifier must write to the event @fd@. 227 227 However, if the notify arrives almost immediately after the \proc marks itself idle, then both reads and writes on the event @fd@ can be omitted, which reduces latency notably. 228 These extensions lead sto the final data structure shown in Figure~\ref{fig:idle}.228 These extensions lead to the final data structure shown in Figure~\ref{fig:idle}. 229 229 230 230 \begin{figure} 231 231 \centering 232 232 \input{idle_state.pstex_t} 233 \caption[Improved Idle-Sleep Latency]{Improved Idle-Sleep Latency \smallskip\newline A three 233 \caption[Improved Idle-Sleep Latency]{Improved Idle-Sleep Latency \smallskip\newline A three-state flag is added to the event \lstinline{fd}.} 234 234 \label{fig:idle:state} 235 235 \end{figure} -
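The three-state protocol over the event @fd@ can be sketched as follows; the helper names (@idle_sleep@, @idle_notify@) and the reduced structure are assumptions of this sketch, which omits the idle list and error handling and shows only the flag/event-@fd@ interaction.
\begin{cfa}
#include <stdint.h>
#include <unistd.h>
#include <sys/eventfd.h>

// Illustrative three-state benaphore in front of an event fd.
enum idle_state { SEARCH, SLEEP, AWAKE };

struct idle_proc {
	volatile enum idle_state state;
	int evfd;                        // from eventfd(0, 0)
};

// proc side: called after registering on the idle list and failing to find work
static void idle_sleep( struct idle_proc * p ) {
	enum idle_state expected = SEARCH;
	// confirm the intent to sleep; a concurrent notifier may have flipped the state already
	if ( __atomic_compare_exchange_n( &p->state, &expected, SLEEP, false,
	                                  __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST ) ) {
		uint64_t val;
		read( p->evfd, &val, sizeof(val) );  // block only if no notification has arrived
	}
	p->state = SEARCH;                       // simplified reset; omits removal from the idle list
}

// notifier side: called after making a new task visible
static void idle_notify( struct idle_proc * p ) {
	enum idle_state prev = __atomic_exchange_n( &p->state, AWAKE, __ATOMIC_SEQ_CST );
	if ( prev == SLEEP ) {                   // only pay for the system call if the proc actually slept
		uint64_t one = 1;
		write( p->evfd, &one, sizeof(one) );
	}
}
\end{cfa}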
doc/theses/thierry_delisle_PhD/thesis/text/runtime.tex
rebf8ca5 r23a08aa0 4 4 \section{C Threading} 5 5 6 \Celeven introduced threading features, such the @_Thread_local@ storage class, and libraries @stdatomic.h@ and @threads.h@.6 \Celeven introduced threading features, such as the @_Thread_local@ storage class, and libraries @stdatomic.h@ and @threads.h@. 7 7 Interestingly, almost a decade after the \Celeven standard, the most recent versions of gcc, clang, and msvc do not support the \Celeven include @threads.h@, indicating no interest in the C11 concurrency approach (possibly because of the recent effort to add concurrency to \CC). 8 8 While the \Celeven standard does not state a threading model, the historical association with pthreads suggests implementations would adopt kernel-level threading (1:1)~\cite{ThreadModel}, as for \CC. … … 13 13 \section{M:N Threading}\label{prev:model} 14 14 15 Threading in \CFA is based on \Gls{uthrding}, where \ glspl{thrd} are the representation of a unit of work. As such, \CFA programmers should expect these units to be fairly inexpensive, \ie programmers should be able to create a large number of \glspl{thrd} and switch among \glspl{thrd} liberally without many concerns for performance.15 Threading in \CFA is based on \Gls{uthrding}, where \ats are the representation of a unit of work. As such, \CFA programmers should expect these units to be fairly inexpensive, \ie programmers should be able to create a large number of \ats and switch among \ats liberally without many performance concerns. 16 16 17 The \CFA M:N threading model sis implemented using many user-level threads mapped onto fewer \glspl{kthrd}.17 The \CFA M:N threading model is implemented using many user-level threads mapped onto fewer \glspl{kthrd}. 18 18 The user-level threads have the same semantic meaning as a \glspl{kthrd} in the 1:1 model: they represent an independent thread of execution with its own stack. 19 The difference is that user-level threads do not have a corresponding object in the kernel; they are handled by the runtime in user space and scheduled onto \glspl{kthrd}, referred to as \glspl{proc} in this document. \Glspl{proc} run a \ gls{thrd} until it context switches out, it then chooses a different \gls{thrd}to run.19 The difference is that user-level threads do not have a corresponding object in the kernel; they are handled by the runtime in user space and scheduled onto \glspl{kthrd}, referred to as \glspl{proc} in this document. \Glspl{proc} run a \at until it context switches out, it then chooses a different \at to run. 20 20 21 21 \section{Clusters} 22 22 \CFA allows the option to group user-level threading, in the form of clusters. 23 Both \ glspl{thrd}and \glspl{proc} belong to a specific cluster.24 \Glspl{ thrd} are only scheduled onto \glspl{proc} in the same cluster and scheduling is done independently of other clusters.23 Both \ats and \glspl{proc} belong to a specific cluster. 24 \Glspl{at} are only scheduled onto \glspl{proc} in the same cluster and scheduling is done independently of other clusters. 25 25 Figure~\ref{fig:system} shows an overview of the \CFA runtime, which allows programmers to tightly control parallelism. 26 26 It also opens the door to handling effects like NUMA, by pinning clusters to a specific NUMA node\footnote{This capability is not currently implemented in \CFA, but the only hurdle left is creating a generic interface for CPU masks.}. 
… … 30 30 \input{system.pstex_t} 31 31 \end{center} 32 \caption[Overview of the \CFA runtime]{Overview of the \CFA runtime \newline \Glspl{ thrd} are scheduled inside a particular cluster and run on the \glspl{proc} that belong to the cluster. The discrete-event manager, which handles preemption and timeout, is a \gls{proc} that lives outside any cluster and does not run \glspl{thrd}.}32 \caption[Overview of the \CFA runtime]{Overview of the \CFA runtime \newline \Glspl{at} are scheduled inside a particular cluster and run on the \glspl{proc} that belong to the cluster. The discrete-event manager, which handles preemption and timeout, is a \gls{proc} that lives outside any cluster and does not run \ats.} 33 33 \label{fig:system} 34 34 \end{figure} … … 38 38 39 39 \section{\glsxtrshort{io}}\label{prev:io} 40 Prior to this work, the \CFA runtime did not add any particular support for \glsxtrshort{io} operations. While all \glsxtrshort{io} operations available in C are available in \CFA, \glsxtrshort{io} operations are designed for the POSIX threading model~\cite{pthreads}. Using these 1:1 threading operations in an M:N threading model means \glsxtrshort{io} operations block \glspl{proc} instead of \ glspl{thrd}. While this can work in certain cases, it limits the number of concurrent operations to the number of \glspl{proc} rather than \glspl{thrd}. It also means deadlock can occur because all \glspl{proc} are blocked even if at least one \gls{thrd}is ready to run. A simple example of this type of deadlock would be as follows:40 Prior to this work, the \CFA runtime did not add any particular support for \glsxtrshort{io} operations. While all \glsxtrshort{io} operations available in C are available in \CFA, \glsxtrshort{io} operations are designed for the POSIX threading model~\cite{pthreads}. Using these 1:1 threading operations in an M:N threading model means \glsxtrshort{io} operations block \glspl{proc} instead of \ats. While this can work in certain cases, it limits the number of concurrent operations to the number of \glspl{proc} rather than \ats. It also means deadlock can occur because all \glspl{proc} are blocked even if at least one \at is ready to run. A simple example of this type of deadlock would be as follows: 41 41 42 42 \begin{quote} 43 Given a simple network program with 2 \ glspl{thrd} and a single \gls{proc}, one \gls{thrd} sends network requests to a server and the other \gls{thrd}waits for a response from the server.44 If the second \ gls{thrd}races ahead, it may wait for responses to requests that have not been sent yet.45 In theory, this should not be a problem, even if the second \ gls{thrd} waits, because the first \gls{thrd}is still ready to run and should be able to get CPU time to send the request.46 With M:N threading, while the first \ gls{thrd} is ready, the lone \gls{proc} \emph{cannot} run the first \gls{thrd} if it is blocked in the \glsxtrshort{io} operation of the second \gls{thrd}.47 If this happen , the system is in a synchronization deadlock\footnote{In this example, the deadlock could be resolved if the server sends unprompted messages to the client.43 Given a simple network program with 2 \ats and a single \gls{proc}, one \at sends network requests to a server and the other \at waits for a response from the server. 44 If the second \at races ahead, it may wait for responses to requests that have not been sent yet. 
45 In theory, this should not be a problem, even if the second \at waits, because the first \at is still ready to run and should be able to get CPU time to send the request. 46 With M:N threading, while the first \at is ready, the lone \gls{proc} \emph{cannot} run the first \at if it is blocked in the \glsxtrshort{io} operation of the second \at. 47 If this happens, the system is in a synchronization deadlock\footnote{In this example, the deadlock could be resolved if the server sends unprompted messages to the client. 48 48 However, this solution is neither general nor appropriate even in this simple case.}. 49 49 \end{quote} 50 50 51 Therefore, one of the objective of this work is to introduce \emph{User-Level \glsxtrshort{io}}, which like \glslink{uthrding}{User-Level \emph{Threading}}, blocks \glspl{thrd} rather than \glspl{proc} when doing \glsxtrshort{io} operations.52 This feature entails multiplexing the \glsxtrshort{io} operations of many \ glspl{thrd}onto fewer \glspl{proc}.51 Therefore, one of the objectives of this work is to introduce \emph{User-Level \glsxtrshort{io}}, which like \glslink{uthrding}{User-Level \emph{Threading}}, blocks \ats rather than \glspl{proc} when doing \glsxtrshort{io} operations. 52 This feature entails multiplexing the \glsxtrshort{io} operations of many \ats onto fewer \glspl{proc}. 53 53 The multiplexing requires a single \gls{proc} to execute multiple \glsxtrshort{io} operations in parallel. 54 54 This requirement cannot be done with operations that block \glspl{proc}, \ie \glspl{kthrd}, since the first operation would prevent starting new operations for its blocking duration. … … 60 60 All functions defined by this volume of POSIX.1-2017 shall be thread-safe, except that the following functions need not be thread-safe. ... (list of 70+ excluded functions) 61 61 \end{quote} 62 Only UNIX @man@ pages identify whether or not a library function is threadsafe, and hence, may block on a pthreads lock or system call; hence interoperability with UNIX library functions is a challenge for an M:N threading model.62 Only UNIX @man@ pages identify whether a library function is thread-safe, and hence, may block on a pthreads lock or system call; hence interoperability with UNIX library functions is a challenge for an M:N threading model. 63 63 64 64 Languages like Go and Java, which have strict interoperability with C\cite{wiki:jni,go:cgo}, can control operations in C by ``sandboxing'' them, \eg a blocking function may be delegated to a \gls{kthrd}. Sandboxing may help towards guaranteeing that the kind of deadlock mentioned above does not occur. … … 72 72 Therefore, it is possible calls to an unknown library function can block a \gls{kthrd} leading to deadlocks in \CFA's M:N threading model, which would not occur in a traditional 1:1 threading model. 73 73 Currently, all M:N thread systems interacting with UNIX without sandboxing suffer from this problem but manage to work very well in the majority of applications. 74 Therefore, a complete solution to this problem is outside the scope of this thesis.\footnote{\CFA does provide a pthreads emulation, so any library function using embedded pthreads locks areredirected to \CFA user-level locks. This capability further reduces the chances of blocking a \gls{kthrd}.}74 Therefore, a complete solution to this problem is outside the scope of this thesis.\footnote{\CFA does provide a pthreads emulation, so any library function using embedded pthreads locks is redirected to \CFA user-level locks. 
This capability further reduces the chances of blocking a \gls{kthrd}.} -
doc/theses/thierry_delisle_PhD/thesis/thesis.tex
rebf8ca5 r23a08aa0 84 84 \usepackage{subcaption} 85 85 \usepackage{comment} % Removes large sections of the document. 86 \usepackage{array} 86 87 87 88 % Hyperlinks make it very easy to navigate an electronic document. … … 210 211 \newcommand\proc{\gls{proc}\xspace}% 211 212 \newcommand\procs{\glspl{proc}\xspace}% 213 \newcommand\park{\glslink{atblock}{park}\xspace}% 214 \newcommand\unpark{\glslink{atsched}{unpark}\xspace}% 212 215 213 216 %====================================================================== -
driver/Makefile.am
rebf8ca5 r23a08aa0 19 19 20 20 # applies to both programs 21 AM_CXXFLAGS = @HOST_FLAGS@ -Wall -Wextra -Werror=return-type -O2 -g -std=c++14 -I${abs_top_srcdir}/src -I${abs_top_srcdir}/src/include 21 AM_CXXFLAGS = @HOST_FLAGS@ -Wall -Wextra -Werror=return-type -O2 -g -std=c++17 -I${abs_top_srcdir}/src -I${abs_top_srcdir}/src/include 22 22 23 23 # don't install cfa directly -
driver/cfa.cc
rebf8ca5 r23a08aa0 53 53 return arg.substr( 0, pre.size() ) == pre; 54 54 } // prefix 55 56 static inline bool ends_with(const string & str, const string & sfix) {57 if (sfix.size() > str.size()) return false;58 return std::equal(str.rbegin(), str.rbegin() + sfix.size(), sfix.rbegin(), sfix.rend());59 }60 55 61 56 // check if string has suffix -
libcfa/prelude/Makefile.am
rebf8ca5 r23a08aa0 50 50 51 51 prelude.cfa : prelude-gen.cc 52 ${AM_V_GEN}${CXX} ${AM_CXXFLAGS} ${CXXFLAGS} ${AM_CFLAGS} ${<} -o prelude-gen -Wall -Wextra -O2 -g -std=c++14 52 ${AM_V_GEN}${CXX} ${AM_CXXFLAGS} ${CXXFLAGS} ${AM_CFLAGS} ${<} -o prelude-gen -Wall -Wextra -O2 -g -std=c++17 53 53 @./prelude-gen > ${@} 54 54 @rm ./prelude-gen … … 76 76 77 77 if ENABLE_DISTCC 78 distribution: @LOCAL_CFACC@ @LOCAL_CC1@ @CFACPP@ gcc-builtins.cf builtins.cf extras.cf prelude.cfa bootloader.c $(srcdir)/../../tools/build/push2dist.sh 78 distribution: @LOCAL_CFACC@ @LOCAL_CC1@ @CFACPP@ defines.hfa gcc-builtins.cf builtins.cf extras.cf prelude.cfa bootloader.c $(srcdir)/../../tools/build/push2dist.sh 79 79 ${AM_V_GEN}$(srcdir)/../../tools/build/push2dist.sh @CFADIR_HASH@ @DIST_BWLIMIT@ 80 80 @echo "Dummy file to track distribution to remote hosts" > ${@} -
libcfa/prelude/defines.hfa.in
rebf8ca5 r23a08aa0 141 141 142 142 /* Defined if io_uring support is present when compiling libcfathread and 143 supports the flag IORING_REGISTER_IOWQ_MAX_WORKERS. */ 144 #undef CFA_HAVE_IORING_REGISTER_IOWQ_MAX_WORKERS 145 146 /* Defined if io_uring support is present when compiling libcfathread and 143 147 supports the flag IORING_SETUP_ATTACH_WQ. */ 144 148 #undef CFA_HAVE_IORING_SETUP_ATTACH_WQ -
libcfa/src/Makefile.am
rebf8ca5 r23a08aa0 186 186 if ENABLE_DISTCC 187 187 188 ../prelude/distribution: @LOCAL_CFACC@ @LOCAL_CC1@ @CFACPP@ ../prelude/ gcc-builtins.cf ../prelude/builtins.cf ../prelude/extras.cf ../prelude/prelude.cfa ../prelude/bootloader.c $(srcdir)/../../tools/build/push2dist.sh188 ../prelude/distribution: @LOCAL_CFACC@ @LOCAL_CC1@ @CFACPP@ ../prelude/defines.hfa ../prelude/gcc-builtins.cf ../prelude/builtins.cf ../prelude/extras.cf ../prelude/prelude.cfa ../prelude/bootloader.c $(srcdir)/../../tools/build/push2dist.sh 189 189 @+make -C ../prelude distribution 190 190 -
libcfa/src/bits/defs.hfa
rebf8ca5 r23a08aa0 24 24 #define likely(x) __builtin_expect(!!(x), 1) 25 25 #define unlikely(x) __builtin_expect(!!(x), 0) 26 #define thread_local _Thread_local27 26 28 27 typedef void (*fptr_t)(); … … 37 36 #endif 38 37 38 39 #if defined(__has_attribute) 40 #if !__has_attribute(__noclone__) 41 #define ATTRIBUTE_NOCLONE 42 #endif 43 #endif 44 #ifndef ATTRIBUTE_NOCLONE 45 #define ATTRIBUTE_NOCLONE __attribute__((__noclone__)) 46 #endif 47 39 48 #define libcfa_public __attribute__((visibility("default"))) 49 #define libcfa_nopreempt __attribute__((section("cfatext_nopreempt"))) __attribute__((__noinline__)) ATTRIBUTE_NOCLONE 50 51 struct __cfa_nopreempt_region { 52 void * start; 53 void * stop; 54 }; 40 55 41 56 #ifdef __cforall -
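A note on the technique used above: functions are placed in a named ELF section and the section is bounded by the linker-generated __start_<section>/__stop_<section> symbols. A self-contained illustration, with a hypothetical section name (mysec_nopreempt), might look as follows; it is only a sketch of the mechanism GNU ld provides, not CFA code.

// Minimal illustration of bounding a set of functions with a named ELF section.
// GNU ld automatically provides __start_<name> and __stop_<name> for sections
// whose names are valid C identifiers.
#include <stdbool.h>
#include <stdio.h>

#define NOPREEMPT __attribute__((section("mysec_nopreempt"), noinline))

NOPREEMPT static int sensitive( int x ) { return x + 1; }

extern const char __start_mysec_nopreempt[];
extern const char __stop_mysec_nopreempt[];

// Range test over the protected section, similar in spirit to __cfaabi_in() above.
static bool in_nopreempt( const void * ip ) {
	const char * p = (const char *)ip;
	return p >= __start_mysec_nopreempt && p < __stop_mysec_nopreempt;
}

int main( void ) {
	printf( "%d %d\n", sensitive( 41 ), in_nopreempt( (const void *)&sensitive ) );
	return 0;
}

The __start_/__stop_ symbols are only emitted for section names that are valid C identifiers, which is one reason such section names avoid dots.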
libcfa/src/bits/locks.hfa
rebf8ca5 r23a08aa0 13 13 // Created On : Tue Oct 31 15:14:38 2017 14 14 // Last Modified By : Peter A. Buhr 15 // Last Modified On : Sat Aug 27 15:06:39 2022 16 // Update Count : 15 15 // Last Modified On : Mon Sep 19 18:39:45 2022 16 // Update Count : 16 17 17 //
libcfa/src/concurrency/io/call.cfa.in
rebf8ca5 r23a08aa0 202 202 struct io_context$ * ctx = cfa_io_allocate( &sqe, &idx, 1 ); 203 203 204 memset(sqe, 0, sizeof(*sqe)); 204 205 sqe->opcode = IORING_OP_{op}; 206 sqe->flags = sflags; 205 207 sqe->user_data = (uintptr_t)&future; 206 sqe->flags = sflags; 207 sqe->ioprio = 0; 208 sqe->fd = 0; 209 sqe->off = 0; 210 sqe->addr = 0; 211 sqe->len = 0; 212 sqe->fsync_flags = 0; 213 sqe->__pad2[0] = 0; 214 sqe->__pad2[1] = 0; 215 sqe->__pad2[2] = 0;{body} 208 {body} 216 209 217 210 asm volatile("": : :"memory"); -
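The hunk above replaces field-by-field zeroing with a single memset followed by setting only the fields the request needs. For reference, the same pattern in plain C against the kernel's io_uring_sqe layout looks roughly like this; prep_read is a made-up helper name (analogous to liburing's io_uring_prep_read), not part of the runtime.

#include <string.h>
#include <stdint.h>
#include <linux/io_uring.h>

// Zero the whole SQE once, then set only the fields the operation needs.
static void prep_read( struct io_uring_sqe * sqe, int fd, void * buf,
                       unsigned nbytes, uint64_t offset, uint64_t user_data ) {
	memset( sqe, 0, sizeof(*sqe) );            // all flags and padding start cleared
	sqe->opcode    = IORING_OP_READ;
	sqe->fd        = fd;
	sqe->off       = offset;
	sqe->addr      = (uint64_t)(uintptr_t)buf;
	sqe->len       = nbytes;
	sqe->user_data = user_data;                // echoed back in the matching CQE
}

Zeroing the whole structure once also keeps the preparation correct as newer kernels add SQE fields.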
libcfa/src/concurrency/io/setup.cfa
rebf8ca5 r23a08aa0 228 228 229 229 #if !defined(CFA_WITH_IO_URING_IDLE) 230 { 230 231 // Step 4 : eventfd 231 232 __cfadbg_print_safe(io_core, "Kernel I/O : registering %d for completion with ring %d\n", procfd, fd); … … 237 238 238 239 __cfadbg_print_safe(io_core, "Kernel I/O : registered %d for completion with ring %d\n", procfd, fd); 239 #endif 240 240 } 241 #endif 242 243 // TODO: implement a proper version of this. 244 // I have not found a better maximum that works in general but users should be able to configure it 245 // the same way they configure other I/O options 241 246 // #if defined(CFA_HAVE_IORING_REGISTER_IOWQ_MAX_WORKERS) 247 // { 242 248 // // Step 5 : max worker count 243 249 // __cfadbg_print_safe(io_core, "Kernel I/O : lmiting max workers for ring %d\n", fd); … … 252 258 253 259 // __cfadbg_print_safe(io_core, "Kernel I/O : lmited max workers for ring %d\n", fd); 260 // } 254 261 // #endif 255 262 -
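For reference, the two registration steps mentioned above map onto the io_uring_register system call roughly as sketched below; the wrapper names are invented, error handling is omitted, and IORING_REGISTER_IOWQ_MAX_WORKERS is only available on kernels new enough to define it, hence the feature guard in the build.

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

// Register an eventfd so completions on the ring signal it.
static int ring_register_eventfd( int ring_fd, int event_fd ) {
	return syscall( __NR_io_uring_register, ring_fd,
	                IORING_REGISTER_EVENTFD, &event_fd, 1 );
}

// Cap the number of io-wq workers the ring may spawn (kernel 5.15+).
static int ring_limit_workers( int ring_fd, unsigned bounded, unsigned unbounded ) {
	unsigned counts[2] = { bounded, unbounded };  // [0] = bounded, [1] = unbounded workers
	return syscall( __NR_io_uring_register, ring_fd,
	                IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2 );
}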
libcfa/src/concurrency/kernel/cluster.hfa
rebf8ca5 r23a08aa0 63 63 } 64 64 } 65 return (max + 2 * max) / 2;65 return 8 * max; 66 66 } 67 67 -
libcfa/src/concurrency/kernel/fwd.hfa
rebf8ca5 r23a08aa0 35 35 extern "C" { 36 36 extern "Cforall" { 37 extern __attribute__((aligned(64))) thread_localstruct KernelThreadData {37 extern __attribute__((aligned(64))) __thread struct KernelThreadData { 38 38 struct thread$ * volatile this_thread; 39 39 struct processor * volatile this_processor; … … 179 179 // Similar to a binary semaphore with a 'one shot' semantic 180 180 // is expected to be discarded after each party call their side 181 enum(struct thread$ *) { oneshot_ARMED = 0p, oneshot_FULFILLED = 1p }; 181 182 struct oneshot { 182 183 // Internal state : 183 // 0p : is initial state (wait will block)184 // 1p : fulfilled (wait won't block)184 // armed : initial state, wait will block 185 // fulfilled : wait won't block 185 186 // any thread : a thread is currently waiting 186 187 struct thread$ * volatile ptr; … … 189 190 static inline { 190 191 void ?{}(oneshot & this) { 191 this.ptr = 0p;192 this.ptr = oneshot_ARMED; 192 193 } 193 194 … … 199 200 for() { 200 201 struct thread$ * expected = this.ptr; 201 if(expected == 1p) return false;202 if(expected == oneshot_FULFILLED) return false; 202 203 if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) { 203 204 park(); 204 /* paranoid */ verify( this.ptr == 1p);205 /* paranoid */ verify( this.ptr == oneshot_FULFILLED ); 205 206 return true; 206 207 } … … 211 212 // return true if a thread was unparked 212 213 thread$ * post(oneshot & this, bool do_unpark = true) { 213 struct thread$ * got = __atomic_exchange_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);214 if( got == 0p || got == 1p) return 0p;214 struct thread$ * got = __atomic_exchange_n( &this.ptr, oneshot_FULFILLED, __ATOMIC_SEQ_CST); 215 if( got == oneshot_ARMED || got == oneshot_FULFILLED ) return 0p; 215 216 if(do_unpark) unpark( got ); 216 217 return got; … … 223 224 // thread on "any of" [a given set of] futures. 224 225 // does not support multiple threads waiting on the same future 226 enum(struct oneshot *) { future_ARMED = 0p, future_FULFILLED = 1p, future_PROGRESS = 2p, future_ABANDONED = 3p }; 225 227 struct future_t { 226 228 // Internal state : 227 // 0p : is initial state (wait will block)228 // 1p : fulfilled (wait won't block)229 // 2p : in progress ()230 // 3p : abandoned, server should delete229 // armed : initial state, wait will block 230 // fulfilled : result is ready, wait won't block 231 // progress : someone else is in the process of fulfilling this 232 // abandoned : client no longer cares, server should delete 231 233 // any oneshot : a context has been setup to wait, a thread could wait on it 232 234 struct oneshot * volatile ptr; … … 235 237 static inline { 236 238 void ?{}(future_t & this) { 237 this.ptr = 0p;239 this.ptr = future_ARMED; 238 240 } 239 241 … … 242 244 void reset(future_t & this) { 243 245 // needs to be in 0p or 1p 244 __atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);246 __atomic_exchange_n( &this.ptr, future_ARMED, __ATOMIC_SEQ_CST); 245 247 } 246 248 247 249 // check if the future is available 248 250 bool available( future_t & this ) { 249 while( this.ptr == 2p) Pause();250 return this.ptr == 1p;251 while( this.ptr == future_PROGRESS ) Pause(); 252 return this.ptr == future_FULFILLED; 251 253 } 252 254 … … 254 256 // intented to be use by wait, wait_any, waitfor, etc. 
rather than used directly 255 257 bool setup( future_t & this, oneshot & wait_ctx ) { 256 /* paranoid */ verify( wait_ctx.ptr == 0p || wait_ctx.ptr == 1p);258 /* paranoid */ verify( wait_ctx.ptr == oneshot_ARMED || wait_ctx.ptr == oneshot_FULFILLED ); 257 259 // The future needs to set the wait context 258 260 for() { 259 261 struct oneshot * expected = this.ptr; 260 262 // Is the future already fulfilled? 261 if(expected == 1p) return false; // Yes, just return false (didn't block)263 if(expected == future_FULFILLED) return false; // Yes, just return false (didn't block) 262 264 263 265 // The future is not fulfilled, try to setup the wait context … … 277 279 278 280 // attempt to remove the context so it doesn't get consumed. 279 if(__atomic_compare_exchange_n( &this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {281 if(__atomic_compare_exchange_n( &this.ptr, &expected, future_ARMED, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) { 280 282 // we still have the original context, then no one else saw it 281 283 return false; 282 284 } 283 285 284 // expected == 0p: future was never actually setup, just return285 if( expected == 0p) return false;286 287 // expected == 1p: the future is ready and the context was fully consumed286 // expected == ARMED: future was never actually setup, just return 287 if( expected == future_ARMED ) return false; 288 289 // expected == FULFILLED: the future is ready and the context was fully consumed 288 290 // the server won't use the pointer again 289 291 // It is safe to delete (which could happen after the return) 290 if( expected == 1p) return true;291 292 // expected == 2p: the future is ready but the context hasn't fully been consumed292 if( expected == future_FULFILLED ) return true; 293 294 // expected == PROGRESS: the future is ready but the context hasn't fully been consumed 293 295 // spin until it is safe to move on 294 if( expected == 2p) {295 while( this.ptr != 1p) Pause();296 /* paranoid */ verify( this.ptr == 1p);296 if( expected == future_PROGRESS ) { 297 while( this.ptr != future_FULFILLED ) Pause(); 298 /* paranoid */ verify( this.ptr == future_FULFILLED ); 297 299 return true; 298 300 } … … 305 307 // Mark the future as abandoned, meaning it will be deleted by the server 306 308 bool abandon( future_t & this ) { 307 /* paranoid */ verify( this.ptr != 3p);309 /* paranoid */ verify( this.ptr != future_ABANDONED ); 308 310 309 311 // Mark the future as abandonned 310 struct oneshot * got = __atomic_exchange_n( &this.ptr, 3p, __ATOMIC_SEQ_CST);312 struct oneshot * got = __atomic_exchange_n( &this.ptr, future_ABANDONED, __ATOMIC_SEQ_CST); 311 313 312 314 // If the future isn't already fulfilled, let the server delete it 313 if( got == 0p) return false;314 315 // got == 2p: the future is ready but the context hasn't fully been consumed315 if( got == future_ARMED ) return false; 316 317 // got == PROGRESS: the future is ready but the context hasn't fully been consumed 316 318 // spin until it is safe to move on 317 if( got == 2p) {318 while( this.ptr != 1p) Pause();319 got = 1p;319 if( got == future_PROGRESS ) { 320 while( this.ptr != future_FULFILLED ) Pause(); 321 got = future_FULFILLED; 320 322 } 321 323 322 324 // The future is completed delete it now 323 /* paranoid */ verify( this.ptr != 1p);325 /* paranoid */ verify( this.ptr != future_FULFILLED ); 324 326 free( &this ); 325 327 return true; … … 336 338 #pragma GCC diagnostic ignored "-Wfree-nonheap-object" 337 339 #endif 338 if( expected == 3p) { free( &this ); return 0p; 
}340 if( expected == future_ABANDONED ) { free( &this ); return 0p; } 339 341 #if defined(__GNUC__) && __GNUC__ >= 7 340 342 #pragma GCC diagnostic pop 341 343 #endif 342 344 343 /* paranoid */ verify( expected != 1p); // Future is already fulfilled, should not happen344 /* paranoid */ verify( expected != 2p); // Future is bein fulfilled by someone else, this is even less supported then the previous case.345 /* paranoid */ verify( expected != future_FULFILLED ); // Future is already fulfilled, should not happen 346 /* paranoid */ verify( expected != future_PROGRESS ); // Future is bein fulfilled by someone else, this is even less supported then the previous case. 345 347 346 348 // If there is a wait context, we need to consume it and mark it as consumed after 347 349 // If there is no context then we can skip the in progress phase 348 struct oneshot * want = expected == 0p ? 1p : 2p;350 struct oneshot * want = expected == future_ARMED ? future_FULFILLED : future_PROGRESS; 349 351 if(__atomic_compare_exchange_n(&this.ptr, &expected, want, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) { 350 if( expected == 0p) { return 0p; }352 if( expected == future_ARMED ) { return 0p; } 351 353 thread$ * ret = post( *expected, do_unpark ); 352 __atomic_store_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);354 __atomic_store_n( &this.ptr, future_FULFILLED, __ATOMIC_SEQ_CST); 353 355 return ret; 354 356 } … … 366 368 367 369 // Wait for the future to tru 368 while( this.ptr == 2p) Pause();370 while( this.ptr == future_PROGRESS ) Pause(); 369 371 // Make sure the state makes sense 370 372 // Should be fulfilled, could be in progress but it's out of date if so … … 372 374 // and the oneshot should not be needed any more 373 375 __attribute__((unused)) struct oneshot * was = this.ptr; 374 /* paranoid */ verifyf( was == 1p, "Expected this.ptr to be 1p, was %p\n", was );376 /* paranoid */ verifyf( was == future_FULFILLED, "Expected this.ptr to be 1p, was %p\n", was ); 375 377 376 378 // Mark the future as fulfilled, to be consistent -
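The named states introduced above replace the raw 0p/1p/2p/3p sentinels with a typed enumeration. In plain C, the same sentinel-pointer idiom would be spelled roughly as follows; the names are illustrative, not the runtime's.

#include <stdbool.h>

// A few small, never-valid addresses name the special states;
// any other value is a pointer to a waiting context.
struct waiter;                                  // opaque: a real waiting thread/context

#define FUT_ARMED     ((struct waiter *)0)      // wait will block
#define FUT_FULFILLED ((struct waiter *)1)      // result ready, wait returns immediately
#define FUT_PROGRESS  ((struct waiter *)2)      // fulfilment in progress by another thread
#define FUT_ABANDONED ((struct waiter *)3)      // client gave up, server frees the future

struct future {
	struct waiter * volatile ptr;
};

// Is the result available? Spin through the transient PROGRESS state.
static bool future_available( struct future * f ) {
	struct waiter * s;
	while ( (s = __atomic_load_n( &f->ptr, __ATOMIC_SEQ_CST )) == FUT_PROGRESS ) {}
	return s == FUT_FULFILLED;
}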
libcfa/src/concurrency/kernel/private.hfa
rebf8ca5 r23a08aa0 88 88 #elif defined(CFA_HAVE_LINUX_RSEQ_H) 89 89 extern "Cforall" { 90 extern __attribute__((aligned(64))) thread_localvolatile struct rseq __cfaabi_rseq;90 extern __attribute__((aligned(64))) __thread volatile struct rseq __cfaabi_rseq; 91 91 } 92 92 #else … … 161 161 // Blocking acquire 162 162 static inline void __atomic_acquire(volatile bool * ll) { 163 /* paranoid */ verify( ! __preemption_enabled() ); 164 /* paranoid */ verify(ll); 165 163 166 while( __builtin_expect(__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST), false) ) { 164 167 while(__atomic_load_n(ll, (int)__ATOMIC_RELAXED)) … … 166 169 } 167 170 /* paranoid */ verify(*ll); 171 /* paranoid */ verify( ! __preemption_enabled() ); 168 172 } 169 173 170 174 // Non-Blocking acquire 171 175 static inline bool __atomic_try_acquire(volatile bool * ll) { 176 /* paranoid */ verify( ! __preemption_enabled() ); 177 /* paranoid */ verify(ll); 178 172 179 return !__atomic_exchange_n(ll, (bool)true, __ATOMIC_SEQ_CST); 173 180 } … … 175 182 // Release 176 183 static inline void __atomic_unlock(volatile bool * ll) { 184 /* paranoid */ verify( ! __preemption_enabled() ); 185 /* paranoid */ verify(ll); 177 186 /* paranoid */ verify(*ll); 178 187 __atomic_store_n(ll, (bool)false, __ATOMIC_RELEASE); -
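For reference, the three helpers above form a test-and-test-and-set spinlock over a single bool; a stand-alone version of the same pattern, without the paranoid checks and with conventional acquire/release ordering, is:

#include <stdbool.h>

static void spin_lock( volatile bool * ll ) {
	while ( __atomic_exchange_n( ll, true, __ATOMIC_ACQUIRE ) ) {   // test-and-set
		while ( __atomic_load_n( ll, __ATOMIC_RELAXED ) ) {}        // spin on a plain load first
	}
}

static bool spin_trylock( volatile bool * ll ) {
	return ! __atomic_exchange_n( ll, true, __ATOMIC_ACQUIRE );
}

static void spin_unlock( volatile bool * ll ) {
	__atomic_store_n( ll, false, __ATOMIC_RELEASE );
}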
libcfa/src/concurrency/kernel/startup.cfa
rebf8ca5 r23a08aa0 133 133 //----------------------------------------------------------------------------- 134 134 // Global state 135 thread_localstruct KernelThreadData __cfaabi_tls __attribute__ ((tls_model ( "initial-exec" ))) @= {135 __thread struct KernelThreadData __cfaabi_tls __attribute__ ((tls_model ( "initial-exec" ))) @= { 136 136 NULL, // cannot use 0p 137 137 NULL, … … 153 153 #elif defined(CFA_HAVE_LINUX_RSEQ_H) 154 154 extern "Cforall" { 155 __attribute__((aligned(64))) thread_localvolatile struct rseq __cfaabi_rseq @= {155 __attribute__((aligned(64))) __thread volatile struct rseq __cfaabi_rseq @= { 156 156 .cpu_id : RSEQ_CPU_ID_UNINITIALIZED, 157 157 }; -
libcfa/src/concurrency/preemption.cfa
rebf8ca5 r23a08aa0 238 238 //---------- 239 239 // special case for preemption since used often 240 __attribute__((optimize("no-reorder-blocks"))) bool __preemption_enabled() libcfa_ public {240 __attribute__((optimize("no-reorder-blocks"))) bool __preemption_enabled() libcfa_nopreempt libcfa_public { 241 241 // create a assembler label before 242 242 // marked as clobber all to avoid movement … … 272 272 } 273 273 274 extern "C" { 275 __attribute__((visibility("hidden"))) extern void * const __start_cfatext_nopreempt; 276 __attribute__((visibility("hidden"))) extern void * const __stop_cfatext_nopreempt; 277 278 extern const __cfa_nopreempt_region __libcfa_nopreempt; 279 __attribute__((visibility("protected"))) const __cfa_nopreempt_region __libcfathrd_nopreempt @= { 280 (void * const)&__start_cfatext_nopreempt, 281 (void * const)&__stop_cfatext_nopreempt 282 }; 283 } 284 285 static inline bool __cfaabi_in( void * const ip, const struct __cfa_nopreempt_region & const region ) { 286 return ip >= region.start && ip <= region.stop; 287 } 288 274 289 275 290 //---------- 276 291 // Get data from the TLS block 277 292 // struct asm_region __cfaasm_get; 278 uintptr_t __cfatls_get( unsigned long int offset ) __attribute__((__noinline__, visibility("default"))); //no inline to avoid problems293 uintptr_t __cfatls_get( unsigned long int offset ) libcfa_nopreempt libcfa_public; //no inline to avoid problems 279 294 uintptr_t __cfatls_get( unsigned long int offset ) { 280 295 // create a assembler label before … … 295 310 extern "C" { 296 311 // Disable interrupts by incrementing the counter 297 __attribute__((__noinline__, visibility("default"))) void disable_interrupts()libcfa_public {312 void disable_interrupts() libcfa_nopreempt libcfa_public { 298 313 // create a assembler label before 299 314 // marked as clobber all to avoid movement … … 326 341 // Enable interrupts by decrementing the counter 327 342 // If counter reaches 0, execute any pending __cfactx_switch 328 void enable_interrupts( bool poll ) libcfa_ public {343 void enable_interrupts( bool poll ) libcfa_nopreempt libcfa_public { 329 344 // Cache the processor now since interrupts can start happening after the atomic store 330 345 processor * proc = __cfaabi_tls.this_processor; … … 358 373 } 359 374 } 375 376 // Check whether or not there is pending preemption 377 // force_yield( __POLL_PREEMPTION ) if appropriate 378 // return true if the thread was in an interruptable state 379 // i.e. on a real processor and not in the kernel 380 // (can return true even if no preemption was pending) 381 bool poll_interrupts() libcfa_public { 382 // Cache the processor now since interrupts can start happening after the atomic store 383 processor * proc = publicTLS_get( this_processor ); 384 if ( ! proc ) return false; 385 if ( ! 
__preemption_enabled() ) return false; 386 387 with( __cfaabi_tls.preemption_state ){ 388 // Signal the compiler that a fence is needed but only for signal handlers 389 __atomic_signal_fence(__ATOMIC_RELEASE); 390 if( proc->pending_preemption ) { 391 proc->pending_preemption = false; 392 force_yield( __POLL_PREEMPTION ); 393 } 394 } 395 396 return true; 397 } 360 398 } 361 399 … … 463 501 464 502 //----------------------------------------------------------------------------- 465 // Some assembly required466 #if defined( __i386 )467 #ifdef __PIC__468 #define RELOC_PRELUDE( label ) \469 "calll .Lcfaasm_prelude_" #label "$pb\n\t" \470 ".Lcfaasm_prelude_" #label "$pb:\n\t" \471 "popl %%eax\n\t" \472 ".Lcfaasm_prelude_" #label "_end:\n\t" \473 "addl $_GLOBAL_OFFSET_TABLE_+(.Lcfaasm_prelude_" #label "_end-.Lcfaasm_prelude_" #label "$pb), %%eax\n\t"474 #define RELOC_PREFIX ""475 #define RELOC_SUFFIX "@GOT(%%eax)"476 #else477 #define RELOC_PREFIX "$"478 #define RELOC_SUFFIX ""479 #endif480 #define __cfaasm_label( label ) struct asm_region label = \481 ({ \482 struct asm_region region; \483 asm( \484 RELOC_PRELUDE( label ) \485 "movl " RELOC_PREFIX "__cfaasm_" #label "_before" RELOC_SUFFIX ", %[vb]\n\t" \486 "movl " RELOC_PREFIX "__cfaasm_" #label "_after" RELOC_SUFFIX ", %[va]\n\t" \487 : [vb]"=r"(region.before), [va]"=r"(region.after) \488 ); \489 region; \490 });491 #elif defined( __x86_64 )492 #ifdef __PIC__493 #define RELOC_PREFIX ""494 #define RELOC_SUFFIX "@GOTPCREL(%%rip)"495 #else496 #define RELOC_PREFIX "$"497 #define RELOC_SUFFIX ""498 #endif499 #define __cfaasm_label( label ) struct asm_region label = \500 ({ \501 struct asm_region region; \502 asm( \503 "movq " RELOC_PREFIX "__cfaasm_" #label "_before" RELOC_SUFFIX ", %[vb]\n\t" \504 "movq " RELOC_PREFIX "__cfaasm_" #label "_after" RELOC_SUFFIX ", %[va]\n\t" \505 : [vb]"=r"(region.before), [va]"=r"(region.after) \506 ); \507 region; \508 });509 #elif defined( __aarch64__ )510 #ifdef __PIC__511 // Note that this works only for gcc512 #define __cfaasm_label( label ) struct asm_region label = \513 ({ \514 struct asm_region region; \515 asm( \516 "adrp %[vb], _GLOBAL_OFFSET_TABLE_" "\n\t" \517 "ldr %[vb], [%[vb], #:gotpage_lo15:__cfaasm_" #label "_before]" "\n\t" \518 "adrp %[va], _GLOBAL_OFFSET_TABLE_" "\n\t" \519 "ldr %[va], [%[va], #:gotpage_lo15:__cfaasm_" #label "_after]" "\n\t" \520 : [vb]"=r"(region.before), [va]"=r"(region.after) \521 ); \522 region; \523 });524 #else525 #error this is not the right thing to do526 /*527 #define __cfaasm_label( label ) struct asm_region label = \528 ({ \529 struct asm_region region; \530 asm( \531 "adrp %[vb], __cfaasm_" #label "_before" "\n\t" \532 "add %[vb], %[vb], :lo12:__cfaasm_" #label "_before" "\n\t" \533 "adrp %[va], :got:__cfaasm_" #label "_after" "\n\t" \534 "add %[va], %[va], :lo12:__cfaasm_" #label "_after" "\n\t" \535 : [vb]"=r"(region.before), [va]"=r"(region.after) \536 ); \537 region; \538 });539 */540 #endif541 #else542 #error unknown hardware architecture543 #endif544 545 503 // KERNEL ONLY 546 504 // Check if a __cfactx_switch signal handler shoud defer … … 548 506 // If false : preemption is unsafe and marked as pending 549 507 static inline bool preemption_ready( void * ip ) { 550 // Get all the region for which it is not safe to preempt551 __cfaasm_label( get );552 __cfaasm_label( check );553 __cfaasm_label( dsable );554 // __cfaasm_label( debug );555 556 508 // Check if preemption is safe 557 509 bool ready = true; 558 if( __cfaasm_in( ip, get ) ) { ready = false; goto EXIT; 
}; 559 if( __cfaasm_in( ip, check ) ) { ready = false; goto EXIT; }; 560 if( __cfaasm_in( ip, dsable ) ) { ready = false; goto EXIT; }; 561 // if( __cfaasm_in( ip, debug ) ) { ready = false; goto EXIT; }; 510 if( __cfaabi_in( ip, __libcfa_nopreempt ) ) { ready = false; goto EXIT; }; 511 if( __cfaabi_in( ip, __libcfathrd_nopreempt ) ) { ready = false; goto EXIT; }; 512 562 513 if( !__cfaabi_tls.preemption_state.enabled) { ready = false; goto EXIT; }; 563 514 if( __cfaabi_tls.preemption_state.in_progress ) { ready = false; goto EXIT; }; … … 643 594 // Kernel Signal Handlers 644 595 //============================================================================================= 645 __cfaabi_dbg_debug_do( static thread_localvoid * last_interrupt = 0; )596 __cfaabi_dbg_debug_do( static __thread void * last_interrupt = 0; ) 646 597 647 598 // Context switch signal handler -
libcfa/src/startup.cfa
rebf8ca5 r23a08aa0 41 41 } // __cfaabi_appready_shutdown 42 42 43 void disable_interrupts() __attribute__(( weak )) libcfa_public {} 44 void enable_interrupts() __attribute__(( weak )) libcfa_public {} 43 void disable_interrupts() __attribute__(( weak )) libcfa_nopreempt libcfa_public {} 44 void enable_interrupts() __attribute__(( weak )) libcfa_nopreempt libcfa_public {} 45 bool poll_interrupts() __attribute__(( weak )) libcfa_nopreempt libcfa_public { return false; } 46 47 __attribute__((visibility("hidden"))) extern void * const __start_cfatext_nopreempt; 48 __attribute__((visibility("hidden"))) extern void * const __stop_cfatext_nopreempt; 49 50 __attribute__((visibility("protected"))) const __cfa_nopreempt_region __libcfa_nopreempt @= { 51 (void * const)&__start_cfatext_nopreempt, 52 (void * const)&__stop_cfatext_nopreempt 53 }; 45 54 46 55 -
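The __start_cfatext_nopreempt / __stop_cfatext_nopreempt declarations lean on a GNU linker convention: for any section whose name is a valid C identifier, the linker synthesizes __start_<name> and __stop_<name> symbols bounding it. A self-contained illustration of that convention (the section and function names here are made up, not the ones libcfa uses):

    #include <cstdio>

    // Place a function into a custom, identifier-named text section.
    __attribute__((section("mysection"), used))
    static void probe() {}

    // Provided automatically by GNU ld for sections with C-identifier names.
    extern const char __start_mysection[];
    extern const char __stop_mysection[];

    int main() {
        std::printf( "mysection spans %p..%p\n",
                     (const void *)__start_mysection,
                     (const void *)__stop_mysection );
    }

Functions compiled into the cfatext_nopreempt section (presumably via the libcfa_nopreempt attribute macro used above) therefore fall between the two symbols, which is exactly what the preemption handler checks.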
src/AST/Decl.hpp
rebf8ca5 r23a08aa0 217 217 218 218 /// convenience accessor to match Type::isComplete() 219 bool isComplete() { return sized; }219 bool isComplete() const { return sized; } 220 220 221 221 const Decl * accept( Visitor & v ) const override { return v.visit( this ); } -
src/AST/DeclReplacer.cpp
rebf8ca5 r23a08aa0 9 9 // Author : Aaron B. Moss 10 10 // Created On : Wed May 8 13:00:00 2019 11 // Last Modified By : A aron B. Moss12 // Last Modified On : Wed May 8 13:00:00 201913 // Update Count : 111 // Last Modified By : Andrew Beach 12 // Last Modified On : Thr Sep 15 11:55:00 2022 13 // Update Count : 2 14 14 // 15 15 16 16 #include "DeclReplacer.hpp" 17 17 18 #include "Expr.hpp" 19 #include "Pass.hpp" 18 20 #include "Type.hpp" 19 20 #include "Pass.hpp"21 21 22 22 namespace ast { 23 23 24 24 namespace DeclReplacer { 25 namespace {26 struct DeclReplacer {27 private:28 const DeclMap & declMap;29 const TypeMap & typeMap;30 bool debug;31 25 32 public: 33 DeclReplacer(const DeclMap & declMap, const TypeMap & typeMap, bool debug) 34 : declMap( declMap ), typeMap( typeMap ), debug( debug ) 35 {} 26 namespace { 27 struct DeclReplacer { 28 private: 29 const DeclMap & declMap; 30 const TypeMap & typeMap; 31 bool debug; 36 32 37 const ast::VariableExpr * previsit( const ast::VariableExpr * ); 38 const ast::TypeInstType * previsit( const ast::TypeInstType * ); 39 }; 33 public: 34 DeclReplacer( const DeclMap & declMap, const TypeMap & typeMap, bool debug ) 35 : declMap( declMap ), typeMap( typeMap ), debug( debug ) 36 {} 40 37 41 struct VarExprReplacer { 42 private: 43 const ExprMap & exprMap; 44 45 public: 46 VarExprReplacer(const ExprMap & exprMap): exprMap (exprMap) {} 38 const ast::VariableExpr * previsit( const ast::VariableExpr * ); 39 const ast::TypeInstType * previsit( const ast::TypeInstType * ); 40 }; 47 41 48 const Expr * postvisit (const VariableExpr *); 49 }; 42 struct VarExprReplacer { 43 private: 44 const ExprMap & exprMap; 45 46 public: 47 VarExprReplacer( const ExprMap & exprMap ) : exprMap( exprMap ) {} 48 49 const Expr * postvisit( const VariableExpr * ); 50 }; 51 } // namespace 52 53 const ast::Node * replace( const ast::Node * node, const DeclMap & declMap, const TypeMap & typeMap, bool debug ) { 54 if(!node) return nullptr; 55 Pass<DeclReplacer> replacer = { declMap, typeMap, debug }; 56 return node->accept( replacer ); 57 } 58 59 const ast::Node * replace( const ast::Node * node, const DeclMap & declMap, bool debug ) { 60 TypeMap typeMap; 61 return replace( node, declMap, typeMap, debug ); 62 } 63 64 const ast::Node * replace( const ast::Node * node, const TypeMap & typeMap, bool debug ) { 65 DeclMap declMap; 66 return replace( node, declMap, typeMap, debug ); 67 } 68 69 const ast::Node * replace( const ast::Node * node, const ExprMap & exprMap ) { 70 Pass<VarExprReplacer> replacer = {exprMap}; 71 return node->accept( replacer ); 72 } 73 74 namespace { 75 // replace variable with new node from decl map 76 const ast::VariableExpr * DeclReplacer::previsit( const VariableExpr * varExpr ) { 77 // xxx - assertions and parameters aren't accounted for in this... (i.e. 
they aren't inserted into the map when it's made, only DeclStmts are) 78 if ( !declMap.count( varExpr->var ) ) return varExpr; 79 80 auto replacement = declMap.at( varExpr->var ); 81 if ( debug ) { 82 std::cerr << "replacing variable reference: " 83 << (void*)varExpr->var.get() << " " << varExpr->var 84 << " with " << (void*)replacement << " " << replacement 85 << std::endl; 86 } 87 auto nexpr = mutate(varExpr); 88 nexpr->var = replacement; 89 return nexpr; 50 90 } 51 91 52 const ast::Node * replace( const ast::Node * node, const DeclMap & declMap, const TypeMap & typeMap, bool debug ) { 53 if(!node) return nullptr; 54 Pass<DeclReplacer> replacer = { declMap, typeMap, debug }; 55 return node->accept( replacer ); 92 const TypeInstType * DeclReplacer::previsit( const TypeInstType * inst ) { 93 if ( !typeMap.count( inst->base ) ) return inst; 94 95 auto replacement = typeMap.at( inst->base ); 96 if ( debug ) { 97 std::cerr << "replacing type reference: " 98 << (void*)inst->base.get() << " " << inst->base 99 << " with " << (void*)replacement << " " << replacement 100 << std::endl; 101 } 102 auto ninst = mutate(inst); 103 ninst->base = replacement; 104 return ninst; 56 105 } 57 106 58 const ast::Node * replace( const ast::Node * node, const DeclMap & declMap, bool debug) {59 TypeMap typeMap;60 return replace( node, declMap, typeMap, debug);107 const Expr * VarExprReplacer::postvisit( const VariableExpr * expr ) { 108 if ( !exprMap.count( expr->var ) ) return expr; 109 return exprMap.at( expr->var ); 61 110 } 111 } // namespace 62 112 63 const ast::Node * replace( const ast::Node * node, const TypeMap & typeMap, bool debug ) { 64 DeclMap declMap; 65 return replace( node, declMap, typeMap, debug ); 66 } 113 } // namespace DeclReplacer 67 114 68 const ast::Node * replace( const ast::Node * node, const ExprMap & exprMap) { 69 Pass<VarExprReplacer> replacer = {exprMap}; 70 return node->accept( replacer ); 71 } 72 73 namespace { 74 // replace variable with new node from decl map 75 const ast::VariableExpr * DeclReplacer::previsit( const VariableExpr * varExpr ) { 76 // xxx - assertions and parameters aren't accounted for in this... (i.e. they aren't inserted into the map when it's made, only DeclStmts are) 77 if ( !declMap.count( varExpr->var ) ) return varExpr; 78 79 auto replacement = declMap.at( varExpr->var ); 80 if ( debug ) { 81 std::cerr << "replacing variable reference: " 82 << (void*)varExpr->var.get() << " " << varExpr->var 83 << " with " << (void*)replacement << " " << replacement 84 << std::endl; 85 } 86 auto nexpr = mutate(varExpr); 87 nexpr->var = replacement; 88 return nexpr; 89 } 90 91 const TypeInstType * DeclReplacer::previsit( const TypeInstType * inst ) { 92 if ( !typeMap.count( inst->base ) ) return inst; 93 94 auto replacement = typeMap.at( inst->base ); 95 if ( debug ) { 96 std::cerr << "replacing type reference: " 97 << (void*)inst->base.get() << " " << inst->base 98 << " with " << (void*)replacement << " " << replacement 99 << std::endl; 100 } 101 auto ninst = mutate(inst); 102 ninst->base = replacement; 103 return ninst; 104 } 105 106 const Expr * VarExprReplacer::postvisit( const VariableExpr * expr ) { 107 if (!exprMap.count(expr->var)) return expr; 108 109 return exprMap.at(expr->var); 110 } 111 112 } 113 } 114 115 } 115 } // namespace ast 116 116 117 117 // Local Variables: // -
src/AST/Pass.hpp
rebf8ca5 r23a08aa0 327 327 struct PureVisitor {}; 328 328 329 struct WithCodeLocation { 330 const CodeLocation * location = nullptr; 331 }; 332 329 333 /// Keep track of the polymorphic const TypeSubstitution * typeSubs for the current expression. 330 334 struct WithConstTypeSubstitution { -
src/AST/Pass.impl.hpp
rebf8ca5 r23a08aa0 25 25 #define VISIT_START( node ) \ 26 26 using namespace ast; \ 27 /* back-up the last known code location */ \ 28 __attribute__((unused)) auto loc_guard = ast::__pass::make_location_guard( core, node, 0 ); \ 27 29 /* back-up the visit children */ \ 28 30 __attribute__((unused)) ast::__pass::visit_children_guard guard1( ast::__pass::visit_children(core, 0) ); \ -
src/AST/Pass.proto.hpp
rebf8ca5 r23a08aa0 326 326 } 327 327 328 template< typename core_t, typename node_t > 329 static auto make_location_guard( core_t & core, node_t * node, int ) 330 -> decltype( node->location, ValueGuardPtr<const CodeLocation *>( &core.location ) ) { 331 ValueGuardPtr<const CodeLocation *> guard( &core.location ); 332 core.location = &node->location; 333 return guard; 334 } 335 336 template< typename core_t, typename node_t > 337 static auto make_location_guard( core_t &, node_t *, long ) -> int { 338 return 0; 339 } 340 328 341 // Another feature of the templated visitor is that it calls beginScope()/endScope() for compound statement. 329 342 // All passes which have such functions are assumed desire this behaviour -
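make_location_guard follows the same overload-ranking idiom as the other optional pass hooks in this header: the int overload is only viable when the trailing decltype (which names node->location and core.location) is well-formed, and the long overload is the catch-all that loses the tie-break whenever the first one applies. A stripped-down sketch of the idiom with hypothetical names, for readers unfamiliar with it:

    #include <iostream>

    struct WithFeature    { int data = 0; };
    struct WithoutFeature { };

    // Preferred overload: SFINAE keeps it only when core.data exists,
    // and the exact `int` parameter beats the fallback below.
    template< typename core_t >
    auto use_feature( core_t & core, int ) -> decltype( core.data, void() ) {
        core.data += 1;
        std::cout << "feature enabled\n";
    }

    // Fallback: viable for any core, but the int -> long conversion makes it
    // a worse match, so it is chosen only when the overload above drops out.
    template< typename core_t >
    void use_feature( core_t &, long ) {
        std::cout << "feature skipped\n";
    }

    int main() {
        WithFeature a; WithoutFeature b;
        use_feature( a, 0 );   // feature enabled
        use_feature( b, 0 );   // feature skipped
    }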
src/AST/Print.cpp
rebf8ca5 r23a08aa0 33 33 { 34 34 return array<C,sizeof...(T)>{ 35 forward<T>(values)...35 std::forward<T>(values)... 36 36 }; 37 37 } … … 86 86 87 87 static constexpr auto StorageClasses = make_array<const char*>( 88 "extern", "static", "auto", "register", "_ Thread_local"88 "extern", "static", "auto", "register", "__thread", "_Thread_local" 89 89 ); 90 90 … … 215 215 ++indent; 216 216 ptrToEnum->base->accept( *this ); 217 --indent; 217 --indent; 218 218 } 219 219 … … 1623 1623 // if the wrong size is specified 1624 1624 constexpr array<const char*, 3> Printer::Names::FuncSpecifiers; 1625 constexpr array<const char*, 5> Printer::Names::StorageClasses;1625 constexpr array<const char*, 6> Printer::Names::StorageClasses; 1626 1626 constexpr array<const char*, 6> Printer::Names::Qualifiers; 1627 1627 } -
src/AST/StorageClasses.hpp
rebf8ca5 r23a08aa0 24 24 /// Bitflags for storage classes 25 25 enum { 26 Extern = 1 << 0, 27 Static = 1 << 1, 28 Auto = 1 << 2, 29 Register = 1 << 3, 30 ThreadLocal = 1 << 4, 31 NumClasses = 5 26 Extern = 1 << 0, 27 Static = 1 << 1, 28 Auto = 1 << 2, 29 Register = 1 << 3, 30 ThreadLocalGcc = 1 << 4, 31 ThreadLocalC11 = 1 << 5, 32 NumClasses = 6 32 33 }; 33 34 … … 37 38 unsigned int val; 38 39 struct { 39 bool is_extern : 1; 40 bool is_static : 1; 41 bool is_auto : 1; 42 bool is_register : 1; 43 bool is_threadlocal : 1; 40 bool is_extern : 1; 41 bool is_static : 1; 42 bool is_auto : 1; 43 bool is_register : 1; 44 bool is_threadlocalGcc : 1; 45 bool is_threadlocalC11 : 1; 44 46 }; 45 47 … … 48 50 49 51 constexpr class_flags( unsigned int val = 0 ) : val(val) {} 52 53 bool is_threadlocal_any() { return this->is_threadlocalC11 || this->is_threadlocalGcc; } 50 54 }; 51 55 -
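Splitting the old ThreadLocal flag into ThreadLocalGcc and ThreadLocalC11 keeps the two spellings distinguishable while is_threadlocal_any() still answers the combined question. The class overlays an integer mask and one-bit fields in a union, so flags can be set individually or manipulated as a word; a reduced sketch of that layout and its use (the real class also pulls in the BFCommon helpers, and, like the original, the mask/bit-field punning relies on GCC's union semantics):

    #include <cassert>

    union StorageFlags {                       // simplified stand-in, not the real type
        unsigned int val;                      // all flags as one mask
        struct {
            bool is_extern         : 1;
            bool is_static         : 1;
            bool is_auto           : 1;
            bool is_register       : 1;
            bool is_threadlocalGcc : 1;        // set for __thread
            bool is_threadlocalC11 : 1;        // set for _Thread_local
        };

        StorageFlags( unsigned int v = 0 ) : val( v ) {}
        bool is_threadlocal_any() const { return is_threadlocalGcc || is_threadlocalC11; }
    };

    int main() {
        StorageFlags sc( 1u << 4 );            // the ThreadLocalGcc bit from the enum above
        assert( sc.is_threadlocal_any() && !sc.is_threadlocalC11 );
    }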
src/AST/Type.cpp
rebf8ca5 r23a08aa0 143 143 TraitInstType::TraitInstType( 144 144 const TraitDecl * b, CV::Qualifiers q, std::vector<ptr<Attribute>>&& as ) 145 : BaseInstType( b->name, q, move(as) ), base( b ) {}145 : BaseInstType( b->name, q, std::move(as) ), base( b ) {} 146 146 147 147 // --- TypeInstType … … 149 149 TypeInstType::TypeInstType( const TypeDecl * b, 150 150 CV::Qualifiers q, std::vector<ptr<Attribute>> && as ) 151 : BaseInstType( b->name, q, move(as) ), base( b ), kind( b->kind ) {}151 : BaseInstType( b->name, q, std::move(as) ), base( b ), kind( b->kind ) {} 152 152 153 153 void TypeInstType::set_base( const TypeDecl * b ) { … … 161 161 162 162 TupleType::TupleType( std::vector<ptr<Type>> && ts, CV::Qualifiers q ) 163 : Type( q ), types( move(ts) ), members() {163 : Type( q ), types( std::move(ts) ), members() { 164 164 // This constructor is awkward. `TupleType` needs to contain objects so that members can be 165 165 // named, but members without initializer nodes end up getting constructors, which breaks -
src/AST/Type.hpp
rebf8ca5 r23a08aa0 83 83 template< enum Node::ref_type ref_t > 84 84 void reset_qualifiers( ptr_base< Type, ref_t > & p, CV::Qualifiers q = {} ) { 85 if ( p->qualifiers .val != q.val) p.get_and_mutate()->qualifiers = q;85 if ( p->qualifiers != q ) p.get_and_mutate()->qualifiers = q; 86 86 } 87 87 … … 89 89 template< enum Node::ref_type ref_t > 90 90 void add_qualifiers( ptr_base< Type, ref_t > & p, CV::Qualifiers q ) { 91 if ( ( p->qualifiers .val & q.val ) != q.val) p.get_and_mutate()->qualifiers |= q;91 if ( ( p->qualifiers & q ) != q ) p.get_and_mutate()->qualifiers |= q; 92 92 } 93 93 … … 95 95 template< enum Node::ref_type ref_t > 96 96 void remove_qualifiers( ptr_base< Type, ref_t > & p, CV::Qualifiers q ) { 97 if ( ( p->qualifiers .val & q.val) != 0 ) p.get_and_mutate()->qualifiers -= q;97 if ( ( p->qualifiers & q ) != 0 ) p.get_and_mutate()->qualifiers -= q; 98 98 } 99 99 … … 412 412 std::string typeString() const { return std::string("_") + std::to_string(formal_usage) + "_" + std::to_string(expr_id) + "_" + base->name; } 413 413 bool operator==(const TypeEnvKey & other) const { return base == other.base && formal_usage == other.formal_usage && expr_id == other.expr_id; } 414 415 414 }; 416 415 -
src/CodeGen/CodeGenerator.cc
rebf8ca5 r23a08aa0 493 493 assert( false ); 494 494 } // switch 495 } else if( varExpr->get_var()->get_linkage() == LinkageSpec::BuiltinCFA && varExpr->get_var()->get_name() == "intptr" ) { 496 // THIS is a hack to make it a constant until a proper constexpr solution is created 497 output << "((void*)"; 498 std::list< Expression* >::iterator arg = applicationExpr->get_args().begin(); 499 (*arg++)->accept( *visitor ); 500 output << ")"; 495 501 } else { 496 502 varExpr->accept( *visitor ); -
src/Common/utility.h
rebf8ca5 r23a08aa0 322 322 323 323 ValueGuardPtr(T * inRef) : old( inRef ? *inRef : T() ), ref(inRef) {} 324 ValueGuardPtr(const ValueGuardPtr& other) = delete; 325 ValueGuardPtr(ValueGuardPtr&& other) : old(other.old), ref(other.ref) { other.ref = nullptr; } 324 326 ~ValueGuardPtr() { if( ref ) *ref = old; } 325 327 }; -
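The new move constructor (plus the deleted copy) lets a ValueGuardPtr be handed back from a factory, as make_location_guard above does, without the temporary's destructor restoring the saved value too early; only the final owner performs the restore. A reduced sketch of the save-and-restore guard with the same semantics (names simplified):

    #include <cassert>

    template< typename T >
    struct ValueGuard {
        T old;        // copy of the guarded value at construction
        T * ref;      // the live value; null once this guard has been moved from

        ValueGuard( T * p ) : old( p ? *p : T() ), ref( p ) {}
        ValueGuard( const ValueGuard & ) = delete;
        ValueGuard( ValueGuard && other ) : old( other.old ), ref( other.ref ) {
            other.ref = nullptr;              // the moved-from guard must not restore
        }
        ~ValueGuard() { if ( ref ) *ref = old; }
    };

    int main() {
        int depth = 1;
        {
            ValueGuard<int> g( &depth );
            depth = 42;                       // temporarily overridden
        }                                     // guard restores the saved value here
        assert( depth == 1 );
    }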
src/CompilationState.cc
rebf8ca5 r23a08aa0 31 31 genproto = false, 32 32 deterministic_output = false, 33 useNewAST = CFA_USE_NEW_AST,33 useNewAST = true, 34 34 nomainp = false, 35 35 parsep = false, -
src/Concurrency/Keywords.cc
rebf8ca5 r23a08aa0 508 508 ObjectDecl * vtable_object = Virtual::makeVtableForward( 509 509 "_default_vtable_object_declaration", 510 vtable_decl->makeInst( move( poly_args ) ) );510 vtable_decl->makeInst( std::move( poly_args ) ) ); 511 511 declsToAddBefore.push_back( vtable_object ); 512 512 declsToAddBefore.push_back( … … 681 681 void lock (monitor_t & this) { 682 682 lock(get_monitor(this)); 683 } 683 } 684 684 */ 685 685 FunctionDecl * lock_decl = new FunctionDecl( … … 700 700 CompoundStmt * lock_statement = new CompoundStmt(); 701 701 lock_statement->push_back( 702 new ExprStmt( 702 new ExprStmt( 703 703 new UntypedExpr ( 704 704 new NameExpr( "lock" ), … … 716 716 void unlock (monitor_t & this) { 717 717 unlock(get_monitor(this)); 718 } 718 } 719 719 */ 720 720 FunctionDecl * unlock_decl = new FunctionDecl( … … 736 736 737 737 unlock_statement->push_back( 738 new ExprStmt( 738 new ExprStmt( 739 739 new UntypedExpr( 740 740 new NameExpr( "unlock" ), … … 746 746 ); 747 747 unlock_decl->set_statements( unlock_statement ); 748 748 749 749 // pushes routines to declsToAddAfter to add at a later time 750 750 declsToAddAfter.push_back( lock_decl ); … … 1054 1054 assert( !thread_guard_decl ); 1055 1055 thread_guard_decl = decl; 1056 } 1056 } 1057 1057 else if ( decl->name == "__mutex_stmt_lock_guard" && decl->body ) { 1058 1058 assert( !lock_guard_decl ); … … 1206 1206 new NameExpr( "__get_mutexstmt_lock_type" ), 1207 1207 { args.front()->clone() } 1208 ) 1208 ) 1209 1209 ) 1210 1210 ), … … 1225 1225 1226 1226 StructInstType * lock_guard_struct = new StructInstType( noQualifiers, lock_guard_decl ); 1227 TypeExpr * lock_type_expr = new TypeExpr( 1227 TypeExpr * lock_type_expr = new TypeExpr( 1228 1228 new TypeofType( noQualifiers, new UntypedExpr( 1229 1229 new NameExpr( "__get_mutexstmt_lock_type" ), 1230 1230 { args.front()->clone() } 1231 ) 1232 ) 1231 ) 1232 ) 1233 1233 ); 1234 1234 -
src/Concurrency/Waitfor.cc
rebf8ca5 r23a08aa0 402 402 403 403 clause.target.function = nullptr; 404 clause.target.arguments. empty();404 clause.target.arguments.clear(); 405 405 clause.condition = nullptr; 406 406 } -
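The arguments.empty() to arguments.clear() change is a real bug fix rather than a rename: empty() is a const query that reports whether the container holds elements and removes nothing, so the old call had no effect. A two-line reminder of the difference:

    #include <cassert>
    #include <list>

    int main() {
        std::list<int> args = { 1, 2, 3 };
        bool was_empty = args.empty();   // false: just a query, list unchanged
        args.clear();                    // actually removes the elements
        assert( !was_empty && args.empty() );
    }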
src/Concurrency/WaitforNew.cpp
rebf8ca5 r23a08aa0 101 101 namespace { 102 102 103 class GenerateWaitForCore :103 class GenerateWaitForCore final : 104 104 public ast::WithSymbolTable, public ast::WithConstTranslationUnit { 105 105 const ast::FunctionDecl * decl_waitfor = nullptr; -
src/ControlStruct/ExceptTranslateNew.cpp
rebf8ca5 r23a08aa0 32 32 } 33 33 34 class TranslateThrowsCore : public ast::WithGuards {34 class TranslateThrowsCore final : public ast::WithGuards { 35 35 const ast::ObjectDecl * terminateHandlerExcept; 36 36 enum Context { NoHandler, TerHandler, ResHandler } currentContext; … … 136 136 137 137 138 class TryMutatorCore {138 class TryMutatorCore final { 139 139 // The built in types used in translation. 140 140 const ast::StructDecl * except_decl; -
src/ControlStruct/LabelFixer.cc
rebf8ca5 r23a08aa0 119 119 120 120 // Builds a table that maps a label to its defining statement. 121 std::map<Label, Statement * > * LabelFixer::resolveJumps() throw ( SemanticErrorException ){121 std::map<Label, Statement * > * LabelFixer::resolveJumps() { 122 122 std::map< Label, Statement * > *ret = new std::map< Label, Statement * >(); 123 123 for ( std::map< Label, Entry * >::iterator i = labelTable.begin(); i != labelTable.end(); ++i ) { -
src/ControlStruct/LabelFixer.h
rebf8ca5 r23a08aa0 33 33 LabelFixer( LabelGenerator *gen = 0 ); 34 34 35 std::map < Label, Statement * > *resolveJumps() throw ( SemanticErrorException );35 std::map < Label, Statement * > *resolveJumps(); 36 36 37 37 // Declarations -
src/ControlStruct/MLEMutator.cc
rebf8ca5 r23a08aa0 141 141 142 142 143 Statement *MultiLevelExitMutator::postmutate( BranchStmt *branchStmt ) 144 throw ( SemanticErrorException ) { 143 Statement *MultiLevelExitMutator::postmutate( BranchStmt *branchStmt ) { 145 144 std::string originalTarget = branchStmt->originalTarget; 146 145 -
src/ControlStruct/MLEMutator.h
rebf8ca5 r23a08aa0 41 41 42 42 void premutate( CompoundStmt *cmpndStmt ); 43 Statement * postmutate( BranchStmt *branchStmt ) throw ( SemanticErrorException );43 Statement * postmutate( BranchStmt *branchStmt ); 44 44 void premutate( WhileDoStmt *whileDoStmt ); 45 45 Statement * postmutate( WhileDoStmt *whileDoStmt ); -
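Dropping the throw ( SemanticErrorException ) specifications here and in LabelFixer (and on the SynTree statement constructors further down) goes hand in hand with the -std=c++17 bump in src/Makefile.am: dynamic exception specifications were deprecated in C++11 and removed outright in C++17, leaving noexcept as the only exception specification. For reference:

    struct E {};

    // void f() throw ( E );   // ill-formed under -std=c++17
    void g() noexcept;         // still valid: the one remaining specification
    void h();                  // unannotated functions may throw, as before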
src/GenPoly/GenPoly.cc
rebf8ca5 r23a08aa0 9 9 // Author : Richard C. Bilson 10 10 // Created On : Mon May 18 07:44:20 2015 11 // Last Modified By : Peter A. Buhr12 // Last Modified On : Wed Jun 29 21:45:53 201613 // Update Count : 1 411 // Last Modified By : Andrew Beach 12 // Last Modified On : Wed Sep 14 9:24:00 2022 13 // Update Count : 15 14 14 // 15 15 … … 83 83 } 84 84 85 bool hasDynParams( const std::vector<ast::ptr<ast::Expr>> & params, const TyVarMap &tyVars, const ast::TypeSubstitution *typeSubs ) { 86 for ( ast::ptr<ast::Expr> const & param : params ) { 87 auto paramType = param.as<ast::TypeExpr>(); 88 assertf( paramType, "Aggregate parameters should be type expressions." ); 89 if ( isDynType( paramType->type, tyVars, typeSubs ) ) { 90 return true; 91 } 92 } 93 return false; 94 } 95 85 96 /// Checks a parameter list for inclusion of polymorphic parameters; will substitute according to env if present 86 97 bool includesPolyParams( std::list< Expression* >& params, const TypeSubstitution *env ) { … … 198 209 } 199 210 return 0; 211 } 212 213 const ast::BaseInstType *isDynType( const ast::Type *type, const TyVarMap &tyVars, const ast::TypeSubstitution *typeSubs ) { 214 type = replaceTypeInst( type, typeSubs ); 215 216 if ( auto inst = dynamic_cast<ast::TypeInstType const *>( type ) ) { 217 auto var = tyVars.find( inst->name ); 218 if ( var != tyVars.end() && var->second.isComplete ) { 219 return inst; 220 } 221 } else if ( auto inst = dynamic_cast<ast::StructInstType const *>( type ) ) { 222 if ( hasDynParams( inst->params, tyVars, typeSubs ) ) { 223 return inst; 224 } 225 } else if ( auto inst = dynamic_cast<ast::UnionInstType const *>( type ) ) { 226 if ( hasDynParams( inst->params, tyVars, typeSubs ) ) { 227 return inst; 228 } 229 } 230 return nullptr; 200 231 } 201 232 … … 378 409 inline D* as( B* p ) { return reinterpret_cast<D*>(p); } 379 410 411 template<typename D, typename B> 412 inline D const * as( B const * p ) { 413 return reinterpret_cast<D const *>( p ); 414 } 415 380 416 /// Flattens a declaration list 381 417 template<typename Output> … … 391 427 for ( Type* ty : src ) { 392 428 ResolvExpr::flatten( ty, out ); 429 } 430 } 431 432 void flattenList( vector<ast::ptr<ast::Type>> const & src, 433 vector<ast::ptr<ast::Type>> & out ) { 434 for ( auto const & type : src ) { 435 ResolvExpr::flatten( type, out ); 393 436 } 394 437 } … … 409 452 // if ( is<VoidType>( aparam->get_type() ) || is<VoidType>( bparam->get_type() ) ) continue; 410 453 if ( ! typesPolyCompatible( aparam->get_type(), bparam->get_type() ) ) return false; 454 } 455 456 return true; 457 } 458 459 bool paramListsPolyCompatible( 460 std::vector<ast::ptr<ast::Expr>> const & lparams, 461 std::vector<ast::ptr<ast::Expr>> const & rparams ) { 462 if ( lparams.size() != rparams.size() ) { 463 return false; 464 } 465 466 for ( auto lparam = lparams.begin(), rparam = rparams.begin() ; 467 lparam != lparams.end() ; ++lparam, ++rparam ) { 468 ast::TypeExpr const * lexpr = lparam->as<ast::TypeExpr>(); 469 assertf( lexpr, "Aggregate parameters should be type expressions" ); 470 ast::TypeExpr const * rexpr = rparam->as<ast::TypeExpr>(); 471 assertf( rexpr, "Aggregate parameters should be type expressions" ); 472 473 // xxx - might need to let VoidType be a wildcard here too; could have some voids 474 // stuffed in for dtype-statics. 
475 // if ( is<VoidType>( lexpr->type() ) || is<VoidType>( bparam->get_type() ) ) continue; 476 if ( !typesPolyCompatible( lexpr->type, rexpr->type ) ) { 477 return false; 478 } 411 479 } 412 480 … … 505 573 } 506 574 575 bool typesPolyCompatible( ast::Type const * lhs, ast::Type const * rhs ) { 576 type_index const lid = typeid(*lhs); 577 578 // Polymorphic types always match: 579 if ( type_index(typeid(ast::TypeInstType)) == lid ) return true; 580 581 type_index const rid = typeid(*rhs); 582 if ( type_index(typeid(ast::TypeInstType)) == rid ) return true; 583 584 // All other types only match if they are the same type: 585 if ( lid != rid ) return false; 586 587 // So remaining types can be examined case by case. 588 // Recurse through type structure (conditions borrowed from Unify.cc). 589 590 if ( type_index(typeid(ast::BasicType)) == lid ) { 591 return as<ast::BasicType>(lhs)->kind == as<ast::BasicType>(rhs)->kind; 592 } else if ( type_index(typeid(ast::PointerType)) == lid ) { 593 ast::PointerType const * l = as<ast::PointerType>(lhs); 594 ast::PointerType const * r = as<ast::PointerType>(rhs); 595 596 // void pointers should match any other pointer type. 597 return is<ast::VoidType>( l->base.get() ) 598 || is<ast::VoidType>( r->base.get() ) 599 || typesPolyCompatible( l->base.get(), r->base.get() ); 600 } else if ( type_index(typeid(ast::ReferenceType)) == lid ) { 601 ast::ReferenceType const * l = as<ast::ReferenceType>(lhs); 602 ast::ReferenceType const * r = as<ast::ReferenceType>(rhs); 603 604 // void references should match any other reference type. 605 return is<ast::VoidType>( l->base.get() ) 606 || is<ast::VoidType>( r->base.get() ) 607 || typesPolyCompatible( l->base.get(), r->base.get() ); 608 } else if ( type_index(typeid(ast::ArrayType)) == lid ) { 609 ast::ArrayType const * l = as<ast::ArrayType>(lhs); 610 ast::ArrayType const * r = as<ast::ArrayType>(rhs); 611 612 if ( l->isVarLen ) { 613 if ( !r->isVarLen ) return false; 614 } else { 615 if ( r->isVarLen ) return false; 616 617 auto lc = l->dimension.as<ast::ConstantExpr>(); 618 auto rc = r->dimension.as<ast::ConstantExpr>(); 619 if ( lc && rc && lc->intValue() != rc->intValue() ) { 620 return false; 621 } 622 } 623 624 return typesPolyCompatible( l->base.get(), r->base.get() ); 625 } else if ( type_index(typeid(ast::FunctionType)) == lid ) { 626 ast::FunctionType const * l = as<ast::FunctionType>(lhs); 627 ast::FunctionType const * r = as<ast::FunctionType>(rhs); 628 629 std::vector<ast::ptr<ast::Type>> lparams, rparams; 630 flattenList( l->params, lparams ); 631 flattenList( r->params, rparams ); 632 if ( lparams.size() != rparams.size() ) return false; 633 for ( unsigned i = 0; i < lparams.size(); ++i ) { 634 if ( !typesPolyCompatible( lparams[i], rparams[i] ) ) return false; 635 } 636 637 std::vector<ast::ptr<ast::Type>> lrets, rrets; 638 flattenList( l->returns, lrets ); 639 flattenList( r->returns, rrets ); 640 if ( lrets.size() != rrets.size() ) return false; 641 for ( unsigned i = 0; i < lrets.size(); ++i ) { 642 if ( !typesPolyCompatible( lrets[i], rrets[i] ) ) return false; 643 } 644 return true; 645 } else if ( type_index(typeid(ast::StructInstType)) == lid ) { 646 ast::StructInstType const * l = as<ast::StructInstType>(lhs); 647 ast::StructInstType const * r = as<ast::StructInstType>(rhs); 648 649 if ( l->name != r->name ) return false; 650 return paramListsPolyCompatible( l->params, r->params ); 651 } else if ( type_index(typeid(ast::UnionInstType)) == lid ) { 652 ast::UnionInstType const * l = 
as<ast::UnionInstType>(lhs); 653 ast::UnionInstType const * r = as<ast::UnionInstType>(rhs); 654 655 if ( l->name != r->name ) return false; 656 return paramListsPolyCompatible( l->params, r->params ); 657 } else if ( type_index(typeid(ast::EnumInstType)) == lid ) { 658 ast::EnumInstType const * l = as<ast::EnumInstType>(lhs); 659 ast::EnumInstType const * r = as<ast::EnumInstType>(rhs); 660 661 return l->name == r->name; 662 } else if ( type_index(typeid(ast::TraitInstType)) == lid ) { 663 ast::TraitInstType const * l = as<ast::TraitInstType>(lhs); 664 ast::TraitInstType const * r = as<ast::TraitInstType>(rhs); 665 666 return l->name == r->name; 667 } else if ( type_index(typeid(ast::TupleType)) == lid ) { 668 ast::TupleType const * l = as<ast::TupleType>(lhs); 669 ast::TupleType const * r = as<ast::TupleType>(rhs); 670 671 std::vector<ast::ptr<ast::Type>> ltypes, rtypes; 672 flattenList( l->types, ( ltypes ) ); 673 flattenList( r->types, ( rtypes ) ); 674 if ( ltypes.size() != rtypes.size() ) return false; 675 676 for ( unsigned i = 0 ; i < ltypes.size() ; ++i ) { 677 if ( !typesPolyCompatible( ltypes[i], rtypes[i] ) ) return false; 678 } 679 return true; 680 // The remaining types (VoidType, VarArgsType, ZeroType & OneType) 681 // have no variation so will always be equal. 682 } else { 683 return true; 684 } 685 } 686 507 687 namespace { 508 688 // temporary hack to avoid re-implementing anything related to TyVarMap -
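The new typesPolyCompatible overload keys its structural comparison on std::type_index(typeid(*lhs)) and only then downcasts with the local as<> helper, mirroring the SynTree version it sits beside. For readers unfamiliar with the idiom, a self-contained sketch of dispatch-on-dynamic-type followed by a per-case structural check (toy classes, nothing to do with the AST):

    #include <iostream>
    #include <typeindex>
    #include <typeinfo>

    struct Shape  { virtual ~Shape() = default; };
    struct Circle : Shape { double r; Circle( double r ) : r(r) {} };
    struct Square : Shape { double s; Square( double s ) : s(s) {} };

    // Two shapes only compare further if their dynamic types agree.
    bool same_shape( const Shape & a, const Shape & b ) {
        if ( std::type_index( typeid(a) ) != std::type_index( typeid(b) ) ) return false;
        if ( typeid(a) == typeid(Circle) )
            return static_cast<const Circle &>(a).r == static_cast<const Circle &>(b).r;
        return static_cast<const Square &>(a).s == static_cast<const Square &>(b).s;
    }

    int main() {
        Circle c1( 1 ), c2( 1 ); Square s( 1 );
        std::cout << same_shape( c1, c2 ) << same_shape( c1, s ) << '\n';   // prints 10
    }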
src/GenPoly/GenPoly.h
rebf8ca5 r23a08aa0 9 9 // Author : Richard C. Bilson 10 10 // Created On : Mon May 18 07:44:20 2015 11 // Last Modified By : Peter A. Buhr12 // Last Modified On : Sat Jul 22 09:22:57 201713 // Update Count : 711 // Last Modified By : Andrew Beach 12 // Last Modified On : Fri Aug 19 16:03:00 2022 13 // Update Count : 8 14 14 // 15 15 … … 27 27 namespace GenPoly { 28 28 29 // TODO Via some tricks this works for ast::TypeDecl::Data as well. 29 30 typedef ErasableScopedMap< std::string, TypeDecl::Data > TyVarMap; 31 30 32 /// Replaces a TypeInstType by its referrent in the environment, if applicable 31 33 Type* replaceTypeInst( Type* type, const TypeSubstitution* env ); … … 41 43 /// returns dynamic-layout type if is dynamic-layout type in tyVars, NULL otherwise; will look up substitution in env if provided 42 44 ReferenceToType *isDynType( Type *type, const TyVarMap &tyVars, const TypeSubstitution *env = 0 ); 45 const ast::BaseInstType *isDynType( const ast::Type *type, const TyVarMap &tyVars, const ast::TypeSubstitution *typeSubs = 0 ); 43 46 44 47 /// true iff function has dynamic-layout return type under the given type variable map … … 83 86 /// true iff types are structurally identical, where TypeInstType's match any type. 84 87 bool typesPolyCompatible( Type *aty, Type *bty ); 88 bool typesPolyCompatible( ast::Type const * lhs, ast::Type const * rhs ); 85 89 86 90 /// true if arg requires boxing given exprTyVars -
src/GenPoly/InstantiateGeneric.h
rebf8ca5 r23a08aa0 19 19 20 20 class Declaration; 21 namespace ast { 22 class TranslationUnit; 23 } 21 24 22 25 namespace GenPoly { 23 /// Replaces all generic types that have static layout with concrete instantiations. 24 /// Types with concrete values for otype parameters will be template-expanded, while 25 /// dtype and ftype parameters will be replaced by the appropriate void type. 26 void instantiateGeneric( std::list< Declaration* > &translationUnit ); 26 /// Replaces all generic types that have static layout with concrete 27 /// instantiations. Types with concrete values for otype parameters will be 28 /// template-expanded, while dtype and ftype parameters will be replaced by 29 /// the appropriate void type. 30 void instantiateGeneric( std::list< Declaration* > &translationUnit ); 31 void instantiateGeneric( ast::TranslationUnit & translationUnit ); 27 32 } // namespace GenPoly 28 33 -
src/GenPoly/Lvalue2.cc
rebf8ca5 r23a08aa0 23 23 } 24 24 25 26 25 } -
src/GenPoly/ScrubTyVars.cc
rebf8ca5 r23a08aa0 9 9 // Author : Richard C. Bilson 10 10 // Created On : Mon May 18 07:44:20 2015 11 // Last Modified By : Peter A. Buhr12 // Last Modified On : Thu Mar 16 15:44:27 201713 // Update Count : 311 // Last Modified By : Andrew Beach 12 // Last Modified On : Fri Aug 19 16:10:00 2022 13 // Update Count : 4 14 14 // 15 15 16 16 #include <utility> // for pair 17 17 18 #include "AST/Pass.hpp" 18 19 #include "GenPoly.h" // for mangleType, TyVarMap, alignof... 19 20 #include "GenPoly/ErasableScopedMap.h" // for ErasableScopedMap<>::const_it... 20 21 #include "ScrubTyVars.h" 22 #include "SymTab/Mangler.h" // for mangle, typeMode 21 23 #include "SynTree/Declaration.h" // for TypeDecl, TypeDecl::Data, Typ... 22 24 #include "SynTree/Expression.h" // for Expression (ptr only), NameExpr … … 112 114 return pointer; 113 115 } 116 117 namespace { 118 119 enum class ScrubMode { 120 FromMap, 121 DynamicFromMap, 122 All, 123 }; 124 125 struct ScrubTypeVars : 126 public ast::WithGuards, 127 public ast::WithShortCircuiting, 128 public ast::WithVisitorRef<ScrubTypeVars> { 129 130 ScrubTypeVars( ScrubMode m, TyVarMap const * tv ) : 131 mode ( m ), typeVars( tv ) {} 132 133 void previsit( ast::TypeInstType const * ) { visit_children = false; } 134 void previsit( ast::StructInstType const * ) { visit_children = false; } 135 void previsit( ast::UnionInstType const * ) { visit_children = false; } 136 void previsit( ast::SizeofExpr const * expr ) { primeBaseScrub( expr->type ); } 137 void previsit( ast::AlignofExpr const * expr ) { primeBaseScrub( expr->type ); } 138 void previsit( ast::PointerType const * type ) { primeBaseScrub( type->base ); } 139 140 ast::Type const * postvisit( ast::TypeInstType const * type ); 141 ast::Type const * postvisit( ast::StructInstType const * type ); 142 ast::Type const * postvisit( ast::UnionInstType const * type ); 143 ast::Expr const * postvisit( ast::SizeofExpr const * expr ); 144 ast::Expr const * postvisit( ast::AlignofExpr const * expr ); 145 ast::Type const * postvisit( ast::PointerType const * type ); 146 147 private: 148 ScrubMode const mode; 149 /// Type varriables to scrub. 150 TyVarMap const * const typeVars; 151 /// Value cached by primeBaseScrub. 152 ast::Type const * dynType = nullptr; 153 154 /// Returns the type if it should be scrubbed, nullptr otherwise. 155 ast::Type const * shouldScrub( ast::Type const * type ) { 156 switch ( mode ) { 157 case ScrubMode::FromMap: 158 return isPolyType( type, *typeVars ); 159 case ScrubMode::DynamicFromMap: 160 return isDynType( type, *typeVars ); 161 case ScrubMode::All: 162 return isPolyType( type ); 163 default: 164 assertf( false, "Invalid ScrubMode in shouldScrub." ); 165 throw; 166 } 167 } 168 169 void primeBaseScrub( ast::Type const * type ) { 170 // Need to determine whether type needs to be scrubbed to 171 // determine whether automatic recursion is necessary. 172 if ( ast::Type const * t = shouldScrub( type ) ) { 173 visit_children = false; 174 GuardValue( dynType ) = t; 175 } 176 } 177 178 ast::Type const * postvisitAggregateType( 179 ast::BaseInstType const * type ) { 180 if ( !shouldScrub( type ) ) return type; 181 return new ast::PointerType( new ast::VoidType( type->qualifiers ) ); 182 } 183 }; 184 185 ast::Type const * ScrubTypeVars::postvisit( ast::TypeInstType const * type ) { 186 // This implies that mode == ScrubMode::All. 
187 if ( !typeVars ) { 188 if ( ast::TypeDecl::Ftype == type->kind ) { 189 return new ast::PointerType( 190 new ast::FunctionType( ast::FixedArgs ) ); 191 } else { 192 return new ast::PointerType( 193 new ast::VoidType( type->qualifiers ) ); 194 } 195 } 196 197 auto typeVar = typeVars->find( type->name ); 198 if ( typeVar == typeVars->end() ) { 199 return type; 200 } 201 202 switch ( typeVar->second.kind ) { 203 case ast::TypeDecl::Dtype: 204 case ast::TypeDecl::Ttype: 205 return new ast::PointerType( 206 new ast::VoidType( type->qualifiers ) ); 207 case ast::TypeDecl::Ftype: 208 return new ast::PointerType( 209 new ast::FunctionType( ast::VariableArgs ) ); 210 default: 211 assertf( false, 212 "Unhandled type variable kind: %d", typeVar->second.kind ); 213 throw; // Just in case the assert is removed, stop here. 214 } 215 } 216 217 ast::Type const * ScrubTypeVars::postvisit( ast::StructInstType const * type ) { 218 return postvisitAggregateType( type ); 219 } 220 221 ast::Type const * ScrubTypeVars::postvisit( ast::UnionInstType const * type ) { 222 return postvisitAggregateType( type ); 223 } 224 225 ast::Expr const * ScrubTypeVars::postvisit( ast::SizeofExpr const * expr ) { 226 // sizeof( T ) becomes the _sizeof_T parameter. 227 if ( dynType ) { 228 return new ast::NameExpr( expr->location, 229 sizeofName( Mangle::mangle( dynType, Mangle::typeMode() ) ) ); 230 } else { 231 return expr; 232 } 233 } 234 235 ast::Expr const * ScrubTypeVars::postvisit( ast::AlignofExpr const * expr ) { 236 // alignof( T ) becomes the _alignof_T parameter. 237 if ( dynType ) { 238 return new ast::NameExpr( expr->location, 239 alignofName( Mangle::mangle( dynType, Mangle::typeMode() ) ) ); 240 } else { 241 return expr; 242 } 243 } 244 245 ast::Type const * ScrubTypeVars::postvisit( ast::PointerType const * type ) { 246 if ( dynType ) { 247 ast::Type * ret = ast::mutate( dynType->accept( *visitor ) ); 248 ret->qualifiers |= type->qualifiers; 249 return ret; 250 } else { 251 return type; 252 } 253 } 254 255 const ast::Node * scrubTypeVarsBase( 256 const ast::Node * target, 257 ScrubMode mode, const TyVarMap * typeVars ) { 258 if ( ScrubMode::All == mode ) { 259 assert( nullptr == typeVars ); 260 } else { 261 assert( nullptr != typeVars ); 262 } 263 ast::Pass<ScrubTypeVars> visitor( mode, typeVars ); 264 return target->accept( visitor ); 265 } 266 267 } // namespace 268 269 template<> 270 ast::Node const * scrubAllTypeVars<ast::Node>( const ast::Node * target ) { 271 return scrubTypeVarsBase( target, ScrubMode::All, nullptr ); 272 } 273 114 274 } // namespace GenPoly 115 275 -
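The effect of the new-AST ScrubTypeVars pass is easiest to picture at the generated-C level: uses of a polymorphic T collapse to void pointers, and sizeof( T ) / alignof( T ) become references to the mangled _sizeof_... / _alignof_... parameters that the box pass threads through the call. A hand-written approximation of that shape, using illustrative parameter names rather than the compiler's mangled ones:

    #include <string.h>

    // Roughly what a polymorphic `forall(T) void copy( T * dst, T * src )`
    // lowers to once its type variable has been scrubbed:
    void copy_poly( size_t _sizeof_T, void * dst, const void * src ) {
        memcpy( dst, src, _sizeof_T );   // sizeof( T ) became a parameter
    }

    int main() {
        int a = 7, b = 0;
        copy_poly( sizeof(int), &b, &a );   // the caller supplies T's layout
        return b == 7 ? 0 : 1;
    }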
src/GenPoly/ScrubTyVars.h
rebf8ca5 r23a08aa0 9 9 // Author : Richard C. Bilson 10 10 // Created On : Mon May 18 07:44:20 2015 11 // Last Modified By : Peter A. Buhr12 // Last Modified On : Sat Jul 22 09:21:47 201713 // Update Count : 211 // Last Modified By : Andrew Beach 12 // Last Modified On : Fri Aug 19 14:14:00 2022 13 // Update Count : 3 14 14 // 15 15 … … 18 18 #include <cassert> // for assert 19 19 20 #include "AST/Fwd.hpp" // for Node 20 21 #include "Common/PassVisitor.h" 21 22 #include "GenPoly.h" // for TyVarMap, isPolyType, isDynType … … 108 109 } 109 110 111 /// For all polymorphic types, replaces generic types, with the appropriate 112 /// void type, and sizeof/alignof expressions with the proper variable. 113 template<typename node_t> 114 node_t const * scrubAllTypeVars( node_t const * target ) { 115 return strict_dynamic_cast<node_t const *>( scrubAllTypeVars<ast::Node>( target ) ); 116 } 117 118 template<> 119 ast::Node const * scrubAllTypeVars<ast::Node>( const ast::Node * target ); 120 110 121 } // namespace GenPoly 111 122 -
src/GenPoly/SpecializeNew.cpp
rebf8ca5 r23a08aa0 240 240 } 241 241 242 namespace { 243 struct TypeInstFixer : public ast::WithShortCircuiting { 244 std::map<const ast::TypeDecl *, std::pair<int, int>> typeMap; 245 246 void previsit(const ast::TypeDecl *) { visit_children = false; } 247 const ast::TypeInstType * postvisit(const ast::TypeInstType * typeInst) { 248 if (typeMap.count(typeInst->base)) { 249 ast::TypeInstType * newInst = mutate(typeInst); 250 auto const & pair = typeMap[typeInst->base]; 251 newInst->expr_id = pair.first; 252 newInst->formal_usage = pair.second; 253 return newInst; 254 } 255 return typeInst; 256 } 257 }; 258 } 242 struct TypeInstFixer final : public ast::WithShortCircuiting { 243 std::map<const ast::TypeDecl *, std::pair<int, int>> typeMap; 244 245 void previsit(const ast::TypeDecl *) { visit_children = false; } 246 const ast::TypeInstType * postvisit(const ast::TypeInstType * typeInst) { 247 if (typeMap.count(typeInst->base)) { 248 ast::TypeInstType * newInst = mutate(typeInst); 249 auto const & pair = typeMap[typeInst->base]; 250 newInst->expr_id = pair.first; 251 newInst->formal_usage = pair.second; 252 return newInst; 253 } 254 return typeInst; 255 } 256 }; 259 257 260 258 const ast::Expr * SpecializeCore::createThunkFunction( -
src/GenPoly/module.mk
rebf8ca5 r23a08aa0 27 27 GenPoly/FindFunction.cc \ 28 28 GenPoly/FindFunction.h \ 29 GenPoly/InstantiateGenericNew.cpp \ 29 30 GenPoly/InstantiateGeneric.cc \ 30 31 GenPoly/InstantiateGeneric.h \ -
src/InitTweak/InitTweak.cc
rebf8ca5 r23a08aa0 1241 1241 static const char * const tlsd_section = ".tdata" ASM_COMMENT; 1242 1242 void addDataSectionAttribute( ObjectDecl * objDecl ) { 1243 const bool is_tls = objDecl->get_storageClasses().is_threadlocal ;1243 const bool is_tls = objDecl->get_storageClasses().is_threadlocal_any(); 1244 1244 const char * section = is_tls ? tlsd_section : data_section; 1245 1245 objDecl->attributes.push_back(new Attribute("section", { … … 1249 1249 1250 1250 void addDataSectionAttribute( ast::ObjectDecl * objDecl ) { 1251 const bool is_tls = objDecl->storage.is_threadlocal ;1251 const bool is_tls = objDecl->storage.is_threadlocal_any(); 1252 1252 const char * section = is_tls ? tlsd_section : data_section; 1253 1253 objDecl->attributes.push_back(new ast::Attribute("section", { -
src/Makefile.am
rebf8ca5 r23a08aa0 71 71 EXTRA_DIST = include/cassert include/optional BasicTypes-gen.cc 72 72 73 AM_CXXFLAGS = @HOST_FLAGS@ -Wno-deprecated -Wall -Wextra -Werror=return-type -DDEBUG_ALL -I./Parser -I$(srcdir)/Parser -I$(srcdir)/include -DYY_NO_INPUT -O3 -g -std=c++1 4$(TCMALLOCFLAG)73 AM_CXXFLAGS = @HOST_FLAGS@ -Wno-deprecated -Wall -Wextra -Werror=return-type -DDEBUG_ALL -I./Parser -I$(srcdir)/Parser -I$(srcdir)/include -DYY_NO_INPUT -O3 -g -std=c++17 $(TCMALLOCFLAG) 74 74 AM_LDFLAGS = @HOST_FLAGS@ -Xlinker -export-dynamic 75 75 ARFLAGS = cr -
src/Parser/DeclarationNode.cc
rebf8ca5 r23a08aa0 262 262 newnode->type->enumeration.anon = name == nullptr; 263 263 if ( base && base->type) { 264 newnode->type->base = base->type; 264 newnode->type->base = base->type; 265 265 } // if 266 266 … … 505 505 } // for 506 506 // src is the new item being added and has a single bit 507 } else if ( ! src->storageClasses.is_threadlocal ) { // conflict ?507 } else if ( ! src->storageClasses.is_threadlocal_any() ) { // conflict ? 508 508 appendError( error, string( "conflicting " ) + Type::StorageClassesNames[storageClasses.ffs()] + 509 509 " & " + Type::StorageClassesNames[src->storageClasses.ffs()] ); -
src/Parser/lex.ll
rebf8ca5 r23a08aa0 10 10 * Created On : Sat Sep 22 08:58:10 2001 11 11 * Last Modified By : Peter A. Buhr 12 * Last Modified On : Sun Jun 20 18:41:09 202113 * Update Count : 7 5912 * Last Modified On : Tue Aug 30 18:39:54 2022 13 * Update Count : 760 14 14 */ 15 15 … … 314 314 switch { KEYWORD_RETURN(SWITCH); } 315 315 thread { KEYWORD_RETURN(THREAD); } // C11 316 _Thread_local { KEYWORD_RETURN(THREADLOCAL); } // C11 316 __thread { KEYWORD_RETURN(THREADLOCALGCC); } // GCC 317 _Thread_local { KEYWORD_RETURN(THREADLOCALC11); } // C11 317 318 throw { KEYWORD_RETURN(THROW); } // CFA 318 319 throwResume { KEYWORD_RETURN(THROWRESUME); } // CFA -
src/Parser/parser.yy
rebf8ca5 r23a08aa0 58 58 59 59 // lex uses __null in a boolean context, it's fine. 60 //#pragma GCC diagnostic ignored "-Wparentheses-equality" 60 #pragma GCC diagnostic ignored "-Wpragmas" 61 #pragma GCC diagnostic ignored "-Wparentheses-equality" 62 #pragma GCC diagnostic warning "-Wpragmas" 61 63 62 64 extern DeclarationNode * parseTree; … … 293 295 %token TYPEDEF 294 296 %token EXTERN STATIC AUTO REGISTER 295 %token THREADLOCAL //C11297 %token THREADLOCALGCC THREADLOCALC11 // GCC, C11 296 298 %token INLINE FORTRAN // C99, extension ISO/IEC 9899:1999 Section J.5.9(1) 297 299 %token NORETURN // C11 … … 1345 1347 { 1346 1348 if ( $2 == OperKinds::LThan || $2 == OperKinds::LEThan ) { SemanticError( yylloc, MISSING_ANON_FIELD ); $$ = nullptr; } 1347 else { SemanticError( yylloc, MISSING_HIGH ); $$ = nullptr; } 1349 else { SemanticError( yylloc, MISSING_HIGH ); $$ = nullptr; } 1348 1350 } 1349 1351 | comma_expression updowneq comma_expression '~' comma_expression // CFA, anonymous loop-index … … 1357 1359 { 1358 1360 if ( $2 == OperKinds::LThan || $2 == OperKinds::LEThan ) { SemanticError( yylloc, MISSING_ANON_FIELD ); $$ = nullptr; } 1359 else { SemanticError( yylloc, MISSING_HIGH ); $$ = nullptr; } 1361 else { SemanticError( yylloc, MISSING_HIGH ); $$ = nullptr; } 1360 1362 } 1361 1363 | comma_expression updowneq comma_expression '~' '@' // CFA, error … … 2082 2084 | REGISTER 2083 2085 { $$ = DeclarationNode::newStorageClass( Type::Register ); } 2084 | THREADLOCAL // C11 2085 { $$ = DeclarationNode::newStorageClass( Type::Threadlocal ); } 2086 | THREADLOCALGCC // GCC 2087 { $$ = DeclarationNode::newStorageClass( Type::ThreadlocalGcc ); } 2088 | THREADLOCALC11 // C11 2089 { $$ = DeclarationNode::newStorageClass( Type::ThreadlocalC11 ); } 2086 2090 // Put function specifiers here to simplify parsing rules, but separate them semantically. 2087 2091 | INLINE // C99 -
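With the lexer and grammar now telling the two spellings apart, __thread and _Thread_local land in the separate ThreadlocalGcc / ThreadlocalC11 storage-class bits (see the StorageClasses changes above) instead of one shared flag, presumably so later passes and the code generator can reproduce whichever spelling the source used. The surface forms in question, for reference:

    // GNU spelling: accepted by gcc and g++, now tokenized as THREADLOCALGCC.
    __thread int per_thread_counter;

    // C11 spelling, now tokenized as THREADLOCALC11:
    //     _Thread_local int per_thread_counter;
    // (left in a comment so this snippet stays valid C++, where the keyword
    // is spelled `thread_local` instead)

    int main() { return per_thread_counter; }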
src/ResolvExpr/CandidateFinder.cpp
rebf8ca5 r23a08aa0 269 269 unsigned nextArg, unsigned tupleStart = 0, Cost cost = Cost::zero, 270 270 unsigned nextExpl = 0, unsigned explAlt = 0 ) 271 : parent(parent), expr( expr ), cost( cost ), env( move( env ) ), need(move( need ) ),272 have( move( have ) ), open(move( open ) ), nextArg( nextArg ), tupleStart( tupleStart ),271 : parent(parent), expr( expr ), cost( cost ), env( std::move( env ) ), need( std::move( need ) ), 272 have( std::move( have ) ), open( std::move( open ) ), nextArg( nextArg ), tupleStart( tupleStart ), 273 273 nextExpl( nextExpl ), explAlt( explAlt ) {} 274 274 … … 276 276 const ArgPack & o, ast::TypeEnvironment && env, ast::AssertionSet && need, 277 277 ast::AssertionSet && have, ast::OpenVarSet && open, unsigned nextArg, Cost added ) 278 : parent( o.parent ), expr( o.expr ), cost( o.cost + added ), env( move( env ) ),279 need( move( need ) ), have( move( have ) ), open(move( open ) ), nextArg( nextArg ),278 : parent( o.parent ), expr( o.expr ), cost( o.cost + added ), env( std::move( env ) ), 279 need( std::move( need ) ), have( std::move( have ) ), open( std::move( open ) ), nextArg( nextArg ), 280 280 tupleStart( o.tupleStart ), nextExpl( 0 ), explAlt( 0 ) {} 281 281 … … 301 301 // reset pack to appropriate tuple 302 302 std::vector< ast::ptr< ast::Expr > > exprv( exprs.begin(), exprs.end() ); 303 expr = new ast::TupleExpr{ expr->location, move( exprv ) };303 expr = new ast::TupleExpr{ expr->location, std::move( exprv ) }; 304 304 tupleStart = pack->tupleStart - 1; 305 305 parent = pack->parent; … … 404 404 newResult.open, symtab ) 405 405 ) { 406 finalResults.emplace_back( move( newResult ) );406 finalResults.emplace_back( std::move( newResult ) ); 407 407 } 408 408 … … 423 423 if ( expl.exprs.empty() ) { 424 424 results.emplace_back( 425 results[i], move( env ), copy( results[i].need ),426 copy( results[i].have ), move( open ), nextArg + 1, expl.cost );425 results[i], std::move( env ), copy( results[i].need ), 426 copy( results[i].have ), std::move( open ), nextArg + 1, expl.cost ); 427 427 428 428 continue; … … 431 431 // add new result 432 432 results.emplace_back( 433 i, expl.exprs.front(), move( env ), copy( results[i].need ),434 copy( results[i].have ), move( open ), nextArg + 1, nTuples,433 i, expl.exprs.front(), std::move( env ), copy( results[i].need ), 434 copy( results[i].have ), std::move( open ), nextArg + 1, nTuples, 435 435 expl.cost, expl.exprs.size() == 1 ? 0 : 1, j ); 436 436 } … … 444 444 // splice final results onto results 445 445 for ( std::size_t i = 0; i < finalResults.size(); ++i ) { 446 results.emplace_back( move( finalResults[i] ) );446 results.emplace_back( std::move( finalResults[i] ) ); 447 447 } 448 448 return ! 
finalResults.empty(); … … 478 478 479 479 results.emplace_back( 480 i, expr, move( env ), move( need ), move( have ),move( open ), nextArg,480 i, expr, std::move( env ), std::move( need ), std::move( have ), std::move( open ), nextArg, 481 481 nTuples, Cost::zero, nextExpl, results[i].explAlt ); 482 482 } … … 494 494 if ( unify( paramType, cnst->result, env, need, have, open, symtab ) ) { 495 495 results.emplace_back( 496 i, new ast::DefaultArgExpr{ cnst->location, cnst }, move( env ),497 move( need ), move( have ),move( open ), nextArg, nTuples );496 i, new ast::DefaultArgExpr{ cnst->location, cnst }, std::move( env ), 497 std::move( need ), std::move( have ), std::move( open ), nextArg, nTuples ); 498 498 } 499 499 } … … 516 516 if ( expl.exprs.empty() ) { 517 517 results.emplace_back( 518 results[i], move( env ), move( need ), move( have ),move( open ),518 results[i], std::move( env ), std::move( need ), std::move( have ), std::move( open ), 519 519 nextArg + 1, expl.cost ); 520 520 … … 538 538 // add new result 539 539 results.emplace_back( 540 i, expr, move( env ), move( need ), move( have ),move( open ),540 i, expr, std::move( env ), std::move( need ), std::move( have ), std::move( open ), 541 541 nextArg + 1, nTuples, expl.cost, expl.exprs.size() == 1 ? 0 : 1, j ); 542 542 } … … 576 576 restructureCast( idx, toType->getComponent( i ), isGenerated ) ); 577 577 } 578 return new ast::TupleExpr{ arg->location, move( components ) };578 return new ast::TupleExpr{ arg->location, std::move( components ) }; 579 579 } else { 580 580 // handle normally … … 672 672 } 673 673 std::vector< ast::ptr< ast::Expr > > vargs( args.begin(), args.end() ); 674 appExpr->args = move( vargs );674 appExpr->args = std::move( vargs ); 675 675 // build and validate new candidate 676 676 auto newCand = … … 783 783 if ( expl.exprs.empty() ) { 784 784 results.emplace_back( 785 results[i], move( env ), copy( results[i].need ),786 copy( results[i].have ), move( open ), nextArg + 1,785 results[i], std::move( env ), copy( results[i].need ), 786 copy( results[i].have ), std::move( open ), nextArg + 1, 787 787 expl.cost ); 788 788 … … 792 792 // add new result 793 793 results.emplace_back( 794 i, expl.exprs.front(), move( env ), copy( results[i].need ),795 copy( results[i].have ), move( open ), nextArg + 1, 0, expl.cost,794 i, expl.exprs.front(), std::move( env ), copy( results[i].need ), 795 copy( results[i].have ), std::move( open ), nextArg + 1, 0, expl.cost, 796 796 expl.exprs.size() == 1 ? 
0 : 1, j ); 797 797 } … … 843 843 // as a member expression 844 844 addAnonConversions( newCand ); 845 candidates.emplace_back( move( newCand ) );845 candidates.emplace_back( std::move( newCand ) ); 846 846 } 847 847 } … … 901 901 const ast::EnumDecl * enumDecl = enumInst->base; 902 902 if ( const ast::Type* enumType = enumDecl->base ) { 903 // instance of enum (T) is a instance of type (T) 903 // instance of enum (T) is a instance of type (T) 904 904 funcFinder.otypeKeys.insert(Mangle::mangle(enumType, Mangle::NoGenericParams | Mangle::Type)); 905 905 } else { … … 907 907 funcFinder.otypeKeys.insert(Mangle::mangle(enumDecl, Mangle::NoGenericParams | Mangle::Type)); 908 908 } 909 } 909 } 910 910 else funcFinder.otypeKeys.insert(Mangle::mangle(argType, Mangle::NoGenericParams | Mangle::Type)); 911 911 } … … 986 986 funcE.emplace_back( *func, symtab ); 987 987 } 988 argExpansions.emplace_front( move( funcE ) );988 argExpansions.emplace_front( std::move( funcE ) ); 989 989 990 990 for ( const CandidateRef & op : opFinder ) { … … 1030 1030 if ( cvtCost != Cost::infinity ) { 1031 1031 withFunc->cvtCost = cvtCost; 1032 candidates.emplace_back( move( withFunc ) );1033 } 1034 } 1035 found = move( candidates );1032 candidates.emplace_back( std::move( withFunc ) ); 1033 } 1034 } 1035 found = std::move( candidates ); 1036 1036 1037 1037 // use a new list so that candidates are not examined by addAnonConversions twice … … 1131 1131 CandidateRef newCand = std::make_shared<Candidate>( 1132 1132 restructureCast( cand->expr, toType, castExpr->isGenerated ), 1133 copy( cand->env ), move( open ),move( need ), cand->cost,1133 copy( cand->env ), std::move( open ), std::move( need ), cand->cost, 1134 1134 cand->cost + thisCost ); 1135 1135 inferParameters( newCand, matches ); … … 1285 1285 // as a name expression 1286 1286 addAnonConversions( newCand ); 1287 candidates.emplace_back( move( newCand ) );1287 candidates.emplace_back( std::move( newCand ) ); 1288 1288 } 1289 1289 } … … 1394 1394 new ast::LogicalExpr{ 1395 1395 logicalExpr->location, r1->expr, r2->expr, logicalExpr->isAnd }, 1396 move( env ), move( open ),move( need ), r1->cost + r2->cost );1396 std::move( env ), std::move( open ), std::move( need ), r1->cost + r2->cost ); 1397 1397 } 1398 1398 } … … 1452 1452 // output candidate 1453 1453 CandidateRef newCand = std::make_shared<Candidate>( 1454 newExpr, move( env ), move( open ),move( need ), cost );1454 newExpr, std::move( env ), std::move( open ), std::move( need ), cost ); 1455 1455 inferParameters( newCand, candidates ); 1456 1456 } … … 1519 1519 // add candidate 1520 1520 CandidateRef newCand = std::make_shared<Candidate>( 1521 newExpr, move( env ), move( open ),move( need ),1521 newExpr, std::move( env ), std::move( open ), std::move( need ), 1522 1522 r1->cost + r2->cost ); 1523 1523 inferParameters( newCand, candidates ); … … 1548 1548 1549 1549 addCandidate( 1550 new ast::TupleExpr{ tupleExpr->location, move( exprs ) },1551 move( env ), move( open ),move( need ), sumCost( subs ) );1550 new ast::TupleExpr{ tupleExpr->location, std::move( exprs ) }, 1551 std::move( env ), std::move( open ), std::move( need ), sumCost( subs ) ); 1552 1552 } 1553 1553 } … … 1635 1635 initExpr->location, restructureCast( cand->expr, toType ), 1636 1636 initAlt.designation }, 1637 move(env), move( open ),move( need ), cand->cost, thisCost );1637 std::move(env), std::move( open ), std::move( need ), cand->cost, thisCost ); 1638 1638 inferParameters( newCand, matches ); 1639 1639 } … … 1768 1768 
cand->env.applyFree( newResult ); 1769 1769 cand->expr = ast::mutate_field( 1770 cand->expr.get(), &ast::Expr::result, move( newResult ) );1770 cand->expr.get(), &ast::Expr::result, std::move( newResult ) ); 1771 1771 1772 1772 out.emplace_back( cand ); … … 1854 1854 1855 1855 auto oldsize = candidates.size(); 1856 candidates = move( pruned );1856 candidates = std::move( pruned ); 1857 1857 1858 1858 PRINT( -
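The blanket switch from unqualified move(...) to std::move(...) in this file (and the matching std::forward fix in Print.cpp above) removes a dependence on unqualified lookup: once another function named move is visible, through a using-directive or argument-dependent lookup, the unqualified call can silently bind to it instead of the standard one. A small self-contained illustration of the hazard; the util::move helper here is invented for the example:

    #include <iostream>
    #include <string>
    #include <utility>

    namespace util {
        struct Tracked { std::string name; };
        // An unrelated helper that happens to be called `move`.
        void move( Tracked & t ) { t.name += " (touched by util::move)"; }
    }

    int main() {
        util::Tracked t{ "node" };
        // Unqualified: argument-dependent lookup finds util::move, so no move
        // happens at all; the unrelated helper runs instead.
        move( t );
        // Qualified: always the standard library's move.
        util::Tracked u = std::move( t );
        std::cout << u.name << '\n';         // node (touched by util::move)
    }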
src/SynTree/Statement.cc
rebf8ca5 r23a08aa0 105 105 }; 106 106 107 BranchStmt::BranchStmt( Label target, Type type ) throw ( SemanticErrorException ):107 BranchStmt::BranchStmt( Label target, Type type ) : 108 108 Statement(), originalTarget( target ), target( target ), computedTarget( nullptr ), type( type ) { 109 109 //actually this is a syntactic error signaled by the parser … … 113 113 } 114 114 115 BranchStmt::BranchStmt( Expression * computedTarget, Type type ) throw ( SemanticErrorException ):115 BranchStmt::BranchStmt( Expression * computedTarget, Type type ) : 116 116 Statement(), computedTarget( computedTarget ), type( type ) { 117 117 if ( type != BranchStmt::Goto || computedTarget == nullptr ) { … … 211 211 } 212 212 213 CaseStmt::CaseStmt( Expression * condition, const list<Statement *> & statements, bool deflt ) throw ( SemanticErrorException ):213 CaseStmt::CaseStmt( Expression * condition, const list<Statement *> & statements, bool deflt ) : 214 214 Statement(), condition( condition ), stmts( statements ), _isDefault( deflt ) { 215 215 if ( isDefault() && condition != nullptr ) SemanticError( condition, "default case with condition: " ); … … 575 575 } 576 576 577 MutexStmt::MutexStmt( Statement * stmt, const list<Expression *> mutexObjs ) 577 MutexStmt::MutexStmt( Statement * stmt, const list<Expression *> mutexObjs ) 578 578 : Statement(), stmt( stmt ), mutexObjs( mutexObjs ) { } 579 579 -
src/SynTree/Statement.h
rebf8ca5 r23a08aa0 200 200 std::list<Statement *> stmts; 201 201 202 CaseStmt( Expression * conditions, const std::list<Statement *> & stmts, bool isdef = false ) throw (SemanticErrorException);202 CaseStmt( Expression * conditions, const std::list<Statement *> & stmts, bool isdef = false ); 203 203 CaseStmt( const CaseStmt & other ); 204 204 virtual ~CaseStmt(); … … 289 289 Type type; 290 290 291 BranchStmt( Label target, Type ) throw (SemanticErrorException);292 BranchStmt( Expression * computedTarget, Type ) throw (SemanticErrorException);291 BranchStmt( Label target, Type ); 292 BranchStmt( Expression * computedTarget, Type ); 293 293 294 294 Label get_originalTarget() { return originalTarget; } -
src/SynTree/Type.cc
rebf8ca5 r23a08aa0 80 80 // These must remain in the same order as the corresponding bit fields. 81 81 const char * Type::FuncSpecifiersNames[] = { "inline", "_Noreturn", "fortran" }; 82 const char * Type::StorageClassesNames[] = { "extern", "static", "auto", "register", "_ Thread_local" };82 const char * Type::StorageClassesNames[] = { "extern", "static", "auto", "register", "__thread", "_Thread_local" }; 83 83 const char * Type::QualifiersNames[] = { "const", "restrict", "volatile", "mutex", "_Atomic" }; 84 84 -
src/SynTree/Type.h
rebf8ca5 r23a08aa0 84 84 }; // FuncSpecifiers 85 85 86 enum { Extern = 1 << 0, Static = 1 << 1, Auto = 1 << 2, Register = 1 << 3, Threadlocal = 1 << 4, NumStorageClass = 5};86 enum { Extern = 1 << 0, Static = 1 << 1, Auto = 1 << 2, Register = 1 << 3, ThreadlocalGcc = 1 << 4, ThreadlocalC11 = 1 << 5, NumStorageClass = 6 }; 87 87 static const char * StorageClassesNames[]; 88 88 union StorageClasses { … … 93 93 bool is_auto : 1; 94 94 bool is_register : 1; 95 bool is_threadlocal : 1; 95 bool is_threadlocalGcc : 1; 96 bool is_threadlocalC11 : 1; 96 97 }; 97 98 … … 100 101 // equality (==, !=) works implicitly on first field "val", relational operations are undefined. 101 102 BFCommon( StorageClasses, NumStorageClass ) 103 104 bool is_threadlocal_any() { return this->is_threadlocalC11 || this->is_threadlocalGcc; } 102 105 }; // StorageClasses 103 106 -
src/Tuples/TupleExpansionNew.cpp
rebf8ca5 r23a08aa0 101 101 102 102 /// Replaces Tuple Assign & Index Expressions, and Tuple Types. 103 struct TupleMainExpander :103 struct TupleMainExpander final : 104 104 public ast::WithGuards, 105 105 public ast::WithVisitorRef<TupleMainExpander>, … … 254 254 } 255 255 256 struct TupleExprExpander {256 struct TupleExprExpander final { 257 257 ast::Expr const * postvisit( ast::TupleExpr const * expr ) { 258 258 return replaceTupleExpr( expr->location, -
src/Virtual/ExpandCasts.cc
rebf8ca5 r23a08aa0 317 317 }; 318 318 319 struct ExpandCastsCore {319 struct ExpandCastsCore final { 320 320 void previsit( ast::FunctionDecl const * decl ); 321 321 void previsit( ast::StructDecl const * decl ); … … 362 362 } 363 363 364 /// Copy newType, but give the copy the params of the oldType. 364 365 ast::StructInstType * polyCopy( 365 366 ast::StructInstType const * oldType, -
src/config.h.in
rebf8ca5 r23a08aa0 27 27 /* Location of cfa install. */ 28 28 #undef CFA_PREFIX 29 30 /* Sets whether or not to use the new-ast, this is adefault value and can be31 overrided by --old-ast and --new-ast */32 #undef CFA_USE_NEW_AST33 29 34 30 /* Major.Minor */ -
src/main.cc
rebf8ca5 r23a08aa0 10 10 // Created On : Fri May 15 23:12:02 2015 11 11 // Last Modified By : Andrew Beach 12 // Last Modified On : Thu 11 12:18:00 202213 // Update Count : 67 712 // Last Modified On : Thu Sep 15 13:58:00 2022 13 // Update Count : 678 14 14 // 15 15 … … 38 38 #include "CodeGen/Generate.h" // for generate 39 39 #include "CodeGen/LinkOnce.h" // for translateLinkOnce 40 #include "CodeTools/DeclStats.h" // for printDeclStats41 #include "CodeTools/ResolvProtoDump.h" // for dumpAsResolvProto42 40 #include "CodeTools/TrackLoc.h" // for fillLocations 43 41 #include "Common/CodeLocationTools.hpp" // for forceFillCodeLocations … … 45 43 #include "Common/DeclStats.hpp" // for printDeclStats 46 44 #include "Common/ResolvProtoDump.hpp" // for dumpAsResolverProto 47 #include "Common/Stats.h" 48 #include "Common/PassVisitor.h" 49 #include "Common/SemanticError.h" // for SemanticError 45 #include "Common/Stats.h" // for Stats 50 46 #include "Common/UnimplementedError.h" // for UnimplementedError 51 47 #include "Common/utility.h" // for deleteAll, filter, printAll … … 53 49 #include "Concurrency/Waitfor.h" // for generateWaitfor 54 50 #include "ControlStruct/ExceptDecl.h" // for translateExcept 55 #include "ControlStruct/ExceptTranslate.h" // for translate EHM51 #include "ControlStruct/ExceptTranslate.h" // for translateThrows, translat... 56 52 #include "ControlStruct/FixLabels.hpp" // for fixLabels 57 53 #include "ControlStruct/HoistControlDecls.hpp" // hoistControlDecls 58 #include "ControlStruct/Mutate.h" // for mutate59 54 #include "GenPoly/Box.h" // for box 60 55 #include "GenPoly/InstantiateGeneric.h" // for instantiateGeneric … … 66 61 #include "Parser/ParseNode.h" // for DeclarationNode, buildList 67 62 #include "Parser/TypedefTable.h" // for TypedefTable 68 #include "ResolvExpr/AlternativePrinter.h" // for AlternativePrinter69 63 #include "ResolvExpr/CandidatePrinter.hpp" // for printCandidates 70 64 #include "ResolvExpr/Resolver.h" // for resolve 71 #include "SymTab/Validate.h" // for validate72 #include "SymTab/ValidateType.h" // for linkReferenceToTypes73 65 #include "SynTree/LinkageSpec.h" // for Spec, Cforall, Intrinsic 74 66 #include "SynTree/Declaration.h" // for Declaration 75 #include "SynTree/Visitor.h" // for acceptAll76 67 #include "Tuples/Tuples.h" // for expandMemberTuples, expan... 77 68 #include "Validate/Autogen.hpp" // for autogenerateRoutines … … 330 321 Stats::Time::StopBlock(); 331 322 332 if( useNewAST ) { 333 if (Stats::Counters::enabled) { 334 ast::pass_visitor_stats.avg = Stats::Counters::build<Stats::Counters::AverageCounter<double>>("Average Depth - New"); 335 ast::pass_visitor_stats.max = Stats::Counters::build<Stats::Counters::MaxCounter<double>>("Max depth - New"); 336 } 337 auto transUnit = convert( move( translationUnit ) ); 338 339 forceFillCodeLocations( transUnit ); 340 341 PASS( "Translate Exception Declarations", ControlStruct::translateExcept( transUnit ) ); 342 if ( exdeclp ) { 343 dump( move( transUnit ) ); 344 return EXIT_SUCCESS; 345 } 346 347 PASS( "Verify Ctor, Dtor & Assign", Validate::verifyCtorDtorAssign( transUnit ) ); 348 PASS( "Hoist Type Decls", Validate::hoistTypeDecls( transUnit ) ); 349 // Hoist Type Decls pulls some declarations out of contexts where 350 // locations are not tracked. Perhaps they should be, but for now 351 // the full fill solves it. 
352 forceFillCodeLocations( transUnit ); 353 354 PASS( "Replace Typedefs", Validate::replaceTypedef( transUnit ) ); 355 PASS( "Fix Return Types", Validate::fixReturnTypes( transUnit ) ); 356 PASS( "Enum and Pointer Decay", Validate::decayEnumsAndPointers( transUnit ) ); 357 358 PASS( "Link Reference To Types", Validate::linkReferenceToTypes( transUnit ) ); 359 360 PASS( "Fix Qualified Types", Validate::fixQualifiedTypes( transUnit ) ); 361 PASS( "Hoist Struct", Validate::hoistStruct( transUnit ) ); 362 PASS( "Eliminate Typedef", Validate::eliminateTypedef( transUnit ) ); 363 PASS( "Validate Generic Parameters", Validate::fillGenericParameters( transUnit ) ); 364 PASS( "Translate Dimensions", Validate::translateDimensionParameters( transUnit ) ); 365 PASS( "Check Function Returns", Validate::checkReturnStatements( transUnit ) ); 366 PASS( "Fix Return Statements", InitTweak::fixReturnStatements( transUnit ) ); 367 PASS( "Implement Concurrent Keywords", Concurrency::implementKeywords( transUnit ) ); 368 PASS( "Forall Pointer Decay", Validate::decayForallPointers( transUnit ) ); 369 PASS( "Hoist Control Declarations", ControlStruct::hoistControlDecls( transUnit ) ); 370 371 PASS( "Generate Autogen Routines", Validate::autogenerateRoutines( transUnit ) ); 372 373 PASS( "Implement Mutex", Concurrency::implementMutex( transUnit ) ); 374 PASS( "Implement Thread Start", Concurrency::implementThreadStarter( transUnit ) ); 375 PASS( "Compound Literal", Validate::handleCompoundLiterals( transUnit ) ); 376 PASS( "Set Length From Initializer", Validate::setLengthFromInitializer( transUnit ) ); 377 PASS( "Find Global Decls", Validate::findGlobalDecls( transUnit ) ); 378 PASS( "Fix Label Address", Validate::fixLabelAddresses( transUnit ) ); 379 380 if ( symtabp ) { 381 return EXIT_SUCCESS; 382 } // if 383 384 if ( expraltp ) { 385 ResolvExpr::printCandidates( transUnit ); 386 return EXIT_SUCCESS; 387 } // if 388 389 if ( validp ) { 390 dump( move( transUnit ) ); 391 return EXIT_SUCCESS; 392 } // if 393 394 PASS( "Translate Throws", ControlStruct::translateThrows( transUnit ) ); 395 PASS( "Fix Labels", ControlStruct::fixLabels( transUnit ) ); 396 PASS( "Fix Names", CodeGen::fixNames( transUnit ) ); 397 PASS( "Gen Init", InitTweak::genInit( transUnit ) ); 398 PASS( "Expand Member Tuples" , Tuples::expandMemberTuples( transUnit ) ); 399 400 if ( libcfap ) { 401 // Generate the bodies of cfa library functions. 402 LibCfa::makeLibCfa( transUnit ); 403 } // if 404 405 if ( declstatsp ) { 406 printDeclStats( transUnit ); 407 return EXIT_SUCCESS; 408 } // if 409 410 if ( bresolvep ) { 411 dump( move( transUnit ) ); 412 return EXIT_SUCCESS; 413 } // if 414 415 if ( resolvprotop ) { 416 dumpAsResolverProto( transUnit ); 417 return EXIT_SUCCESS; 418 } // if 419 420 PASS( "Resolve", ResolvExpr::resolve( transUnit ) ); 421 if ( exprp ) { 422 dump( move( transUnit ) ); 423 return EXIT_SUCCESS; 424 } // if 425 426 forceFillCodeLocations( transUnit ); 427 428 PASS( "Fix Init", InitTweak::fix(transUnit, buildingLibrary())); 429 430 // fix ObjectDecl - replaces ConstructorInit nodes 431 if ( ctorinitp ) { 432 dump( move( transUnit ) ); 433 return EXIT_SUCCESS; 434 } // if 435 436 // Currently not working due to unresolved issues with UniqueExpr 437 PASS( "Expand Unique Expr", Tuples::expandUniqueExpr( transUnit ) ); // xxx - is this the right place for this? 
want to expand ASAP so tha, sequent passes don't need to worry about double-visiting a unique expr - needs to go after InitTweak::fix so that copy constructed return declarations are reused 438 439 PASS( "Translate Tries", ControlStruct::translateTries( transUnit ) ); 440 PASS( "Gen Waitfor", Concurrency::generateWaitFor( transUnit ) ); 441 442 // Needs to happen before tuple types are expanded. 443 PASS( "Convert Specializations", GenPoly::convertSpecializations( transUnit ) ); 444 445 PASS( "Expand Tuples", Tuples::expandTuples( transUnit ) ); 446 447 if ( tuplep ) { 448 dump( move( transUnit ) ); 449 return EXIT_SUCCESS; 450 } // if 451 452 // Must come after Translate Tries. 453 PASS( "Virtual Expand Casts", Virtual::expandCasts( transUnit ) ); 454 455 translationUnit = convert( move( transUnit ) ); 456 } else { 457 PASS( "Translate Exception Declarations", ControlStruct::translateExcept( translationUnit ) ); 458 if ( exdeclp ) { 459 dump( translationUnit ); 460 return EXIT_SUCCESS; 461 } // if 462 463 // add the assignment statement after the initialization of a type parameter 464 PASS( "Validate", SymTab::validate( translationUnit ) ); 465 466 if ( symtabp ) { 467 deleteAll( translationUnit ); 468 return EXIT_SUCCESS; 469 } // if 470 471 if ( expraltp ) { 472 PassVisitor<ResolvExpr::AlternativePrinter> printer( cout ); 473 acceptAll( translationUnit, printer ); 474 return EXIT_SUCCESS; 475 } // if 476 477 if ( validp ) { 478 dump( translationUnit ); 479 return EXIT_SUCCESS; 480 } // if 481 482 PASS( "Translate Throws", ControlStruct::translateThrows( translationUnit ) ); 483 PASS( "Fix Labels", ControlStruct::fixLabels( translationUnit ) ); 484 PASS( "Fix Names", CodeGen::fixNames( translationUnit ) ); 485 PASS( "Gen Init", InitTweak::genInit( translationUnit ) ); 486 PASS( "Expand Member Tuples" , Tuples::expandMemberTuples( translationUnit ) ); 487 488 if ( libcfap ) { 489 // Generate the bodies of cfa library functions. 490 LibCfa::makeLibCfa( translationUnit ); 491 } // if 492 493 if ( declstatsp ) { 494 CodeTools::printDeclStats( translationUnit ); 495 deleteAll( translationUnit ); 496 return EXIT_SUCCESS; 497 } // if 498 499 if ( bresolvep ) { 500 dump( translationUnit ); 501 return EXIT_SUCCESS; 502 } // if 503 504 CodeTools::fillLocations( translationUnit ); 505 506 if ( resolvprotop ) { 507 CodeTools::dumpAsResolvProto( translationUnit ); 508 return EXIT_SUCCESS; 509 } // if 510 511 PASS( "Resolve", ResolvExpr::resolve( translationUnit ) ); 512 if ( exprp ) { 513 dump( translationUnit ); 514 return EXIT_SUCCESS; 515 } 516 517 PASS( "Fix Init", InitTweak::fix( translationUnit, buildingLibrary() ) ); 518 519 // fix ObjectDecl - replaces ConstructorInit nodes 520 if ( ctorinitp ) { 521 dump ( translationUnit ); 522 return EXIT_SUCCESS; 523 } // if 524 525 PASS( "Expand Unique Expr", Tuples::expandUniqueExpr( translationUnit ) ); // xxx - is this the right place for this? want to expand ASAP so tha, sequent passes don't need to worry about double-visiting a unique expr - needs to go after InitTweak::fix so that copy constructed return declarations are reused 526 PASS( "Translate Tries", ControlStruct::translateTries( translationUnit ) ); 527 PASS( "Gen Waitfor", Concurrency::generateWaitFor( translationUnit ) ); 528 PASS( "Convert Specializations", GenPoly::convertSpecializations( translationUnit ) ); // needs to happen before tuple types are expanded 529 PASS( "Expand Tuples", Tuples::expandTuples( translationUnit ) ); // xxx - is this the right place for this? 
530 531 if ( tuplep ) { 532 dump( translationUnit ); 533 return EXIT_SUCCESS; 534 } // if 535 536 PASS( "Virtual Expand Casts", Virtual::expandCasts( translationUnit ) ); // Must come after translateEHM 323 if (Stats::Counters::enabled) { 324 ast::pass_visitor_stats.avg = Stats::Counters::build<Stats::Counters::AverageCounter<double>>("Average Depth - New"); 325 ast::pass_visitor_stats.max = Stats::Counters::build<Stats::Counters::MaxCounter<double>>("Max depth - New"); 537 326 } 538 539 PASS( "Instantiate Generics", GenPoly::instantiateGeneric( translationUnit ) ); 327 auto transUnit = convert( std::move( translationUnit ) ); 328 329 forceFillCodeLocations( transUnit ); 330 331 PASS( "Translate Exception Declarations", ControlStruct::translateExcept( transUnit ) ); 332 if ( exdeclp ) { 333 dump( std::move( transUnit ) ); 334 return EXIT_SUCCESS; 335 } 336 337 PASS( "Verify Ctor, Dtor & Assign", Validate::verifyCtorDtorAssign( transUnit ) ); 338 PASS( "Hoist Type Decls", Validate::hoistTypeDecls( transUnit ) ); 339 // Hoist Type Decls pulls some declarations out of contexts where 340 // locations are not tracked. Perhaps they should be, but for now 341 // the full fill solves it. 342 forceFillCodeLocations( transUnit ); 343 344 PASS( "Replace Typedefs", Validate::replaceTypedef( transUnit ) ); 345 PASS( "Fix Return Types", Validate::fixReturnTypes( transUnit ) ); 346 PASS( "Enum and Pointer Decay", Validate::decayEnumsAndPointers( transUnit ) ); 347 348 PASS( "Link Reference To Types", Validate::linkReferenceToTypes( transUnit ) ); 349 350 PASS( "Fix Qualified Types", Validate::fixQualifiedTypes( transUnit ) ); 351 PASS( "Hoist Struct", Validate::hoistStruct( transUnit ) ); 352 PASS( "Eliminate Typedef", Validate::eliminateTypedef( transUnit ) ); 353 PASS( "Validate Generic Parameters", Validate::fillGenericParameters( transUnit ) ); 354 PASS( "Translate Dimensions", Validate::translateDimensionParameters( transUnit ) ); 355 PASS( "Check Function Returns", Validate::checkReturnStatements( transUnit ) ); 356 PASS( "Fix Return Statements", InitTweak::fixReturnStatements( transUnit ) ); 357 PASS( "Implement Concurrent Keywords", Concurrency::implementKeywords( transUnit ) ); 358 PASS( "Forall Pointer Decay", Validate::decayForallPointers( transUnit ) ); 359 PASS( "Hoist Control Declarations", ControlStruct::hoistControlDecls( transUnit ) ); 360 361 PASS( "Generate Autogen Routines", Validate::autogenerateRoutines( transUnit ) ); 362 363 PASS( "Implement Mutex", Concurrency::implementMutex( transUnit ) ); 364 PASS( "Implement Thread Start", Concurrency::implementThreadStarter( transUnit ) ); 365 PASS( "Compound Literal", Validate::handleCompoundLiterals( transUnit ) ); 366 PASS( "Set Length From Initializer", Validate::setLengthFromInitializer( transUnit ) ); 367 PASS( "Find Global Decls", Validate::findGlobalDecls( transUnit ) ); 368 PASS( "Fix Label Address", Validate::fixLabelAddresses( transUnit ) ); 369 370 if ( symtabp ) { 371 return EXIT_SUCCESS; 372 } // if 373 374 if ( expraltp ) { 375 ResolvExpr::printCandidates( transUnit ); 376 return EXIT_SUCCESS; 377 } // if 378 379 if ( validp ) { 380 dump( std::move( transUnit ) ); 381 return EXIT_SUCCESS; 382 } // if 383 384 PASS( "Translate Throws", ControlStruct::translateThrows( transUnit ) ); 385 PASS( "Fix Labels", ControlStruct::fixLabels( transUnit ) ); 386 PASS( "Fix Names", CodeGen::fixNames( transUnit ) ); 387 PASS( "Gen Init", InitTweak::genInit( transUnit ) ); 388 PASS( "Expand Member Tuples" , Tuples::expandMemberTuples( transUnit 
) ); 389 390 if ( libcfap ) { 391 // Generate the bodies of cfa library functions. 392 LibCfa::makeLibCfa( transUnit ); 393 } // if 394 395 if ( declstatsp ) { 396 printDeclStats( transUnit ); 397 return EXIT_SUCCESS; 398 } // if 399 400 if ( bresolvep ) { 401 dump( std::move( transUnit ) ); 402 return EXIT_SUCCESS; 403 } // if 404 405 if ( resolvprotop ) { 406 dumpAsResolverProto( transUnit ); 407 return EXIT_SUCCESS; 408 } // if 409 410 PASS( "Resolve", ResolvExpr::resolve( transUnit ) ); 411 if ( exprp ) { 412 dump( std::move( transUnit ) ); 413 return EXIT_SUCCESS; 414 } // if 415 416 forceFillCodeLocations( transUnit ); 417 418 PASS( "Fix Init", InitTweak::fix(transUnit, buildingLibrary())); 419 420 // fix ObjectDecl - replaces ConstructorInit nodes 421 if ( ctorinitp ) { 422 dump( std::move( transUnit ) ); 423 return EXIT_SUCCESS; 424 } // if 425 426 // Currently not working due to unresolved issues with UniqueExpr 427 PASS( "Expand Unique Expr", Tuples::expandUniqueExpr( transUnit ) ); // xxx - is this the right place for this? want to expand ASAP so tha, sequent passes don't need to worry about double-visiting a unique expr - needs to go after InitTweak::fix so that copy constructed return declarations are reused 428 429 PASS( "Translate Tries", ControlStruct::translateTries( transUnit ) ); 430 PASS( "Gen Waitfor", Concurrency::generateWaitFor( transUnit ) ); 431 432 // Needs to happen before tuple types are expanded. 433 PASS( "Convert Specializations", GenPoly::convertSpecializations( transUnit ) ); 434 435 PASS( "Expand Tuples", Tuples::expandTuples( transUnit ) ); 436 437 if ( tuplep ) { 438 dump( std::move( transUnit ) ); 439 return EXIT_SUCCESS; 440 } // if 441 442 // Must come after Translate Tries. 443 PASS( "Virtual Expand Casts", Virtual::expandCasts( transUnit ) ); 444 445 PASS( "Instantiate Generics", GenPoly::instantiateGeneric( transUnit ) ); 446 447 translationUnit = convert( std::move( transUnit ) ); 448 540 449 if ( genericsp ) { 541 450 dump( translationUnit ); … … 620 529 621 530 622 static const char optstring[] = ":c:ghlLmNnpd OAP:S:twW:D:";531 static const char optstring[] = ":c:ghlLmNnpdP:S:twW:D:"; 623 532 624 533 enum { PreludeDir = 128 }; … … 634 543 { "prototypes", no_argument, nullptr, 'p' }, 635 544 { "deterministic-out", no_argument, nullptr, 'd' }, 636 { "old-ast", no_argument, nullptr, 'O'},637 { "new-ast", no_argument, nullptr, 'A'},638 545 { "print", required_argument, nullptr, 'P' }, 639 546 { "prelude-dir", required_argument, nullptr, PreludeDir }, … … 657 564 "do not generate prelude prototypes => prelude not printed", // -p 658 565 "only print deterministic output", // -d 659 "Use the old-ast", // -O660 "Use the new-ast", // -A661 566 "print", // -P 662 567 "<directory> prelude directory for debug/nodebug", // no flag … … 767 672 deterministic_output = true; 768 673 break; 769 case 'O': // don't print non-deterministic output770 useNewAST = false;771 break;772 case 'A': // don't print non-deterministic output773 useNewAST = true;774 break;775 674 case 'P': // print options 776 675 for ( int i = 0;; i += 1 ) { … … 889 788 890 789 static void dump( ast::TranslationUnit && transUnit, ostream & out ) { 891 std::list< Declaration * > translationUnit = convert( move( transUnit ) );790 std::list< Declaration * > translationUnit = convert( std::move( transUnit ) ); 892 791 dump( translationUnit, out ); 893 792 } -
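With the old-AST branch removed, the driver in main.cc now runs one fixed sequence of PASS( ... ) stages over the converted translation unit and exits early whenever one of the dump flags (exdeclp, validp, exprp, ctorinitp, tuplep, ...) is set. The sketch below shows that driver shape in plain C; the names (Unit, stage, run_pipeline) are hypothetical stand-ins for illustration, not the actual PASS macro or the Cforall pass API.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins for the real translation unit and pass functions. */
typedef struct Unit { int decls; } Unit;
typedef void (*pass_fn)(Unit *);

struct stage {
	const char * name;   /* label, like the first argument of PASS( ... )           */
	pass_fn run;         /* transformation applied to the translation unit          */
	bool dump_after;     /* stop after this stage, like the exprp/validp/... checks */
};

static void resolve(Unit * u)  { (void)u; /* placeholder pass body */ }
static void code_gen(Unit * u) { (void)u; /* placeholder pass body */ }

/* Run every stage in order; a requested dump ends the pipeline early. */
static int run_pipeline(Unit * unit, const struct stage * stages, size_t n) {
	for (size_t i = 0; i < n; ++i) {
		fprintf(stderr, "pass: %s\n", stages[i].name);
		stages[i].run(unit);
		if (stages[i].dump_after) {
			printf("dump requested after '%s'\n", stages[i].name);
			return EXIT_SUCCESS;
		}
	}
	return EXIT_SUCCESS;
}

int main(void) {
	Unit unit = { 0 };
	const struct stage stages[] = {
		{ "Resolve",  resolve,  false },
		{ "Code Gen", code_gen, false },
	};
	return run_pipeline(&unit, stages, sizeof stages / sizeof stages[0]);
}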
tests/.expect/declarationSpecifier.arm64.txt
rebf8ca5 r23a08aa0 735 735 } 736 736 static volatile const struct __anonymous15 _X3x36KVS13__anonymous15_1; 737 _Thread_local signed int _X3x37i_1; 738 __thread signed int _X3x38i_1; 737 739 static inline volatile const signed int _X3f11Fi___1(); 738 740 static inline volatile const signed int _X3f12Fi___1(); -
tests/.expect/declarationSpecifier.x64.txt
rebf8ca5 r23a08aa0 735 735 } 736 736 static volatile const struct __anonymous15 _X3x36KVS13__anonymous15_1; 737 _Thread_local signed int _X3x37i_1; 738 __thread signed int _X3x38i_1; 737 739 static inline volatile const signed int _X3f11Fi___1(); 738 740 static inline volatile const signed int _X3f12Fi___1(); -
tests/.expect/declarationSpecifier.x86.txt
rebf8ca5 r23a08aa0 735 735 } 736 736 static volatile const struct __anonymous15 _X3x36KVS13__anonymous15_1; 737 _Thread_local signed int _X3x37i_1; 738 __thread signed int _X3x38i_1; 737 739 static inline volatile const signed int _X3f11Fi___1(); 738 740 static inline volatile const signed int _X3f12Fi___1(); -
tests/Makefile.am
rebf8ca5 r23a08aa0 54 54 55 55 # adjust CC to current flags 56 CC = LC_ALL=C $(if $(DISTCC_CFA_PATH),distcc $(DISTCC_CFA_PATH) ${ARCH_FLAGS} ${AST_FLAGS},$(TARGET_CFA) ${DEBUG_FLAGS} ${ARCH_FLAGS} ${AST_FLAGS}) 56 CC = LC_ALL=C $(if $(DISTCC_CFA_PATH),distcc $(DISTCC_CFA_PATH) ${ARCH_FLAGS} ,$(TARGET_CFA) ${DEBUG_FLAGS} ${ARCH_FLAGS}) 57 57 CFACC = $(CC) 58 58 … … 61 61 62 62 # adjusted CC but without the actual distcc call 63 CFACCLOCAL = $(if $(DISTCC_CFA_PATH),$(DISTCC_CFA_PATH) ${ARCH_FLAGS} ${AST_FLAGS},$(TARGET_CFA) ${DEBUG_FLAGS} ${ARCH_FLAGS} ${AST_FLAGS}) 63 CFACCLOCAL = $(if $(DISTCC_CFA_PATH),$(DISTCC_CFA_PATH) ${ARCH_FLAGS} ,$(TARGET_CFA) ${DEBUG_FLAGS} ${ARCH_FLAGS}) 64 64 CFACCLINK = $(CFACCLOCAL) -quiet $(if $(test), 2> $(test), ) $($(shell echo "${@}_FLAGSLD" | sed 's/-\|\//_/g')) 65 65 -
tests/concurrent/clib.c
rebf8ca5 r23a08aa0 8 8 } 9 9 10 thread_local struct drand48_data buffer = { 0 }; 10 _Thread_local struct drand48_data buffer = { 0 }; 11 11 int myrand() { 12 12 long int result; -
tests/concurrent/clib_tls.c
rebf8ca5 r23a08aa0 14 14 15 15 16 thread_local int checkval = 0xBAADF00D; 16 __thread int checkval = 0xBAADF00D; 17 17 18 18 void init(void * ) { -
tests/concurrent/park/contention.cfa
rebf8ca5 r23a08aa0 2 2 #include <thread.hfa> 3 3 4 thread_local drand48_data buffer = { 0 }; 4 __thread drand48_data buffer = { 0 }; 5 5 int myrand() { 6 6 long int result; -
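The three test changes above drop the thread_local spelling in favour of the C11 keyword _Thread_local or the GNU extension __thread. A minimal sketch of the two spellings, assuming a GCC/Clang toolchain (compile with -pthread):

#include <pthread.h>
#include <stdio.h>

/* C11 spelling: the keyword itself, no <threads.h> macro needed. */
_Thread_local int c11_counter = 0;

/* GNU spelling: GCC/Clang extension with the same per-thread storage. */
__thread int gnu_counter = 0;

static void * worker(void * arg) {
	int id = *(int *)arg;
	/* Each thread operates on its own copy of both variables. */
	c11_counter += 1;
	gnu_counter += 2;
	printf("worker %d: c11=%d gnu=%d\n", id, c11_counter, gnu_counter);
	return NULL;
}

int main(void) {
	pthread_t t1, t2;
	int id1 = 1, id2 = 2;
	pthread_create(&t1, NULL, worker, &id1);
	pthread_create(&t2, NULL, worker, &id2);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	/* The main thread's copies were never touched by the workers. */
	printf("main: c11=%d gnu=%d\n", c11_counter, gnu_counter);
	return 0;
}

Both spellings give identical per-thread storage here; __thread simply avoids relying on the <threads.h> thread_local macro.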
tests/config.py.in
rebf8ca5 r23a08aa0 9 9 HOSTARCH = "@host_cpu@" 10 10 DISTRIBUTE = @HAS_DISTCC@ 11 NEWAST = @DEFAULT_NEW_AST@ -
tests/declarationSpecifier.cfa
rebf8ca5 r23a08aa0 1 // 1 // 2 2 // Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo 3 3 // 4 4 // The contents of this file are covered under the licence agreement in the 5 5 // file "LICENCE" distributed with Cforall. 6 // 7 // declarationSpecifier.cfa -- 8 // 6 // 7 // declarationSpecifier.cfa -- 8 // 9 9 // Author : Peter A. Buhr 10 10 // Created On : Wed Aug 17 08:21:04 2016 … … 12 12 // Last Modified On : Tue Apr 30 18:20:36 2019 13 13 // Update Count : 4 14 // 14 // 15 15 16 16 typedef short int Int; … … 51 51 struct { Int i; } const static volatile x35; 52 52 struct { Int i; } const volatile static x36; 53 54 _Thread_local int x37; 55 __thread int x38; 53 56 54 57 static inline const volatile int f11(); -
tests/io/comp_fair.cfa
rebf8ca5 r23a08aa0 27 27 28 28 struct { 29 30 29 barrier & bar; 30 int pipe[2]; 31 31 32 32 } globals; … … 65 65 thread Reader {}; 66 66 void main(Reader & this) { 67 bool do_read = has_user_level_blocking( (fptr_t)async_read ); 67 char thrash[1]; 68 bool do_read = has_user_level_blocking( (fptr_t)async_read ); 68 69 69 for(TIMES) { 70 io_future_t f; 71 if ( do_read ) { 72 char thrash[1]; 73 async_read(f, globals.pipe[0], thrash, 1, 0); 74 } else { 75 fulfil(f, 0); // If we don't have user-level blocking just play along 76 } 70 for(TIMES) { 71 io_future_t f; 72 if ( do_read ) { 73 async_read(f, globals.pipe[0], thrash, 1, 0); 74 } else { 75 fulfil(f, 0); // If we don't have user-level blocking just play along 76 } 77 77 78 78 block( globals.bar ); 79 79 80 80 yield( prng( this, 15 ) ); 81 81 82 82 unsigned i = __atomic_add_fetch( &counter, 1, __ATOMIC_SEQ_CST ); 83 83 if(0 == (i % 100)) sout | i; 84 84 85 85 wait( f ); 86 86 87 88 87 if(f.result < 0) 88 abort | "Read error" | -f.result | ":" | strerror(-f.result); 89 89 90 91 90 block( globals.bar ); 91 } 92 92 } 93 93 … … 97 97 thread Writer {}; 98 98 void main(Writer & this) { 99 100 99 for(TIMES) { 100 block( globals.bar ); 101 101 102 102 sleep( 1`us ); 103 103 104 105 106 107 104 char buf[1] = { '+' }; 105 int ret = write( globals.pipe[1], buf, 1 ); 106 if(ret < 0) 107 abort | "Write error" | errno | ":" | strerror(errno); 108 108 109 110 109 block( globals.bar ); 110 } 111 111 } 112 112 … … 122 122 123 123 int main() { 124 125 126 127 128 124 barrier bar = { 2 }; 125 &globals.bar = &bar; 126 int ret = pipe(globals.pipe); 127 if(ret != 0) 128 abort | "Pipe error" | errno | ":" | strerror(errno); 129 129 130 130 processor p; … … 134 134 Spinner s; 135 135 Reader ior; 136 136 Writer iow; 137 137 } 138 138 sout | "done"; -
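The comp_fair.cfa hunk hoists char thrash[1] out of the if ( do_read ) block to the top of the reader's main, so the buffer handed to async_read is still in scope when the future is waited on later in the loop. The same lifetime rule applies to any asynchronous read; the sketch below illustrates it with POSIX AIO, which is only an analogous interface (not the Cforall io_future_t API), the file path is just a placeholder, and older glibc may need -lrt when linking.

#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(int argc, char * argv[]) {
	/* Any readable file works; the default path is only for illustration. */
	const char * path = argc > 1 ? argv[1] : "/etc/hostname";
	int fd = open(path, O_RDONLY);
	if (fd < 0) { perror("open"); return 1; }

	/* Buffer and control block are declared in the scope that also waits,
	   so they stay valid for the whole lifetime of the operation. */
	char buf[64];
	struct aiocb cb;
	memset(&cb, 0, sizeof cb);
	cb.aio_fildes = fd;
	cb.aio_buf    = buf;
	cb.aio_nbytes = sizeof buf;
	cb.aio_offset = 0;

	if (aio_read(&cb) != 0) { perror("aio_read"); return 1; }

	/* Wait for completion before buf can safely go out of scope. */
	const struct aiocb * pending[1] = { &cb };
	while (aio_error(&cb) == EINPROGRESS)
		aio_suspend(pending, 1, NULL);

	ssize_t got = aio_return(&cb);
	if (got < 0) { perror("aio_return"); return 1; }
	printf("read %zd bytes from %s\n", got, path);

	close(fd);
	return 0;
}

As in the updated test, the buffer lives in the same scope as the wait, never in a narrower block that could end while the read is still outstanding.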
tests/meta/.expect/arch.arm64.txt
rebf8ca5 r23a08aa0 1 meta/archVast.cfa:28:1 error: Cannot choose between 3 alternatives for expression 1 meta/arch.cfa:28:1 error: Cannot choose between 3 alternatives for expression 2 2 Explicit Cast of: 3 3 Name: FA64 -
tests/meta/.expect/arch.x64.txt
rebf8ca5 r23a08aa0 1 meta/archVast.cfa:28:1 error: Cannot choose between 3 alternatives for expression 1 meta/arch.cfa:28:1 error: Cannot choose between 3 alternatives for expression 2 2 Explicit Cast of: 3 3 Name: FX64 -
tests/meta/.expect/arch.x86.txt
rebf8ca5 r23a08aa0 1 meta/archVast.cfa:28:1 error: Cannot choose between 3 alternatives for expression 1 meta/arch.cfa:28:1 error: Cannot choose between 3 alternatives for expression 2 2 Explicit Cast of: 3 3 Name: FX86 -
tests/meta/arch.cfa
rebf8ca5 r23a08aa0 5 5 // file "LICENCE" distributed with Cforall. 6 6 // 7 // archVast.cfa -- Check if all combinations are of ast/arch are properly distinguished 7 // arch.cfa -- Check if all architectures are properly distinguished by the test suite 8 8 // 9 9 // Author : Thierry Delisle -
tests/pybin/settings.py
rebf8ca5 r23a08aa0 97 97 self.path = "debug" if value else "nodebug" 98 98 99 class AST:100 def __init__(self, ast):101 if ast == "new":102 self.target = ast103 self.string = "New AST"104 self.flags = """AST_FLAGS=-XCFA,--new-ast"""105 elif ast == "old":106 self.target = ast107 self.string = "Old AST"108 self.flags = """AST_FLAGS=-XCFA,--old-ast"""109 elif ast == None:110 self.target = "new" if config.NEWAST else "old"111 self.string = "Default AST (%s)" % self.target112 self.flags = """AST_FLAGS="""113 else:114 print("""ERROR: Invalid ast configuration, must be "old", "new" or left unspecified, was %s""" % (value), file=sys.stderr)115 sys.exit(1)116 117 def filter(self, tests):118 119 return [test for test in tests if not test.astv or self.target == test.astv]120 121 99 class Install: 122 100 def __init__(self, value): … … 141 119 142 120 def init( options ): 143 global all_ast144 121 global all_arch 145 122 global all_debug 146 123 global all_install 147 global ast148 124 global arch 149 125 global debug … … 160 136 global timeout2gdb 161 137 162 all_ast = [AST(o) for o in list(dict.fromkeys(options.ast ))] if options.ast else [AST(None)]163 138 all_arch = [Architecture(o) for o in list(dict.fromkeys(options.arch ))] if options.arch else [Architecture(None)] 164 139 all_debug = [Debug(o) for o in list(dict.fromkeys(options.debug ))] -
tests/pybin/test_run.py
rebf8ca5 r23a08aa0 11 11 self.path = '' 12 12 self.arch = '' 13 self.astv = '' 14 13 15 14 def toString(self): 16 return "{:25s} ({:5s} arch , {:s} ast: {:s})".format( self.name, self.arch if self.arch else "Any", self.astv if self.astv else "Any", self.target() ) 15 return "{:25s} ({:5s} arch: {:s})".format( self.name, self.arch if self.arch else "Any", self.target() ) 17 16 18 17 def prepare(self): … … 22 21 def expect(self): 23 22 arch = '' if not self.arch else ".%s" % self.arch 24 astv = '' if not self.astv else ".nast" if self.astv == "new" else ".oast" 25 return os.path.normpath( os.path.join(settings.SRCDIR , self.path, ".expect", "%s%s%s.txt" % (self.name,astv,arch)) ) 23 return os.path.normpath( os.path.join(settings.SRCDIR , self.path, ".expect", "%s%s.txt" % (self.name,arch)) ) 26 24 27 25 def error_log(self): … … 58 56 59 57 @staticmethod 60 def new_target(target, arch , astv): 58 def new_target(target, arch): 61 59 test = Test() 62 60 test.name = os.path.basename(target) 63 61 test.path = os.path.relpath (os.path.dirname(target), settings.SRCDIR) 64 62 test.arch = arch.target if arch else '' 65 test.astv = astv.target if astv else '' 66 63 return test 67 64 -
tests/pybin/tools.py
rebf8ca5 r23a08aa0 182 182 '-s' if silent else None, 183 183 test_param, 184 settings.ast.flags,185 184 settings.arch.flags, 186 185 settings.debug.flags, -
tests/quotedKeyword.cfa
rebf8ca5 r23a08aa0 31 31 ``__int128, ``__label__, ``long, ``lvalue, ``_Noreturn, ``__builtin_offsetof, ``otype, ``register, ``restrict, 32 32 ``__restrict, ``__restrict__, ``return, ``short, ``signed, ``__signed, ``__signed__, ``sizeof, ``static, 33 ``_Static_assert, ``struct, ``switch, ``_Thread_local, ``throw, ``throwResume, ``trait, ``try, ``typedef, 33 ``_Static_assert, ``struct, ``switch, ``_thread, ``_Thread_local, ``throw, ``throwResume, ``trait, ``try, ``typedef, 34 34 ``typeof, ``__typeof, ``__typeof__, ``union, ``unsigned, ``__builtin_va_list, ``void, ``volatile, ``__volatile, 35 35 ``__volatile__, ``while; -
tests/test.py
rebf8ca5 r23a08aa0 23 23 24 24 def match_test(path): 25 match = re.search("^%s\/([\w\/\-_]*).expect\/([\w\-_]+)(\. nast|\.oast)?(\.[\w\-_]+)?\.txt$" % settings.SRCDIR, path)25 match = re.search("^%s\/([\w\/\-_]*).expect\/([\w\-_]+)(\.[\w\-_]+)?\.txt$" % settings.SRCDIR, path) 26 26 if match : 27 27 test = Test() 28 28 test.name = match.group(2) 29 29 test.path = match.group(1) 30 test.arch = match.group(4)[1:] if match.group(4) else None 31 32 astv = match.group(3)[1:] if match.group(3) else None 33 if astv == 'oast': 34 test.astv = 'old' 35 elif astv == 'nast': 36 test.astv = 'new' 37 elif astv: 38 print('ERROR: "%s", expect file has astv but it is not "nast" or "oast"' % testname, file=sys.stderr) 39 sys.exit(1) 30 test.arch = match.group(3)[1:] if match.group(3) else None 40 31 41 32 expected.append(test) … … 81 72 # this is a valid name, let's check if it already exists 82 73 found = [test for test in all_tests if canonical_path( test.target() ) == testname] 83 setup = itertools.product(settings.all_arch if options.arch else [None] , settings.all_ast if options.ast else [None])74 setup = itertools.product(settings.all_arch if options.arch else [None]) 84 75 if not found: 85 # it's a new name, create it according to the name and specified architecture /ast version86 tests.extend( [Test.new_target(testname, arch , ast) for arch, astin setup] )76 # it's a new name, create it according to the name and specified architecture 77 tests.extend( [Test.new_target(testname, arch) for arch in setup] ) 87 78 elif len(found) == 1 and not found[0].arch: 88 79 # we found a single test, the user better be wanting to create a cross platform test 89 80 if options.arch: 90 81 print('ERROR: "%s", test has no specified architecture but --arch was specified, ignoring it' % testname, file=sys.stderr) 91 elif options.ast:92 print('ERROR: "%s", test has no specified ast version but --ast was specified, ignoring it' % testname, file=sys.stderr)93 82 else: 94 83 tests.append( found[0] ) 95 84 else: 96 85 # this test is already cross platform, just add a test for each platform the user asked 97 tests.extend( [Test.new_target(testname, arch , ast) for arch, astin setup] )86 tests.extend( [Test.new_target(testname, arch) for arch in setup] ) 98 87 99 88 # print a warning if it users didn't ask for a specific architecture … … 102 91 print('WARNING: "%s", test has architecture specific expected files but --arch was not specified, regenerating only for current host' % testname, file=sys.stderr) 103 92 104 105 # print a warning if it users didn't ask for a specific ast version106 found_astv = [f.astv for f in found if f.astv]107 if found_astv and not options.ast:108 print('WARNING: "%s", test has ast version specific expected files but --ast was not specified, regenerating only for current ast' % testname, file=sys.stderr)109 110 93 else : 111 94 print('ERROR: "%s", tests are not allowed to end with a C/C++/CFA extension, ignoring it' % testname, file=sys.stderr) … … 127 110 # create a parser with the arguments for the tests script 128 111 parser = argparse.ArgumentParser(description='Script which runs cforall tests') 129 parser.add_argument('--ast', help='Test for specific ast', type=comma_separated(str), default=None)130 112 parser.add_argument('--arch', help='Test for specific architecture', type=comma_separated(str), default=None) 131 113 parser.add_argument('--debug', help='Run all tests in debug or release', type=comma_separated(yes_no), default='yes') … … 351 333 352 334 # print the possible options 353 print("-h 
--help --debug --dry-run --list --a st=new --ast=old --arch --all --regenerate-expected --archive-errors --install --timeout --global-timeout --timeout-with-gdb -j --jobs -I --include -E --exclude --continue ", end='')335 print("-h --help --debug --dry-run --list --arch --all --regenerate-expected --archive-errors --install --timeout --global-timeout --timeout-with-gdb -j --jobs -I --include -E --exclude --continue ", end='') 354 336 print(" ".join(map(lambda t: "%s" % (t.target()), tests))) 355 337 … … 422 404 # for each build configurations, run the test 423 405 with Timed() as total_dur: 424 for ast, arch, debug, install in itertools.product(settings.all_ast, settings.all_arch, settings.all_debug, settings.all_install): 425 settings.ast = ast 406 for arch, debug, install in itertools.product(settings.all_arch, settings.all_debug, settings.all_install): 426 407 settings.arch = arch 427 408 settings.debug = debug … … 430 411 # filter out the tests for a different architecture 431 412 # tests are the same across debug/install 432 local_tests = settings.ast.filter( tests ) 433 local_tests = settings.arch.filter( local_tests ) 413 local_tests = settings.arch.filter( tests ) 434 414 435 415 # check the build configuration works … … 438 418 439 419 # print configuration 440 print('%s %i tests on %i cores (%s :%s- %s)' % (420 print('%s %i tests on %i cores (%s - %s)' % ( 441 421 'Regenerating' if settings.generating else 'Running', 442 422 len(local_tests), 443 423 jobs, 444 settings.ast.string,445 424 settings.arch.string, 446 425 settings.debug.string