Changeset b01d459


Ignore:
Timestamp:
Jun 27, 2022, 1:53:47 PM (5 months ago)
Author:
caparsons <caparson@…>
Branches:
master, pthread-emulation, qualifiedEnum
Children:
7b71402
Parents:
fd365da (diff), e4ea4b8d (diff)
Note: this is a merge changeset, the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.
Message:

Merge branch 'master' of plg.uwaterloo.ca:software/cfa/cfa-cc

Files:
6 added
1 deleted
15 edited

Legend:

Unmodified
Added
Removed
  • doc/theses/thierry_delisle_PhD/thesis/.gitignore

    rfd365da rb01d459  
    11back_text/
     2SAVE.fig
  • doc/theses/thierry_delisle_PhD/thesis/Makefile

    rfd365da rb01d459  
    4040        emptytls \
    4141        emptytree \
     42        executionStates \
    4243        fairness \
    4344        idle \
     
    4748        io_uring \
    4849        pivot_ring \
     50        MQMS \
     51        MQMSG \
    4952        system \
    5053        cycle \
     
    6568        result.memcd.rate.qps \
    6669        result.memcd.rate.99th \
     70        SQMS \
    6771}
    6872
  • doc/theses/thierry_delisle_PhD/thesis/fig/system.fig

    rfd365da rb01d459  
    4949         7800 3750 8025 3750
    5050-6
     516 4125 4725 4950 4950
     521 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4250 4838 100 100 4250 4838 4350 4838
     534 0 -1 0 0 0 12 0.0000 2 135 510 4425 4875 thread\001
     54-6
     556 5175 4725 6300 4950
     562 2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 5
     57         5400 4950 5400 4725 5175 4725 5175 4950 5400 4950
     584 0 -1 0 0 0 12 0.0000 2 135 765 5475 4875 processor\001
     59-6
     606 6600 4725 7500 4950
     612 2 1 1 -1 -1 0 0 -1 3.000 0 0 0 0 0 5
     62         6825 4950 6600 4950 6600 4725 6825 4725 6825 4950
     634 0 -1 0 0 0 12 0.0000 2 135 540 6900 4875 cluster\001
     64-6
     656 2175 4725 3975 4950
     661 3 0 1 0 0 0 0 0 0.000 1 0.0000 2250 4830 30 30 2250 4830 2280 4830
     674 0 -1 0 0 0 12 0.0000 2 180 1605 2325 4875 generator/coroutine\001
     68-6
     696 1575 2550 2775 3900
     702 2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 5
     71         2400 3450 2400 3000 1950 3000 1950 3450 2400 3450
     724 1 -1 0 0 0 12 0.0000 2 135 1170 2175 2700 Discrete-event\001
     734 1 -1 0 0 0 12 0.0000 2 180 720 2175 2925 Manager\001
     744 1 -1 0 0 0 12 0.0000 2 180 930 2175 3675 preemption\001
     754 1 -1 0 0 0 12 0.0000 2 135 630 2175 3900 timeout\001
     76-6
    51771 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 5550 2625 150 150 5550 2625 5700 2625
    52781 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 5550 3225 150 150 5550 3225 5700 3225
     
    62881 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3975 2850 150 150 3975 2850 4125 2850
    63891 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 7200 2775 150 150 7200 2775 7350 2775
    64 1 3 0 1 0 0 0 0 0 0.000 1 0.0000 2250 4830 30 30 2250 4830 2280 4830
    65901 3 0 1 0 0 0 0 0 0.000 1 0.0000 7200 2775 30 30 7200 2775 7230 2805
    66911 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3525 3600 150 150 3525 3600 3675 3600
    67 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4625 4838 100 100 4625 4838 4725 4838
    68 2 2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 5
    69          2400 4200 2400 3750 1950 3750 1950 4200 2400 4200
    70922 2 1 1 -1 -1 0 0 -1 4.000 0 0 0 0 0 5
    7193         6300 4500 6300 1800 3000 1800 3000 4500 6300 4500
     
    135157        1 1 1.00 45.00 90.00
    136158         7875 3750 7875 2325 7200 2325 7200 2550
    137 2 2 1 1 -1 -1 0 0 -1 3.000 0 0 0 0 0 5
    138          6975 4950 6750 4950 6750 4725 6975 4725 6975 4950
    139 2 2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 5
    140          5850 4950 5850 4725 5625 4725 5625 4950 5850 4950
    141 4 1 -1 0 0 0 10 0.0000 2 135 900 5550 4425 Processors\001
    142 4 1 -1 0 0 0 10 0.0000 2 165 1170 4200 3975 Ready Threads\001
    143 4 1 -1 0 0 0 10 0.0000 2 165 1440 7350 1725 Other Cluster(s)\001
    144 4 1 -1 0 0 0 10 0.0000 2 135 1080 4650 1725 User Cluster\001
    145 4 1 -1 0 0 0 10 0.0000 2 165 630 2175 3675 Manager\001
    146 4 1 -1 0 0 0 10 0.0000 2 135 1260 2175 3525 Discrete-event\001
    147 4 1 -1 0 0 0 10 0.0000 2 150 900 2175 4350 preemption\001
    148 4 0 -1 0 0 0 10 0.0000 2 135 630 7050 4875 cluster\001
    149 4 1 -1 0 0 0 10 0.0000 2 135 1350 4200 3225 Blocked Threads\001
    150 4 0 -1 0 0 0 10 0.0000 2 135 540 4800 4875 thread\001
    151 4 0 -1 0 0 0 10 0.0000 2 120 810 5925 4875 processor\001
    152 4 0 -1 0 0 0 10 0.0000 2 165 1710 2325 4875 generator/coroutine\001
     1594 1 -1 0 0 0 12 0.0000 2 135 840 5550 4425 Processors\001
     1604 1 -1 0 0 0 12 0.0000 2 180 1215 4200 3975 Ready Threads\001
     1614 1 -1 0 0 0 12 0.0000 2 165 1275 7350 1725 Other Cluster(s)\001
     1624 1 -1 0 0 0 12 0.0000 2 135 990 4650 1725 User Cluster\001
     1634 1 -1 0 0 0 12 0.0000 2 135 1380 4200 3225 Blocked Threads\001
  • doc/theses/thierry_delisle_PhD/thesis/text/existing.tex

    rfd365da rb01d459  
    11\chapter{Previous Work}\label{existing}
    2 Scheduling is the process of assigning resources to incomming requests.
    3 A very common form of this is assigning available workers to work-requests.
    4 The need for scheduling is very common in Computer Science, \eg Operating Systems and Hypervisors schedule available CPUs, NICs schedule available bamdwith, but scheduling is also common in other fields.
    5 For example, in assmebly lines assigning parts in need of assembly to line workers is a form of scheduling.
     2As stated, scheduling is the process of assigning resources to incoming requests, where the common example is assigning available workers to work requests or vice versa.
     3Common scheduling examples in Computer Science are: operating systems and hypervisors schedule available CPUs, NICs schedule available bandwidth, virtual memory and memory allocator schedule available storage, \etc.
     4Scheduling is also common in most other fields, \eg in assembly lines, assigning parts to line workers is a form of scheduling.
    65
    7 In all these cases, the choice of a scheduling algorithm generally depends first and formost on how much information is available to the scheduler.
    8 Workloads that are well-kown, consistent and homegenous can benefit from a scheduler that is optimized to use this information while ill-defined inconsistent heterogenous workloads will require general algorithms.
    9 A secondary aspect to that is how much information can be gathered versus how much information must be given as part of the input.
    10 There is therefore a spectrum of scheduling algorithms, going from static schedulers that are well informed from the start, to schedulers that gather most of the information needed, to schedulers that can only rely on very limitted information.
    11 Note that this description includes both infomation about each requests, \eg time to complete or resources needed, and information about the relationships between request, \eg whether or not some request must be completed before another request starts.
     6In general, \emph{selecting} a scheduling algorithm depends on how much information is available to the scheduler.
     7Workloads that are well-known, consistent, and homogeneous can benefit from a scheduler that is optimized to use this information, while ill-defined, inconsistent, heterogeneous workloads require general non-optimal algorithms.
     8A secondary aspect is how much information can be gathered versus how much information must be given as part of the scheduler input.
     9This information adds to the spectrum of scheduling algorithms, going from static schedulers that are well informed from the start, to schedulers that gather most of the information needed, to schedulers that can only rely on very limited information.
     10Note, this description includes both information about each requests, \eg time to complete or resources needed, and information about the relationships among request, \eg whether or not some request must be completed before another request starts.
    1211
    13 Scheduling physical resources, for example in assembly lines, is generally amenable to using very well informed scheduling since information can be gathered much faster than the physical resources can be assigned and workloads are likely to stay stable for long periods of time.
     12Scheduling physical resources, \eg in an assembly line, is generally amenable to using well-informed scheduling, since information can be gathered much faster than the physical resources can be assigned and workloads are likely to stay stable for long periods of time.
    1413When a faster pace is needed and changes are much more frequent gathering information on workloads, up-front or live, can become much more limiting and more general schedulers are needed.
    1514
    1615\section{Naming Convention}
    17 Scheduling has been studied by various different communities concentrating on different incarnation of the same problems. As a result, their is no real naming convention for scheduling that is respected across these communities. For this document, I will use the term \newterm{\Gls{at}} to refer to the abstract objects being scheduled and the term \newterm{\Gls{proc}} to refer to the objects which will execute these \glspl{at}.
     16Scheduling has been studied by various communities concentrating on different incarnation of the same problems. As a result, there is no standard naming conventions for scheduling that is respected across these communities. This document uses the term \newterm{\Gls{at}} to refer to the abstract objects being scheduled and the term \newterm{\Gls{proc}} to refer to the concrete objects executing these \glspl{at}.
    1817
    1918\section{Static Scheduling}
    20 Static schedulers require that \glspl{at} have their dependencies and costs explicitly and exhaustively specified prior schedule.
    21 The scheduler then processes this input ahead of time and producess a \newterm{schedule} to which the system can later adhere.
    22 This approach is generally popular in real-time systems since the need for strong guarantees justifies the cost of supplying this information.
    23 In general, static schedulers are less relavant to this project since they require input from the programmers that \CFA does not have as part of its concurrency semantic.
    24 Specifying this information explicitly can add a significant burden on the programmers and reduces flexibility, for this reason the \CFA scheduler does not require this information.
    25 
     19\newterm{Static schedulers} require \gls{at} dependencies and costs be explicitly and exhaustively specified prior to scheduling.
     20The scheduler then processes this input ahead of time and produces a \newterm{schedule} the system follows during execution.
     21This approach is popular in real-time systems since the need for strong guarantees justifies the cost of determining and supplying this information.
     22In general, static schedulers are less relevant to this project because they require input from the programmers that a programming language does not have as part of its concurrency semantic.
     23Specifying this information explicitly adds a significant burden to the programmer and reduces flexibility.
     24For this reason, the \CFA scheduler does not require this information.
    2625
    2726\section{Dynamic Scheduling}
    28 It may be difficult to fulfill the requirements of static scheduler if dependencies are conditionnal. In this case, it may be preferable to detect dependencies at runtime. This detection effectively takes the form of adding one or more new \gls{at}(s) to the system as their dependencies are resolved. As well as potentially halting or suspending a \gls{at} that dynamically detect unfulfilled dependencies. Each \gls{at} has the responsability of adding the dependent \glspl{at} back in the system once completed. As a consequence, the scheduler may have an incomplete view of the system, seeing only \glspl{at} we no pending dependencies. Schedulers that support this detection at runtime are referred to as \newterm{Dynamic Schedulers}.
     27\newterm{Dynamic schedulers} determine \gls{at} dependencies and costs (if at all) during scheduling.
     28% Schedulers that support this detection at runtime are referred to as \newterm{Dynamic Schedulers}.
     29Hence, unlike static scheduling, \gls{at} dependencies are conditional and detected at runtime. This detection takes the form of observing new \gls{at}(s) in the system and determining dependencies from their behaviour, including suspending or halting a \gls{at} that dynamically detects unfulfilled dependencies. Furthermore, each \gls{at} has the responsibility of adding dependent \glspl{at} back into the system once dependencies are fulfilled. As a consequence, the scheduler often has an incomplete view of the system, seeing only \glspl{at} with no pending dependencies.
    2930
    3031\subsection{Explicitly Informed Dynamic Schedulers}
    31 While dynamic schedulers do not have access to an exhaustive list of dependencies for a \gls{at}, they may require to provide more or less information about each \gls{at}, including for example: expected duration, required ressources, relative importance, etc. The scheduler can then use this information to direct the scheduling decisions. \cit{Examples of schedulers with more information} Precisely providing this information can be difficult for programmers, especially \emph{predicted} behaviour, and the scheduler may need to support some amount of imprecision in the provided information. For example, specifying that a \glspl{at} takes approximately 5 seconds to complete, rather than exactly 5 seconds. User provided information can also become a significant burden depending how the effort to provide the information scales with the number of \glspl{at} and there complexity. For example, providing an exhaustive list of files read by 5 \glspl{at} is an easier requirement the providing an exhaustive list of memory addresses accessed by 10'000 distinct \glspl{at}.
     32While dynamic schedulers may not have an exhaustive list of dependencies for a \gls{at}, some information may be available about each \gls{at}, \eg expected duration, required resources, relative importance, \etc. When available, a scheduler can then use this information to direct the scheduling decisions. \cit{Examples of schedulers with more information} However, most programmers do not determine or even \emph{predict} this information;
     33at best, the scheduler has only some imprecision information provided by the programmer, \eg, indicating a \glspl{at} takes approximately 3--7 seconds to complete, rather than exactly 5 seconds. Providing this kind of information is a significant programmer burden especially if the the information does not scale with the number of \glspl{at} and their complexity. For example, providing an exhaustive list of files read by 5 \glspl{at} is an easier requirement then providing an exhaustive list of memory addresses accessed by 10,000 independent \glspl{at}.
    3234
    33 Since the goal of this thesis is to provide a scheduler as a replacement for \CFA's existing \emph{uninformed} scheduler, Explicitly Informed schedulers are less relevant to this project. Nevertheless, some strategies are worth mentionnding.
     35Since the goal of this thesis is to provide an \emph{informed} scheduler as a replacement for \CFA's existing \emph{uninformed} scheduler, explicitly informed schedulers are less relevant to this project. Nevertheless, some strategies are worth mentioning.
    3436
    35 \subsubsection{Prority Scheduling}
    36 A commonly used information that schedulers used to direct the algorithm is priorities. Each Task is given a priority and higher-priority \glspl{at} are preferred to lower-priority ones. The simplest priority scheduling algorithm is to simply require that every \gls{at} have a distinct pre-established priority and always run the available \gls{at} with the highest priority. Asking programmers to provide an exhaustive set of unique priorities can be prohibitive when the system has a large number of \glspl{at}. It can therefore be diserable for schedulers to support \glspl{at} with identical priorities and/or automatically setting and adjusting priorites for \glspl{at}. The most common operating some variation on priorities with overlaps and dynamic priority adjustments. For example, Microsoft Windows uses a pair of priorities
     37\subsubsection{Priority Scheduling}
     38Common information used by schedulers to direct their algorithm is priorities. Each task is given a priority and higher-priority \glspl{at} are preferred to lower-priority ones. The simplest priority scheduling algorithm is to require that every \gls{at} have a distinct pre-established priority and always run the available \gls{at} with the highest priority. Asking programmers to provide an exhaustive set of unique priorities can be prohibitive when the system has a large number of \glspl{at}. It can therefore be desirable for schedulers to support \glspl{at} with identical priorities and/or automatically setting and adjusting priorities for \glspl{at}. The most common adopting some variant on priorities with overlaps and dynamic priority adjustments. For example, Microsoft Windows uses a pair of priorities
    3739\cit{https://docs.microsoft.com/en-us/windows/win32/procthread/scheduling-priorities,https://docs.microsoft.com/en-us/windows/win32/taskschd/taskschedulerschema-priority-settingstype-element}, one specified by users out of ten possible options and one adjusted by the system.
    3840
    3941\subsection{Uninformed and Self-Informed Dynamic Schedulers}
    40 Several scheduling algorithms do not require programmers to provide additionnal information on each \gls{at}, and instead make scheduling decisions based solely on internal state and/or information implicitly gathered by the scheduler.
     42Several scheduling algorithms do not require programmers to provide additional information on each \gls{at}, and instead make scheduling decisions based solely on internal state and/or information implicitly gathered by the scheduler.
    4143
    4244
    4345\subsubsection{Feedback Scheduling}
    44 As mentionned, Schedulers may also gather information about each \glspl{at} to direct their decisions. This design effectively moves the scheduler to some extent into the realm of \newterm{Control Theory}\cite{wiki:controltheory}. This gathering does not generally involve programmers and as such does not increase programmer burden the same way explicitly provided information may. However, some feedback schedulers do offer the option to programmers to offer additionnal information on certain \glspl{at}, in order to direct scheduling decision. The important distinction being whether or not the scheduler can function without this additionnal information.
     46As mentioned, schedulers may also gather information about each \glspl{at} to direct their decisions. This design effectively moves the scheduler into the realm of \newterm{Control Theory}~\cite{wiki:controltheory}. This information gathering does not generally involve programmers, and as such, does not increase programmer burden the same way explicitly provided information may. However, some feedback schedulers do allow programmers to offer additional information on certain \glspl{at}, in order to direct scheduling decisions. The important distinction being whether or not the scheduler can function without this additional information.
    4547
    4648
    4749\section{Work Stealing}\label{existing:workstealing}
    48 One of the most popular scheduling algorithm in practice (see~\ref{existing:prod}) is work-stealing. This idea, introduce by \cite{DBLP:conf/fpca/BurtonS81}, effectively has each worker work on its local \glspl{at} first, but allows the possibility for other workers to steal local \glspl{at} if they run out of \glspl{at}. \cite{DBLP:conf/focs/Blumofe94} introduced the more familiar incarnation of this, where each workers has queue of \glspl{at} to accomplish and workers without \glspl{at} steal \glspl{at} from random workers. (The Burton and Sleep algorithm had trees of \glspl{at} and stole only among neighbours). Blumofe and Leiserson also prove worst case space and time requirements for well-structured computations.
     50One of the most popular scheduling algorithm in practice (see~\ref{existing:prod}) is work stealing. This idea, introduce by \cite{DBLP:conf/fpca/BurtonS81}, effectively has each worker process its local \glspl{at} first, but allows the possibility for other workers to steal local \glspl{at} if they run out of \glspl{at}. \cite{DBLP:conf/focs/Blumofe94} introduced the more familiar incarnation of this, where each workers has a queue of \glspl{at} and workers without \glspl{at} steal \glspl{at} from random workers. (The Burton and Sleep algorithm had trees of \glspl{at} and steal only among neighbours). Blumofe and Leiserson also prove worst case space and time requirements for well-structured computations.
    4951
    50 Many variations of this algorithm have been proposed over the years\cite{DBLP:journals/ijpp/YangH18}, both optmizations of existing implementations and approaches that account for new metrics.
     52Many variations of this algorithm have been proposed over the years~\cite{DBLP:journals/ijpp/YangH18}, both optimizations of existing implementations and approaches that account for new metrics.
    5153
    52 \paragraph{Granularity} A significant portion of early Work Stealing research was concentrating on \newterm{Implicit Parellelism}\cite{wiki:implicitpar}. Since the system was responsible to split the work, granularity is a challenge that cannot be left to the programmers (as opposed to \newterm{Explicit Parellelism}\cite{wiki:explicitpar} where the burden can be left to programmers). In general, fine granularity is better for load balancing and coarse granularity reduces communication overhead. The best performance generally means finding a middle ground between the two. Several methods can be employed, but I believe these are less relevant for threads, which are generally explicit and more coarse grained.
     54\paragraph{Granularity} A significant portion of early work-stealing research concentrated on \newterm{Implicit Parallelism}~\cite{wiki:implicitpar}. Since the system is responsible for splitting the work, granularity is a challenge that cannot be left to programmers (as opposed to \newterm{Explicit Parallelism}\cite{wiki:explicitpar} where the burden can be left to programmers). In general, fine granularity is better for load balancing and coarse granularity reduces communication overhead. The best performance generally means finding a middle ground between the two. Several methods can be employed, but I believe these are less relevant for threads, which are generally explicit and more coarse grained.
    5355
    5456\paragraph{Task Placement} Since modern computers rely heavily on cache hierarchies\cit{Do I need a citation for this}, migrating \glspl{at} from one core to another can be .  \cite{DBLP:journals/tpds/SquillanteL93}
     
    5658\todo{The survey is not great on this subject}
    5759
    58 \paragraph{Complex Machine Architecture} Another aspect that has been looked at is how well Work Stealing is applicable to different machine architectures.
     60\paragraph{Complex Machine Architecture} Another aspect that has been examined is how well work stealing is applicable to different machine architectures.
    5961
    6062\subsection{Theoretical Results}
    61 There is also a large body of research on the theoretical aspects of work stealing. These evaluate, for example, the cost of migration\cite{DBLP:conf/sigmetrics/SquillanteN91,DBLP:journals/pe/EagerLZ86}, how affinity affects performance\cite{DBLP:journals/tpds/SquillanteL93,DBLP:journals/mst/AcarBB02,DBLP:journals/ipl/SuksompongLS16} and theoretical models for heterogenous systems\cite{DBLP:journals/jpdc/MirchandaneyTS90,DBLP:journals/mst/BenderR02,DBLP:conf/sigmetrics/GastG10}. \cite{DBLP:journals/jacm/BlellochGM99} examine the space bounds of Work Stealing and \cite{DBLP:journals/siamcomp/BerenbrinkFG03} show that for underloaded systems, the scheduler will complete computations in finite time, \ie is \newterm{stable}. Others show that Work-Stealing is applicable to various scheduling contexts\cite{DBLP:journals/mst/AroraBP01,DBLP:journals/anor/TchiboukdjianGT13,DBLP:conf/isaac/TchiboukdjianGTRB10,DBLP:conf/ppopp/AgrawalLS10,DBLP:conf/spaa/AgrawalFLSSU14}. \cite{DBLP:conf/ipps/ColeR13} also studied how Randomized Work Stealing affects false sharing among \glspl{at}.
     63There is also a large body of research on the theoretical aspects of work stealing. These evaluate, for example, the cost of migration~\cite{DBLP:conf/sigmetrics/SquillanteN91,DBLP:journals/pe/EagerLZ86}, how affinity affects performance~\cite{DBLP:journals/tpds/SquillanteL93,DBLP:journals/mst/AcarBB02,DBLP:journals/ipl/SuksompongLS16} and theoretical models for heterogeneous systems~\cite{DBLP:journals/jpdc/MirchandaneyTS90,DBLP:journals/mst/BenderR02,DBLP:conf/sigmetrics/GastG10}. \cite{DBLP:journals/jacm/BlellochGM99} examines the space bounds of work stealing and \cite{DBLP:journals/siamcomp/BerenbrinkFG03} shows that for under-loaded systems, the scheduler completes its computations in finite time, \ie is \newterm{stable}. Others show that work stealing is applicable to various scheduling contexts~\cite{DBLP:journals/mst/AroraBP01,DBLP:journals/anor/TchiboukdjianGT13,DBLP:conf/isaac/TchiboukdjianGTRB10,DBLP:conf/ppopp/AgrawalLS10,DBLP:conf/spaa/AgrawalFLSSU14}. \cite{DBLP:conf/ipps/ColeR13} also studied how randomized work-stealing affects false sharing among \glspl{at}.
    6264
    63 However, as \cite{DBLP:journals/ijpp/YangH18} highlights, it is worth mentionning that this theoretical research has mainly focused on ``fully-strict'' computations, \ie workloads that can be fully represented with a Direct Acyclic Graph. It is unclear how well these distributions represent workloads in real world scenarios.
     65However, as \cite{DBLP:journals/ijpp/YangH18} highlights, it is worth mentioning that this theoretical research has mainly focused on ``fully-strict'' computations, \ie workloads that can be fully represented with a direct acyclic graph. It is unclear how well these distributions represent workloads in real world scenarios.
    6466
    6567\section{Preemption}
    66 One last aspect of scheduling worth mentionning is preemption since many schedulers rely on it for some of their guarantees. Preemption is the idea of interrupting \glspl{at} that have been running for too long, effectively injecting suspend points in the applications. There are multiple techniques to achieve this but they all aim to have the effect of guaranteeing that suspend points in a \gls{at} are never further apart than some fixed duration. While this helps schedulers guarantee that no \glspl{at} will unfairly monopolize a worker, preemption can effectively added to any scheduler. Therefore, the only interesting aspect of preemption for the design of scheduling is whether or not to require it.
     68One last aspect of scheduling is preemption since many schedulers rely on it for some of their guarantees. Preemption is the idea of interrupting \glspl{at} that have been running too long, effectively injecting suspend points into the application. There are multiple techniques to achieve this effect but they all aim to guarantee suspend points in a \gls{at} are never further apart than some fixed duration. While this helps schedulers guarantee that no \glspl{at} unfairly monopolizes a worker, preemption can effectively added to any scheduler. Therefore, the only interesting aspect of preemption for the design of scheduling is whether or not to require it.
    6769
    68 \section{Schedulers in Production}\label{existing:prod}
    69 This section will show a quick overview of several schedulers which are generally available a the time of writing. While these schedulers don't necessarily represent to most recent advances in scheduling, they are what is generally accessible to programmers. As such, I believe that these schedulers are at least as relevant as those presented in published work. I chose both schedulers that operating in kernel space and in user space, as both can offer relevant insight for this project. However, I did not list any schedulers aimed for real-time applications, as these have constraints that are much stricter than what is needed for this project.
     70\section{Production Schedulers}\label{existing:prod}
     71This section presents a quick overview of several current schedulers. While these schedulers do not necessarily represent the most recent advances in scheduling, they are what is generally accessible to programmers. As such, I believe these schedulers are at least as relevant as those presented in published work. Schedulers that operate in kernel space and in user space are considered, as both can offer relevant insight for this project. However, real-time schedulers as not considered, as these have constraints that are much stricter than what is needed for this project.
    7072
    7173\subsection{Operating System Schedulers}
    72 Operating System Schedulers tend to be fairly complex schedulers, they generally support some amount of real-time, aim to balance interactive and non-interactive \glspl{at} and support for multiple users sharing hardware without requiring these users to cooperate. Here are more details on a few schedulers used in the common operating systems: Linux, FreeBsd, Microsoft Windows and Apple's OS X. The information is less complete for operating systems behind closed source.
     74Operating System Schedulers tend to be fairly complex as they generally support some amount of real-time, aim to balance interactive and non-interactive \glspl{at} and support multiple users sharing hardware without requiring these users to cooperate. Here are more details on a few schedulers used in the common operating systems: Linux, FreeBSD, Microsoft Windows and Apple's OS X. The information is less complete for operating systems with closed source.
    7375
    7476\paragraph{Linux's CFS}
    75 The default scheduler used by Linux (the Completely Fair Scheduler)\cite{MAN:linux/cfs,MAN:linux/cfs2} is a feedback scheduler based on CPU time. For each processor, it constructs a Red-Black tree of \glspl{at} waiting to run, ordering them by amount of CPU time spent. The scheduler schedules the \gls{at} that has spent the least CPU time. It also supports the concept of \newterm{Nice values}, which are effectively multiplicative factors on the CPU time spent. The ordering of \glspl{at} is also impacted by a group based notion of fairness, where \glspl{at} belonging to groups having spent less CPU time are preferred to \glspl{at} beloning to groups having spent more CPU time. Linux achieves load-balancing by regularly monitoring the system state\cite{MAN:linux/cfs/balancing} and using some heuristic on the load (currently CPU time spent in the last millisecond plus decayed version of the previous time slots\cite{MAN:linux/cfs/pelt}.).
     77The default scheduler used by Linux (the Completely Fair Scheduler)~\cite{MAN:linux/cfs,MAN:linux/cfs2} is a feedback scheduler based on CPU time. For each processor, it constructs a Red-Black tree of \glspl{at} waiting to run, ordering them by the amount of CPU time used. The \gls{at} that has used the least CPU time is scheduled. It also supports the concept of \newterm{Nice values}, which are effectively multiplicative factors on the CPU time used. The ordering of \glspl{at} is also affected by a group based notion of fairness, where \glspl{at} belonging to groups having used less CPU time are preferred to \glspl{at} belonging to groups having used more CPU time. Linux achieves load-balancing by regularly monitoring the system state~\cite{MAN:linux/cfs/balancing} and using some heuristic on the load (currently CPU time used in the last millisecond plus a decayed version of the previous time slots~\cite{MAN:linux/cfs/pelt}.).
    7678
    77 \cite{DBLP:conf/eurosys/LoziLFGQF16} shows that Linux's CFS also does work-stealing to balance the workload of each processors, but the paper argues this aspect can be improved significantly. The issues highlighted sem to stem from Linux's need to support fairness across \glspl{at} \emph{and} across users\footnote{Enforcing fairness across users means, for example, that given two users: one with a single \gls{at} and the other with one thousand \glspl{at}, the user with a single \gls{at} does not receive one one thousandth of the CPU time.}, increasing the complexity.
     79\cite{DBLP:conf/eurosys/LoziLFGQF16} shows that Linux's CFS also does work stealing to balance the workload of each processors, but the paper argues this aspect can be improved significantly. The issues highlighted stem from Linux's need to support fairness across \glspl{at} \emph{and} across users\footnote{Enforcing fairness across users means that given two users, one with a single \gls{at} and the other with one thousand \glspl{at}, the user with a single \gls{at} does not receive one thousandth of the CPU time.}, increasing the complexity.
    7880
    79 Linux also offers a FIFO scheduler, a real-time schedulerwhich runs the highest-priority \gls{at}, and a round-robin scheduler, which is an extension of the fifo-scheduler that adds fixed time slices. \cite{MAN:linux/sched}
     81Linux also offers a FIFO scheduler, a real-time scheduler, which runs the highest-priority \gls{at}, and a round-robin scheduler, which is an extension of the FIFO-scheduler that adds fixed time slices. \cite{MAN:linux/sched}
    8082
    8183\paragraph{FreeBSD}
    82 The ULE scheduler used in FreeBSD\cite{DBLP:conf/bsdcon/Roberson03} is a feedback scheduler similar to Linux's CFS. It uses different data structures and heuristics but also schedules according to some combination of CPU time spent and niceness values. It also periodically balances the load of the system(according to a different heuristic), but uses a simpler Work Stealing approach.
     84The ULE scheduler used in FreeBSD\cite{DBLP:conf/bsdcon/Roberson03} is a feedback scheduler similar to Linux's CFS. It uses different data structures and heuristics but also schedules according to some combination of CPU time used and niceness values. It also periodically balances the load of the system (according to a different heuristic), but uses a simpler work stealing approach.
    8385
    8486\paragraph{Windows(OS)}
    85 Microsoft's Operating System's Scheduler\cite{MAN:windows/scheduler} is a feedback scheduler with priorities. It supports 32 levels of priorities, some of which are reserved for real-time and prviliged applications. It schedules \glspl{at} based on the highest priorities (lowest number) and how much cpu time each \glspl{at} have used. The scheduler may also temporarily adjust priorities after certain effects like the completion of I/O requests.
     87Microsoft's Operating System's Scheduler~\cite{MAN:windows/scheduler} is a feedback scheduler with priorities. It supports 32 levels of priorities, some of which are reserved for real-time and privileged applications. It schedules \glspl{at} based on the highest priorities (lowest number) and how much CPU time each \gls{at} has used. The scheduler may also temporarily adjust priorities after certain effects like the completion of I/O requests.
    8688
    8789\todo{load balancing}
     
    101103\subsection{User-Level Schedulers}
    102104By comparison, user level schedulers tend to be simpler, gathering fewer metrics and avoid complex notions of fairness. Part of the simplicity is due to the fact that all \glspl{at} have the same user, and therefore cooperation is both feasible and probable.
    103 \paragraph{Go}
    104 Go's scheduler uses a Randomized Work Stealing algorithm that has a global runqueue(\emph{GRQ}) and each processor(\emph{P}) has both a fixed-size runqueue(\emph{LRQ}) and a high-priority next ``chair'' holding a single element.\cite{GITHUB:go,YTUBE:go} Preemption is present, but only at function call boundaries.
     105
     106\paragraph{Go}\label{GoSafePoint}
     107Go's scheduler uses a randomized work-stealing algorithm that has a global run-queue (\emph{GRQ}) and each processor (\emph{P}) has both a fixed-size run-queue (\emph{LRQ}) and a high-priority next ``chair'' holding a single element~\cite{GITHUB:go,YTUBE:go}. Preemption is present, but only at safe-points,~\cit{https://go.dev/src/runtime/preempt.go} which are inserted detection code at various frequent access boundaries.
    105108
    106109The algorithm is as follows :
    107110\begin{enumerate}
    108         \item Once out of 61 times, directly pick 1 element from the \emph{GRQ}.
     111        \item Once out of 61 times, pick 1 element from the \emph{GRQ}.
    109112        \item If there is an item in the ``chair'' pick it.
    110113        \item Else pick an item from the \emph{LRQ}.
    111         \item If it was empty steal (len(\emph{GRQ}) / \#of\emph{P}) + 1 items (max 256) from the \emph{GRQ}.
    112         \item If it was empty steal \emph{half} the \emph{LRQ} of another \emph{P} chosen randomly.
     114        \begin{itemize}
     115        \item If it is empty steal (len(\emph{GRQ}) / \#of\emph{P}) + 1 items (max 256) from the \emph{GRQ}
     116        \item and steal \emph{half} the \emph{LRQ} of another \emph{P} chosen randomly.
     117        \end{itemize}
    113118\end{enumerate}
    114119
    115120\paragraph{Erlang}
    116 Erlang is a functionnal language that supports concurrency in the form of processes, threads that share no data. It seems to be some kind of Round-Robin Scheduler. It currently uses some mix of Work Sharing and Work Stealing to achieve load balancing\cite{:erlang}, where underloaded workers steal from other workers, but overloaded workers also push work to other workers. This migration logic seems to be directed by monitoring logic that evaluates the load a few times per seconds.
     121Erlang is a functional language that supports concurrency in the form of processes: threads that share no data. It uses a kind of round-robin scheduler, with a mix of work sharing and stealing to achieve load balancing~\cite{:erlang}, where under-loaded workers steal from other workers, but overloaded workers also push work to other workers. This migration logic is directed by monitoring logic that evaluates the load a few times per seconds.
    117122
    118123\paragraph{Intel\textregistered ~Threading Building Blocks}
    119 \newterm{Thread Building Blocks}(TBB) is Intel's task parellelism\cite{wiki:taskparallel} framework. It runs \newterm{jobs}, uninterruptable \glspl{at}, schedulable objects that must always run to completion, on a pool of worker threads. TBB's scheduler is a variation of Randomized Work Stealing that also supports higher-priority graph-like dependencies\cite{MAN:tbb/scheduler}. It schedules \glspl{at} as follows (where \textit{t} is the last \gls{at} completed):
     124\newterm{Thread Building Blocks} (TBB) is Intel's task parallelism \cite{wiki:taskparallel} framework. It runs \newterm{jobs}, which are uninterruptable \glspl{at} that must always run to completion, on a pool of worker threads. TBB's scheduler is a variation of randomized work-stealing that also supports higher-priority graph-like dependencies~\cite{MAN:tbb/scheduler}. It schedules \glspl{at} as follows (where \textit{t} is the last \gls{at} completed):
    120125\begin{displayquote}
    121126        \begin{enumerate}
    122127                \item The task returned by \textit{t}\texttt{.execute()}
    123128                \item The successor of t if \textit{t} was its last completed predecessor.
    124                 \item A task popped from the end of the threads own deque.
     129                \item A task popped from the end of the thread's own deque.
    125130                \item A task with affinity for the thread.
    126131                \item A task popped from approximately the beginning of the shared queue.
    127                 \item A task popped from the beginning of another randomly chosen threads deque.
     132                \item A task popped from the beginning of another randomly chosen thread's deque.
    128133        \end{enumerate}
    129134
     
    134139
    135140\paragraph{Quasar/Project Loom}
    136 Java has two projects that are attempting to introduce lightweight threading into java in the form of Fibers, Quasar\cite{MAN:quasar} and Project Loom\cite{MAN:project-loom}\footnote{It is unclear to me if these are distinct projects or not}. Both projects seem to be based on the \texttt{ForkJoinPool} in Java which appears to be a simple incarnation of Randomized Work Stealing\cite{MAN:java/fork-join}.
     141Java has two projects, Quasar~\cite{MAN:quasar} and Project Loom~\cite{MAN:project-loom}\footnote{It is unclear if these are distinct projects.}, that are attempting to introduce lightweight thread\-ing in the form of Fibers. Both projects seem to be based on the \texttt{ForkJoinPool} in Java, which appears to be a simple incarnation of randomized work-stealing~\cite{MAN:java/fork-join}.
    137142
    138143\paragraph{Grand Central Dispatch}
    139 This is an API produce by Apple\cit{Official GCD source} that offers task parellelism\cite{wiki:taskparallel}. Its distinctive aspect is that it uses multiple ``Dispatch Queues'', some of which are created by programmers. These queues each have their own local ordering guarantees, \eg \glspl{at} on queue $A$ are executed in \emph{FIFO} order.
     144An Apple\cit{Official GCD source} API that offers task parallelism~\cite{wiki:taskparallel}. Its distinctive aspect is multiple ``Dispatch Queues'', some of which are created by programmers.  Each queue has its own local ordering guarantees, \eg \glspl{at} on queue $A$ are executed in \emph{FIFO} order.
    140145
    141146\todo{load balancing and scheduling}
     
    143148% http://web.archive.org/web/20090920043909/http://images.apple.com/macosx/technology/docs/GrandCentral_TB_brief_20090903.pdf
    144149
    145 In terms of semantics, the Dispatch Queues seem to be very similar in semantics to Intel\textregistered ~TBB \texttt{execute()} and predecessor semantics. Where it would be possible to convert from one to the other.
     150In terms of semantics, the Dispatch Queues seem to be very similar in semantics to Intel\textregistered ~TBB \texttt{execute()} and predecessor semantics. % Where it would be possible to convert from one to the other.
    146151
    147152\paragraph{LibFibre}
    148 LibFibre\cite{DBLP:journals/pomacs/KarstenB20} is a light-weight user-level threading framework developt at the University of Waterloo. Similarly to Go, it uses a variation of Work Stealing with a global queue that is higher priority than stealing. Unlock Go it does not have the high-priority next ``chair'' and does not use Randomized Work Stealing.
    149 
     153LibFibre~\cite{DBLP:journals/pomacs/KarstenB20} is a light-weight user-level threading framework developed at the University of Waterloo. Similarly to Go, it uses a variation of work stealing with a global queue that is higher priority than stealing. Unlike Go, it does not have the high-priority next ``chair'' and does not use randomized work-stealing.
  • doc/theses/thierry_delisle_PhD/thesis/text/intro.tex

    rfd365da rb01d459  
    1 \chapter*{Introduction}\label{intro}
     1\chapter{Introduction}\label{intro}
    22\todo{A proper intro}
    33
    4 The C programming language~\cite{C11}
     4Any shared system needs scheduling.
     5Computer systems share multiple resources across many threads of execution, even on single user computers like a laptop.
     6Scheduling occurs at discreet points when there are transitions in a system.
     7For example, a thread cycles through the following transitions during its execution.
     8\begin{center}
     9\input{executionStates.pstex_t}
     10\end{center}
     11These \newterm{state transition}s are initiated in response to events (\Index{interrupt}s):
     12\begin{itemize}
     13\item
     14entering the system (new $\rightarrow$ ready)
     15\item
     16timer alarm for preemption (running $\rightarrow$ ready)
     17\item
     18long term delay versus spinning (running $\rightarrow$ blocked)
     19\item
     20blocking ends, \ie network or I/O completion (blocked $\rightarrow$ ready)
     21\item
     22normal completion or error, \ie segment fault (running $\rightarrow$ halted)
     23\end{itemize}
     24Key to scheduling is that a thread cannot bypass the ``ready'' state during a transition so the scheduler maintains complete control of the system.
     25
     26In detail, in a computer system with multiple processors and work units, there exists the problem of mapping work onto processors in an efficient manner, called \newterm{scheduling}.
     27These systems are normally \newterm{open}, meaning new work arrives from an external source or spawned from an existing work unit.
     28Scheduling is a zero-sum game as computer processors normally have a fixed, maximum number of cycles per unit time.
     29(Frequency scaling and turbot boost add a degree of complexity that can be ignored in this discussion without loss of generality.)
     30
     31On a computer system, the scheduler takes a sequence of work requests in the form of threads and attempts to complete the work, subject to performance objectives, such as resource utilization.
     32A general-purpose dynamic-scheduler for an open system cannot anticipate future work requests, so its performance is rarely optimal.
     33Even with complete knowledge of arrive order and work, an optimal solution is NP (bin packing).
     34However, schedulers do take advantage of regularities in work patterns to produce excellent solutions.
     35Nevertheless, schedulers are a series of compromises, occasionally with some static or dynamic tuning parameters to enhance specific patterns.
     36
     37When the workload exceeds the capacity of the processors, \ie work cannot be executed immediately, it is placed on a queue for subsequent service, called a \newterm{ready queue}.
     38(In real-life, a work unit (person) can \newterm{balk}, and leave the system rather than wait.)
     39Ready queues organize threads for scheduling, which indirectly organizes the work to be performed.
     40The structure of ready queues forms a spectrum.
     41At one end is the single-queue multi-server (SIMS) and at the other end is the multi-queue multi-server (MOMS).
     42\begin{center}
     43\begin{tabular}{l|l}
     44\multicolumn{1}{c|}{\textbf{SIMS}} & \multicolumn{1}{c}{\textbf{MOMS}} \\
     45\hline
     46\raisebox{0.5\totalheight}{\input{SQMS.pstex_t}} & \input{MQMSG.pstex_t}
     47\end{tabular}
     48\end{center}
     49(While \newterm{pipeline} organizations also exist, \ie chaining schedulers together, they do not provide an advantage in this context.)
     50
     51The three major optimization criteria for a scheduler are:
     52\begin{enumerate}[leftmargin=*]
     53\item
     54\newterm{load balancing}: available work is distributed so no processor is idle when work is available.
     55
     56\noindent
     57Eventual progress for each work unit is often an important consideration, \ie no starvation.
     58\item
     59\newterm{affinity}: processors access state through a complex memory hierarchy, so it is advantageous to keep a work unit's state on a single or closely bound set of processors.
     60
     61\noindent
     62Essentially, all multi-processor computers have non-uniform memory access (NUMB), with one or more quantized steps to access data at different levels (steps) in the memory hierarchy.
     63When a system has a large number of independently executing threads, affinity is impossible because of \newterm{thread churn}.
     64That is, threads must be scheduled on multiple processors to obtain high processors utilization because the number of threads $\ggg$ processors.
     65
     66\item
     67\newterm{contention}: safe access of shared objects by multiple processors requires mutual exclusion in the form of locking\footnote{
     68Lock-free data-structures is still locking because all forms of locking invoke delays.}
     69
     70\noindent
     71Locking cost and latency increases significantly with the number of processors accessing a shared object.
     72\end{enumerate}
     73
     74SIMS has perfect load-balancing but poor affinity and high contention by the processors, because of the single queue.
     75MOMS has poor load-balancing but perfect affinity and no contention, because each processor has its own queue.
     76Between these two schedulers are a host of options, \ie adding an optional global, shared ready-queue to MOMS.
     77Significant research effort has also looked at load sharing/stealing among queues, when a ready queue is too long or short, respectively.
     78These approaches attempt to perform better load-balancing at the cost of affinity and contention.
     79Load sharing/stealing schedulers attempt to push/pull work units to/from other ready queues
     80
     81Note, a computer system is often lightly loaded (or has no load);
     82any scheduler can handle this situation, \ie all schedulers are equal in this context.
     83As the workload increases, there is a point where some schedulers begin to perform better than others based on the above criteria.
     84A poorer scheduler might saturate for some reason and not be able to assign available work to idle processors, \ie processors are spinning when work is available.
     85
     86
     87\section{\CFA programming language}
     88
     89\todo{Brief discussion on \CFA features used in the thesis.}
    590
    691The \CFA programming language~\cite{cfa:frontpage,cfa:typesystem} extends the C programming language by adding modern safety and productivity features, while maintaining backwards compatibility. Among its productivity features, \CFA supports user-level threading~\cite{Delisle21} allowing programmers to write modern concurrent and parallel programs.
     
    994
    1095As a research project, this work builds exclusively on newer versions of the Linux operating-system and gcc/clang compilers. While \CFA is released, supporting older versions of Linux ($<$~Ubuntu 16.04) and gcc/clang compilers ($<$~gcc 6.0) is not a goal of this work.
     96
     97
     98\section{Contributions}
     99\label{s:Contributions}
     100
     101This work provides the following contributions in the area of user-level scheduling in an advanced programming-language runtime-system:
     102\begin{enumerate}[leftmargin=*]
     103\item
     104Guarantee no thread or set of threads can conspire to prevent other threads from executing.
     105\begin{itemize}
     106\item
     107There must exist some form of preemption to prevent CPU-bound threads from gaining exclusive access to one or more processors.
     108\item
     109There must be a scheduler guarantee that threads are executed after CPU-bound threads are preempted.
     110Otherwise, CPU-bound threads can immediately rescheduled, negating the preemption.
     111\end{itemize}
     112Hence, the runtime/scheduler provides a notion of fairness that is impossible for a programmer to violate.
     113\item
     114Provide comprehensive scheduling across all forms of preemption and blocking, where threads move from the running to ready or blocked states.
     115
     116Once a thread stops running, the processor executing it must be rescheduled, if possible.
     117Knowing what threads are in the ready state for scheduling is difficult because blocking operations complete asynchronous and with poor notification.
     118\item
     119Efficiently deal with unbalanced workloads among processors.
     120
     121Simpler to
     122\item
     123Efficiently deal with idle processors when there is less work than the available computing capacity.
     124\end{enumerate}
  • doc/theses/thierry_delisle_PhD/thesis/text/runtime.tex

    rfd365da rb01d459  
    22This chapter presents an overview of the capabilities of the \CFA runtime prior to this thesis work.
    33
    4 \Celeven introduced threading features, such the @_Thread_local@ storage class, and libraries @stdatomic.h@ and @threads.h@. Interestingly, almost a decade after the \Celeven standard, the most recent versions of gcc, clang, and msvc do not support the \Celeven include @threads.h@, indicating no interest in the C11 concurrency approach (possibly because of the recent effort to add concurrency to \CC). While the \Celeven standard does not state a threading model, the historical association with pthreads suggests implementations would adopt kernel-level threading (1:1)~\cite{ThreadModel}, as for \CC. This model uses \glspl{kthrd} to achieve parallelism and concurrency. In this model, every thread of computation maps to an object in the kernel. The kernel then has the responsibility of managing these threads, \eg creating, scheduling, blocking. This also entails that the kernel has a perfect view of every thread executing in the system\footnote{This is not completely true due to primitives like \lstinline|futex|es, which have a significant portion of their logic in user space.}.
     4\section{C Threading}
     5
     6\Celeven introduced threading features, such the @_Thread_local@ storage class, and libraries @stdatomic.h@ and @threads.h@. Interestingly, almost a decade after the \Celeven standard, the most recent versions of gcc, clang, and msvc do not support the \Celeven include @threads.h@, indicating no interest in the C11 concurrency approach (possibly because of the recent effort to add concurrency to \CC). While the \Celeven standard does not state a threading model, the historical association with pthreads suggests implementations would adopt kernel-level threading (1:1)~\cite{ThreadModel}, as for \CC. This model uses \glspl{kthrd} to achieve parallelism and concurrency. In this model, every thread of computation maps to an object in the kernel. The kernel then has the responsibility of managing these threads, \eg creating, scheduling, blocking. A consequence of this approach is that the kernel has a perfect view of every thread executing in the system\footnote{This is not completely true due to primitives like \lstinline|futex|es, which have a significant portion of their logic in user space.}.
    57
    68\section{M:N Threading}\label{prev:model}
     
    810Threading in \CFA is based on \Gls{uthrding}, where \glspl{thrd} are the representation of a unit of work. As such, \CFA programmers should expect these units to be fairly inexpensive, \ie programmers should be able to create a large number of \glspl{thrd} and switch among \glspl{thrd} liberally without many concerns for performance.
    911
    10 The \CFA M:N threading models is implemented using many user-level threads mapped onto fewer \glspl{kthrd}. The user-level threads have the same semantic meaning as a \glspl{kthrd} in the 1:1 model: they represent an independent thread of execution with its own stack. The difference is that user-level threads do not have a corresponding object in the kernel, they are handled by the runtime in user space and scheduled onto \glspl{kthrd}, referred to as \glspl{proc} in this document. \Glspl{proc} run a \gls{thrd} until it context switches out, it then chooses a different \gls{thrd} to run.
     12The \CFA M:N threading models is implemented using many user-level threads mapped onto fewer \glspl{kthrd}. The user-level threads have the same semantic meaning as a \glspl{kthrd} in the 1:1 model: they represent an independent thread of execution with its own stack. The difference is that user-level threads do not have a corresponding object in the kernel; they are handled by the runtime in user space and scheduled onto \glspl{kthrd}, referred to as \glspl{proc} in this document. \Glspl{proc} run a \gls{thrd} until it context switches out, it then chooses a different \gls{thrd} to run.
    1113
    1214\section{Clusters}
    13 \CFA allows the option to group user-level threading, in the form of clusters. Both \glspl{thrd} and \glspl{proc} belong to a specific cluster. \Glspl{thrd} are only scheduled onto \glspl{proc} in the same cluster and scheduling is done independently of other clusters. Figure~\ref{fig:system} shows an overview of the \CFA runtime, which allows programmers to tightly control parallelism. It also opens the door to handling effects like NUMA, by pining clusters to a specific NUMA node\footnote{This is not currently implemented in \CFA, but the only hurdle left is creating a generic interface for cpu masks.}.
     15\CFA allows the option to group user-level threading, in the form of clusters. Both \glspl{thrd} and \glspl{proc} belong to a specific cluster. \Glspl{thrd} are only scheduled onto \glspl{proc} in the same cluster and scheduling is done independently of other clusters. Figure~\ref{fig:system} shows an overview of the \CFA runtime, which allows programmers to tightly control parallelism. It also opens the door to handling effects like NUMA, by pinning clusters to a specific NUMA node\footnote{This capability is not currently implemented in \CFA, but the only hurdle left is creating a generic interface for CPU masks.}.
    1416
    1517\begin{figure}
     
    1719                \input{system.pstex_t}
    1820        \end{center}
    19         \caption[Overview of the \CFA runtime]{Overview of the \CFA runtime \newline \Glspl{thrd} are scheduled inside a particular cluster, where it only runs on the \glspl{proc} which belong to the cluster. The discrete-event manager, which handles preemption and timeout, is a \gls{kthrd} which lives outside any cluster and does not run \glspl{thrd}.}
     21        \caption[Overview of the \CFA runtime]{Overview of the \CFA runtime \newline \Glspl{thrd} are scheduled inside a particular cluster and run on the \glspl{proc} that belong to the cluster. The discrete-event manager, which handles preemption and timeout, is a \gls{proc} that lives outside any cluster and does not run \glspl{thrd}.}
    2022        \label{fig:system}
    2123\end{figure}
     
    2830
    2931\begin{quote}
    30 Given a simple network program with 2 \glspl{thrd} and a single \gls{proc}, one \gls{thrd} sends network requests to a server and the other \gls{thrd} waits for a response from the server. If the second \gls{thrd} races ahead, it may wait for responses to requests that have not been sent yet. In theory, this should not be a problem, even if the second \gls{thrd} waits, because the first \gls{thrd} is still ready to run and should be able to get CPU time to send the request. With M:N threading, while the first \gls{thrd} is ready, the lone \gls{proc} \emph{cannot} run the first \gls{thrd} if it is blocked in the \glsxtrshort{io} operation of the second \gls{thrd}. If this happen, the system is in a synchronization deadlock\footnote{In this example, the deadlocked could be resolved if the server sends unprompted messages to the client. However, this solution is not general and may not be appropriate even in this simple case.}.
     32Given a simple network program with 2 \glspl{thrd} and a single \gls{proc}, one \gls{thrd} sends network requests to a server and the other \gls{thrd} waits for a response from the server. If the second \gls{thrd} races ahead, it may wait for responses to requests that have not been sent yet. In theory, this should not be a problem, even if the second \gls{thrd} waits, because the first \gls{thrd} is still ready to run and should be able to get CPU time to send the request. With M:N threading, while the first \gls{thrd} is ready, the lone \gls{proc} \emph{cannot} run the first \gls{thrd} if it is blocked in the \glsxtrshort{io} operation of the second \gls{thrd}. If this happen, the system is in a synchronization deadlock\footnote{In this example, the deadlock could be resolved if the server sends unprompted messages to the client. However, this solution is neither general nor appropriate even in this simple case.}.
    3133\end{quote}
    3234
    33 Therefore, one of the objective of this work is to introduce \emph{User-Level \glsxtrshort{io}}, like \glslink{uthrding}{User-Level \emph{Threading}} blocks \glspl{thrd} rather than \glspl{proc} when doing \glsxtrshort{io} operations, which entails multiplexing the \glsxtrshort{io} operations of many \glspl{thrd} onto fewer \glspl{proc}. This multiplexing requires that a single \gls{proc} be able to execute multiple \glsxtrshort{io} operations in parallel. This requirement cannot be done with operations that block \glspl{proc}, \ie \glspl{kthrd}, since the first operation would prevent starting new operations for its blocking duration. Executing \glsxtrshort{io} operations in parallel requires \emph{asynchronous} \glsxtrshort{io}, sometimes referred to as \emph{non-blocking}, since the \gls{kthrd} does not block.
     35Therefore, one of the objective of this work is to introduce \emph{User-Level \glsxtrshort{io}}, which like \glslink{uthrding}{User-Level \emph{Threading}}, blocks \glspl{thrd} rather than \glspl{proc} when doing \glsxtrshort{io} operations. This feature entails multiplexing the \glsxtrshort{io} operations of many \glspl{thrd} onto fewer \glspl{proc}. The multiplexing requires a single \gls{proc} to execute multiple \glsxtrshort{io} operations in parallel. This requirement cannot be done with operations that block \glspl{proc}, \ie \glspl{kthrd}, since the first operation would prevent starting new operations for its blocking duration. Executing \glsxtrshort{io} operations in parallel requires \emph{asynchronous} \glsxtrshort{io}, sometimes referred to as \emph{non-blocking}, since the \gls{kthrd} does not block.
    3436
    35 \section{Interoperating with \texttt{C}}
     37\section{Interoperating with C}
    3638While \glsxtrshort{io} operations are the classical example of operations that block \glspl{kthrd}, the non-blocking challenge extends to all blocking system-calls. The POSIX standard states~\cite[\S~2.9.1]{POSIX17}:
    3739\begin{quote}
    38 All functions defined by this volume of POSIX.1-2017 shall be thread-safe, except that the following functions1 need not be thread-safe. ... (list of 70+ potentially excluded functions)
     40All functions defined by this volume of POSIX.1-2017 shall be thread-safe, except that the following functions need not be thread-safe. ... (list of 70+ excluded functions)
    3941\end{quote}
    40 Only UNIX @man@ pages identify whether or not a library function is thread safe, and hence, may block on a pthread lock or system call; hence interoperability with UNIX library functions is a challenge for an M:N threading model.
     42Only UNIX @man@ pages identify whether or not a library function is thread safe, and hence, may block on a pthreads lock or system call; hence interoperability with UNIX library functions is a challenge for an M:N threading model.
    4143
    4244Languages like Go and Java, which have strict interoperability with C\cit{JNI, GoLang with C}, can control operations in C by ``sandboxing'' them, \eg a blocking function may be delegated to a \gls{kthrd}. Sandboxing may help towards guaranteeing that the kind of deadlock mentioned above does not occur.
     
    4547\begin{enumerate}
    4648        \item Precisely identifying blocking C calls is difficult.
    47         \item Introducing control points code can have a significant impact on general performance.
     49        \item Introducing safe-point code (see Go~page~\pageref{GoSafePoint}) can have a significant impact on general performance.
    4850\end{enumerate}
    49 Because of these consequences, this work does not attempt to ``sandbox'' calls to C. Therefore, it is possible calls from an unidentified library will block a \gls{kthrd} leading to deadlocks in \CFA's M:N threading model, which would not occur in a traditional 1:1 threading model. Currently, all M:N thread systems interacting with UNIX without sandboxing suffer from this problem but manage to work very well in the majority of applications. Therefore, a complete solution to this problem is outside the scope of this thesis.
     51Because of these consequences, this work does not attempt to ``sandbox'' calls to C. Therefore, it is possible calls to an unknown library function can block a \gls{kthrd} leading to deadlocks in \CFA's M:N threading model, which would not occur in a traditional 1:1 threading model. Currently, all M:N thread systems interacting with UNIX without sandboxing suffer from this problem but manage to work very well in the majority of applications. Therefore, a complete solution to this problem is outside the scope of this thesis.\footnote{\CFA does provide a pthreads emulation, so any library function using embedded pthreads locks are redirected to \CFA user-level locks. This capability further reduces the chances of blocking a \gls{kthrd}.}
  • libcfa/src/heap.cfa

    rfd365da rb01d459  
    509509        checkHeader( header < (Heap.Storage.Header *)heapBegin || (Heap.Storage.Header *)heapEnd < header, name, addr ); // bad address ? (offset could be + or -)
    510510
    511         Heap * homeManager;
    512511        if ( unlikely( freeHead == 0p || // freed and only free-list node => null link
    513512                                   // freed and link points at another free block not to a bucket in the bucket array.
  • src/AST/module.mk

    rfd365da rb01d459  
    3737        AST/Init.cpp \
    3838        AST/Init.hpp \
     39        AST/Inspect.cpp \
     40        AST/Inspect.hpp \
    3941        AST/Label.hpp \
    4042        AST/LinkageSpec.cpp \
  • src/CodeGen/CodeGenerator.cc

    rfd365da rb01d459  
    294294                                } else {
    295295                                        if ( obj->get_init() ) {
    296                                                 obj->get_init()->accept( *visitor ); 
     296                                                obj->get_init()->accept( *visitor );
    297297                                        } else {
    298298                                                // Should not reach here!
     
    683683                extension( variableExpr );
    684684                const OperatorInfo * opInfo;
    685                 if ( variableExpr->get_var()->get_linkage() == LinkageSpec::Intrinsic && (opInfo = operatorLookup( variableExpr->get_var()->get_name() )) && opInfo->type == OT_CONSTANT ) {
     685                if( dynamic_cast<ZeroType*>( variableExpr->get_var()->get_type() ) ) {
     686                        output << "0";
     687                } else if ( variableExpr->get_var()->get_linkage() == LinkageSpec::Intrinsic && (opInfo = operatorLookup( variableExpr->get_var()->get_name() )) && opInfo->type == OT_CONSTANT ) {
    686688                        output << opInfo->symbol;
    687689                } else {
    688                         // if (dynamic_cast<EnumInstType *>(variableExpr->get_var()->get_type()) 
     690                        // if (dynamic_cast<EnumInstType *>(variableExpr->get_var()->get_type())
    689691                        // && dynamic_cast<EnumInstType *>(variableExpr->get_var()->get_type())->baseEnum->base) {
    690692                        //      output << '(' <<genType(dynamic_cast<EnumInstType *>(variableExpr->get_var()->get_type())->baseEnum->base, "", options) << ')';
  • src/GenPoly/Box.cc

    rfd365da rb01d459  
    12771277                        FunctionType * ftype = functionDecl->type;
    12781278                        if ( ! ftype->returnVals.empty() && functionDecl->statements ) {
    1279                                 if ( ! isPrefix( functionDecl->name, "_thunk" ) && ! isPrefix( functionDecl->name, "_adapter" ) ) { // xxx - remove check for prefix once thunks properly use ctor/dtors
     1279                                // intrinsic functions won't be using the _retval so no need to generate it.
     1280                                if ( functionDecl->linkage != LinkageSpec::Intrinsic && !isPrefix( functionDecl->name, "_thunk" ) && ! isPrefix( functionDecl->name, "_adapter" ) ) { // xxx - remove check for prefix once thunks properly use ctor/dtors
    12801281                                        assert( ftype->returnVals.size() == 1 );
    12811282                                        DeclarationWithType * retval = ftype->returnVals.front();
  • src/Validate/Autogen.cpp

    rfd365da rb01d459  
    2828#include "AST/DeclReplacer.hpp"
    2929#include "AST/Expr.hpp"
     30#include "AST/Inspect.hpp"
    3031#include "AST/Pass.hpp"
    3132#include "AST/Stmt.hpp"
     
    121122
    122123        // Built-ins do not use autogeneration.
    123         bool shouldAutogen() const final { return !decl->linkage.is_builtin; }
     124        bool shouldAutogen() const final { return !decl->linkage.is_builtin && !structHasFlexibleArray(decl); }
    124125private:
    125126        void genFuncBody( ast::FunctionDecl * decl ) final;
     
    183184        {
    184185                // TODO: These functions are somewhere between instrinsic and autogen,
    185                 // could possibly use a new linkage type. For now we just make them
    186                 // intrinsic to code-gen them as C assignments.
    187                 proto_linkage = ast::Linkage::Intrinsic;
     186                // could possibly use a new linkage type. For now we just make the
     187                // basic ones intrinsic to code-gen them as C assignments.
     188                const auto & real_type = decl->base;
     189                const auto & basic = real_type.as<ast::BasicType>();
     190                if(!real_type || (basic && basic->isInteger())) proto_linkage = ast::Linkage::Intrinsic;
    188191        }
    189192
     
    402405        auto retval = srcParam();
    403406        retval->name = "_ret";
    404         // xxx - Adding this unused attribute can slience unused variable warning
    405         // However, some code might not be compiled as expected
    406         // Temporarily disabled
    407         // retval->attributes.push_back(new ast::Attribute("unused"));
    408407        return genProto( "?=?", { dstParam(), srcParam() }, { retval } );
    409408}
  • tests/.expect/attributes.nast.arm64.txt

    rfd365da rb01d459  
    13341334    }
    13351335    inline enum __anonymous4 _X16_operator_assignFM12__anonymous4_M12__anonymous4M12__anonymous4_intrinsic___2(enum __anonymous4 *_X4_dstM12__anonymous4_2, enum __anonymous4 _X4_srcM12__anonymous4_2){
    1336         enum __anonymous4 _X4_retM12__anonymous4_2;
    13371336        {
    13381337            ((void)((*_X4_dstM12__anonymous4_2)=_X4_srcM12__anonymous4_2));
  • tests/.expect/attributes.nast.x64.txt

    rfd365da rb01d459  
    13341334    }
    13351335    inline enum __anonymous4 _X16_operator_assignFM12__anonymous4_M12__anonymous4M12__anonymous4_intrinsic___2(enum __anonymous4 *_X4_dstM12__anonymous4_2, enum __anonymous4 _X4_srcM12__anonymous4_2){
    1336         enum __anonymous4 _X4_retM12__anonymous4_2;
    13371336        {
    13381337            ((void)((*_X4_dstM12__anonymous4_2)=_X4_srcM12__anonymous4_2));
  • tests/.expect/attributes.nast.x86.txt

    rfd365da rb01d459  
    13341334    }
    13351335    inline enum __anonymous4 _X16_operator_assignFM12__anonymous4_M12__anonymous4M12__anonymous4_intrinsic___2(enum __anonymous4 *_X4_dstM12__anonymous4_2, enum __anonymous4 _X4_srcM12__anonymous4_2){
    1336         enum __anonymous4 _X4_retM12__anonymous4_2;
    13371336        {
    13381337            ((void)((*_X4_dstM12__anonymous4_2)=_X4_srcM12__anonymous4_2));
  • tests/.expect/attributes.oast.x64.txt

    rfd365da rb01d459  
    13341334    }
    13351335    inline enum __anonymous4 _X16_operator_assignFM12__anonymous4_M12__anonymous4M12__anonymous4_intrinsic___2(enum __anonymous4 *_X4_dstM12__anonymous4_2, enum __anonymous4 _X4_srcM12__anonymous4_2){
    1336         enum __anonymous4 _X4_retM12__anonymous4_2;
    13371336        {
    13381337            ((void)((*_X4_dstM12__anonymous4_2)=_X4_srcM12__anonymous4_2));
Note: See TracChangeset for help on using the changeset viewer.