Changeset 0ab3b73


Ignore:
Timestamp:
Oct 30, 2020, 12:36:35 PM (4 years ago)
Author:
Thierry Delisle <tdelisle@…>
Branches:
ADT, arm-eh, ast-experimental, enum, forall-pointer-decay, jacob/cs343-translation, master, new-ast-unique-expr, pthread-emulation, qualifiedEnum
Children:
51e7583
Parents:
99581ee (diff), f7e4f8e8 (diff)
Note: this is a merge changeset, the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.
Message:

Merge branch 'master' of plg.uwaterloo.ca:software/cfa/cfa-cc

Location:
doc/theses/thierry_delisle_PhD
Files:
2 added
12 edited

Legend:

Unmodified
Added
Removed
  • doc/theses/thierry_delisle_PhD/comp_II/presentation.tex

    r99581ee r0ab3b73  
    3636        \miniframeson
    3737}
    38 \section{\CFA and Concurrency}
     38\section{Concurrency and \CFA}
     39\begin{frame}{Project}
     40        \begin{center}
     41                {\large Produce a scheduler for \CFA that is simple for programmers to understand and offers good general performance.}
     42        \end{center}
     43\end{frame}
     44%------------------------------
    3945\begin{frame}{\CFA}
     46        \CFA is a modern extension of C.
     47        It adds to C : overloading, constructors/destructors, polymorphism, and much more.
     48
     49        ~\newline
     50        For this project, the relevant aspects are:
     51        \begin{itemize}
     52                \item Fast and safe system language.
     53                \item Threading.
     54                \item Manual memory management.
     55        \end{itemize}
    4056
    4157\end{frame}
     
    104120\begin{frame}{Priority Scheduling}
    105121        \begin{center}
    106         {\large
     122                {\large
    107123                        Runs all ready threads in group \textit{A} before any ready threads in group \textit{B}.
    108124                }
     
    136152
    137153        Processors begin busy for long periods can mean starvation.
     154\end{frame}
     155%------------------------------
     156\begin{frame}{Scheduling in Practice: Summary}
     157        \begin{columns}
     158                \begin{column}{0.5\textwidth}
     159                        \textbf{Feedback Scheduling}
     160                        \newline
     161
     162                        \begin{itemize}
     163                                \item Inappropriate for short lived threads.
     164                                \item Overkill for cooperating threads.\newline
     165                        \end{itemize}
     166                \end{column}
     167                \begin{column}{0.5\textwidth}
     168                        \textbf{Priority Scheduling}
     169                        \newline
     170
     171                        \begin{itemize}
     172                                \item Allows lasting starvation.\newline
     173                                \item Hard to reason about.\newline~\newline
     174                        \end{itemize}
     175                \end{column}
     176        \end{columns}
     177
     178        ~\newline
     179        ~\newline
     180        \CFA would benefit from something different.
    138181\end{frame}
    139182%==============================
     
    190233        \begin{itemize}
    191234                \item Acquire for reading for normal scheduling operations.
    192                 \item Acquire for right when resizing the array and creating/deleting internal queues.
     235                \item Acquire for writing when resizing the array and creating/deleting internal queues.
    193236        \end{itemize}
    194237\end{frame}
     
    314357        Runtime system and scheduling are still open topics.
    315358        \newline
     359        \newline
    316360
    317361        This work offers a novel runtime and scheduling package.
     362        \newline
    318363        \newline
    319364
     
    336381
    337382%------------------------------
    338 \begin{frame}{Timeline}
     383\begin{frame}{}
    339384        \begin{center}
    340385                {\large Questions?}
  • doc/theses/thierry_delisle_PhD/comp_II/presentationstyle.sty

    r99581ee r0ab3b73  
    2020\setbeamertemplate{blocks}[rounded][shadow=false]
    2121\newcommand\xrowht[2][0]{\addstackgap[.5\dimexpr#2\relax]{\vphantom{#1}}}
     22\setbeamertemplate{sections/subsections in toc}{\inserttocsectionnumber.~\inserttocsection}
    2223
    2324%==============================
     
    3637\setbeamercolor{palette primary}{bg=colbg}
    3738\setbeamercolor{palette tertiary}{fg=red}
     39\setbeamercolor{section in toc}{fg=white}
     40\setbeamercolor{subsection in toc}{fg=gray}
    3841
    3942%==============================
  • doc/theses/thierry_delisle_PhD/thesis/Makefile

    r99581ee r0ab3b73  
    1515        front \
    1616        intro \
     17        existing \
    1718        runtime \
    1819        core \
     
    2728        base \
    2829        empty \
     30        system \
    2931}
    3032
     
    3739## Define the documents that need to be made.
    3840all: thesis.pdf
    39 thesis.pdf: ${TEXTS} ${FIGURES} ${PICTURES} glossary.tex
     41thesis.pdf: ${TEXTS} ${FIGURES} ${PICTURES} glossary.tex local.bib
    4042
    4143DOCUMENT = thesis.pdf
  • doc/theses/thierry_delisle_PhD/thesis/fig/system.fig

    r99581ee r0ab3b73  
    1 #FIG 3.2  Produced by xfig version 3.2.5c
     1#FIG 3.2  Produced by xfig version 3.2.7b
    22Landscape
    33Center
     
    36361 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 4500 3600 15 15 4500 3600 4515 3615
    3737-6
    38 6 3225 4125 4650 4425
    39 6 4350 4200 4650 4350
    40 1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 4425 4275 15 15 4425 4275 4440 4290
    41 1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 4500 4275 15 15 4500 4275 4515 4290
    42 1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 4575 4275 15 15 4575 4275 4590 4290
    43 -6
    44 1 1 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3450 4275 225 150 3450 4275 3675 4425
    45 1 1 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4050 4275 225 150 4050 4275 4275 4425
    46 -6
    47 6 6675 4125 7500 4425
    48 6 7200 4200 7500 4350
    49 1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 7275 4275 15 15 7275 4275 7290 4290
    50 1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 7350 4275 15 15 7350 4275 7365 4290
    51 1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 7425 4275 15 15 7425 4275 7440 4290
    52 -6
    53 1 1 0 1 -1 -1 0 0 -1 0.000 1 0.0000 6900 4275 225 150 6900 4275 7125 4425
    54 -6
    55386 6675 3525 8025 3975
    56392 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 2
     
    79621 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3975 2850 150 150 3975 2850 4125 2850
    80631 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 7200 2775 150 150 7200 2775 7350 2775
    81 1 3 0 1 0 0 0 0 0 0.000 1 0.0000 2250 4830 30 30 2250 4830 2280 4860
     641 3 0 1 0 0 0 0 0 0.000 1 0.0000 2250 4830 30 30 2250 4830 2280 4830
    82651 3 0 1 0 0 0 0 0 0.000 1 0.0000 7200 2775 30 30 7200 2775 7230 2805
    83661 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3525 3600 150 150 3525 3600 3675 3600
    84 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3875 4800 100 100 3875 4800 3975 4800
    85 1 1 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4650 4800 150 75 4650 4800 4800 4875
     671 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4625 4838 100 100 4625 4838 4725 4838
    86682 2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 5
    8769         2400 4200 2400 3750 1950 3750 1950 4200 2400 4200
     
    153135        1 1 1.00 45.00 90.00
    154136         7875 3750 7875 2325 7200 2325 7200 2550
     1372 2 1 1 -1 -1 0 0 -1 3.000 0 0 0 0 0 5
     138         6975 4950 6750 4950 6750 4725 6975 4725 6975 4950
    1551392 2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 5
    156140         5850 4950 5850 4725 5625 4725 5625 4950 5850 4950
    157 2 2 1 1 -1 -1 0 0 -1 3.000 0 0 0 0 0 5
    158          6975 4950 6750 4950 6750 4725 6975 4725 6975 4950
    159 4 1 -1 0 0 0 10 0.0000 2 105 720 5550 4425 Processors\001
    160 4 1 -1 0 0 0 10 0.0000 2 120 1005 4200 3225 Blocked Tasks\001
    161 4 1 -1 0 0 0 10 0.0000 2 150 870 4200 3975 Ready Tasks\001
    162 4 1 -1 0 0 0 10 0.0000 2 135 1095 7350 1725 Other Cluster(s)\001
    163 4 1 -1 0 0 0 10 0.0000 2 105 840 4650 1725 User Cluster\001
    164 4 1 -1 0 0 0 10 0.0000 2 150 615 2175 3675 Manager\001
    165 4 1 -1 0 0 0 10 0.0000 2 105 990 2175 3525 Discrete-event\001
    166 4 1 -1 0 0 0 10 0.0000 2 135 795 2175 4350 preemption\001
    167 4 0 -1 0 0 0 10 0.0000 2 150 1290 2325 4875 generator/coroutine\001
    168 4 0 -1 0 0 0 10 0.0000 2 120 270 4050 4875 task\001
    169 4 0 -1 0 0 0 10 0.0000 2 105 450 7050 4875 cluster\001
    170 4 0 -1 0 0 0 10 0.0000 2 105 660 5925 4875 processor\001
    171 4 0 -1 0 0 0 10 0.0000 2 105 555 4875 4875 monitor\001
     1414 1 -1 0 0 0 10 0.0000 2 135 900 5550 4425 Processors\001
     1424 1 -1 0 0 0 10 0.0000 2 165 1170 4200 3975 Ready Threads\001
     1434 1 -1 0 0 0 10 0.0000 2 165 1440 7350 1725 Other Cluster(s)\001
     1444 1 -1 0 0 0 10 0.0000 2 135 1080 4650 1725 User Cluster\001
     1454 1 -1 0 0 0 10 0.0000 2 165 630 2175 3675 Manager\001
     1464 1 -1 0 0 0 10 0.0000 2 135 1260 2175 3525 Discrete-event\001
     1474 1 -1 0 0 0 10 0.0000 2 150 900 2175 4350 preemption\001
     1484 0 -1 0 0 0 10 0.0000 2 135 630 7050 4875 cluster\001
     1494 1 -1 0 0 0 10 0.0000 2 135 1350 4200 3225 Blocked Threads\001
     1504 0 -1 0 0 0 10 0.0000 2 135 540 4800 4875 thread\001
     1514 0 -1 0 0 0 10 0.0000 2 120 810 5925 4875 processor\001
     1524 0 -1 0 0 0 10 0.0000 2 165 1710 2325 4875 generator/coroutine\001
  • doc/theses/thierry_delisle_PhD/thesis/glossary.tex

    r99581ee r0ab3b73  
    11\makeglossaries
     2
     3% ----------------------------------
     4% Acronyms
     5\newacronym{api}{API}{Application Programming Interface}
     6\newacronym{fifo}{FIFO}{First-In, First-Out}
     7\newacronym{io}{I/O}{Input and Output}
     8\newacronym{numa}{NUMA}{Non-Uniform Memory Access}
     9\newacronym{raii}{RAII}{Resource Acquisition Is Initialization}
     10\newacronym{tls}{TLS}{Thread Local Storage}
     11
     12% ----------------------------------
     13% Definitions
     14
     15\longnewglossaryentry{thrd}
     16{name={thread}}
     17{
     18Threads created and managed inside user-space. Each thread has its own stack and its own thread of execution. User-level threads are invisible to the underlying operating system.
     19
     20\textit{Synonyms : User threads, Lightweight threads, Green threads, Virtual threads, Tasks.}
     21}
     22
     23\longnewglossaryentry{proc}
     24{name={processor}}
     25{
     26
     27}
     28
     29\longnewglossaryentry{rQ}
     30{name={ready-queue}}
     31{
     32
     33}
     34
     35\longnewglossaryentry{uthrding}
     36{name={user-level threading}}
     37{
     38
     39
     40\textit{Synonyms : User threads, Lightweight threads, Green threads, Virtual threads, Tasks.}
     41}
     42
     43% ----------------------------------
    244
    345\longnewglossaryentry{hthrd}
    446{name={hardware thread}}
    547{
    6 Threads representing the underlying hardware directly.
     48Threads representing the underlying hardware directly, \eg the CPU core, or hyper-thread if the hardware supports multiple threads of execution per core. The number of hardware threads is considered to be always fixed to a specific number determined by the hardware.
    749
    8 \textit{Synonyms : User threads, Lightweight threads, Green threads, Virtual threads, Tasks.}
    9 }
    10 
    11 \longnewglossaryentry{thrd}
    12 {name={threads}}
    13 {
    14 Threads created and managed inside user-space. Each thread has its own stack and its own thread of execution. User-level threads are invisible to the underlying operating system.
    15 
    16 \textit{Synonyms : User threads, Lightweight threads, Green threads, Virtual threads, Tasks.}
     50\textit{Synonyms : }
    1751}
    1852
     
    5791}
    5892
    59 \longnewglossaryentry{proc}
    60 {name={virtual processor}}
    61 {
    6293
    63 }
    64 
    65 \longnewglossaryentry{Q}
    66 {name={work-queue}}
    67 {
    68 
    69 }
    7094
    7195\longnewglossaryentry{at}
     
    131155}
    132156
    133 
    134 \newacronym{tls}{TLS}{Thread Local Storage}
    135 \newacronym{api}{API}{Application Program Interface}
    136 \newacronym{raii}{RAII}{Resource Acquisition Is Initialization}
    137 \newacronym{numa}{NUMA}{Non-Uniform Memory Access}
  • doc/theses/thierry_delisle_PhD/thesis/text/core.tex

    r99581ee r0ab3b73  
    11\chapter{Scheduling Core}\label{core}
    22
    3 This chapter addresses the need of scheduling on a somewhat ideal scenario
     3Before discussing scheduling in general, where it is important to address systems that are changing states, this document discusses scheduling in a somewhat ideal scenerio, where the system has reached a steady state. For this purpose, a steady state is loosely defined as a state where there are always \glspl{thrd} ready to run and but the system has the ressources necessary to accomplish the work. In short, the system is neither overloaded or underloaded.
    44
    5 \section{Existing Schedulers}
    6 \subsection{Feedback Scheduling}
     5I believe it is important to discuss the steady state first because it is the easiest case to handle and, relatedly, the case in which the best performance is to be expected. As such, when the system is either overloaded or underloaded, a common approach is to try to adapt the system to the new load and return to the steady state. Flaws in the scheduling in the steady state tend therefore to be pervasive in all states.
    76
    8 \subsection{Priority Scheduling}\label{priority}
     7\section{Design Goals}
     8As with most of the design decisions behind \CFA, the main goal is to match the expectation of the programmer, according to their probable mental model. To match these expectations, the design must offer the programmers sufficient guarantees so that, as long as the programmer respects the mental model, the system will also respect this model.
    99
    10 \subsection{Work Stealing}
     10For threading, a simple and common mental model is the ``Ideal multi-tasking CPU'' :
     11
     12\begin{displayquote}[Linux CFS\cit{https://www.kernel.org/doc/Documentation/scheduler/sched-design-CFS.txt}]
     13        {[The]} ``Ideal multi-tasking CPU'' is a (non-existent  :-)) CPU that has 100\% physical power and which can run each task at precise equal speed, in parallel, each at [an equal fraction of the] speed.  For example: if there are 2 tasks running, then it runs each at 50\% physical power --- i.e., actually in parallel.
     14\end{displayquote}
     15
     16Applied to threads, this model states that every ready \gls{thrd} immediately runs in parallel with all other ready \glspl{thrd}. While a strict implementation of this model is not feasible, programmers still have expectations about scheduling that come from this model.
     17
     18In general, the expectation at the center of this model is that ready \glspl{thrd} do not interfere with eachother but simply share the hardware. This makes it easier to reason about threading because ready \glspl{thrd} can be taken in isolation and the effect of the scheduler can be virtually ignored. This expectation of \gls{thrd} independence means the scheduler is expected to offer 2 guarantees:
     19\begin{enumerate}
     20        \item A fairness guarantee: a \gls{thrd} that is ready to run will not be prevented to do so by another thread.
     21        \item A performance guarantee: a \gls{thrd} that wants to start or stop running will not be slowed down by other threads wanting to do the same.
     22\end{enumerate}
     23
     24It is important to note that these guarantees are expected only up to a point. \Glspl{thrd} that are ready to run should not be prevented to do so, but they still need to share a limited amount of hardware. Therefore, the guarantee is considered respected if a \gls{thrd} gets access to a \emph{fair share} of the hardware, even if that share is very small.
     25
     26Similarly the performance guarantee, the lack of interferance between threads is only relevant op to a point. Ideally the cost of running and blocking would be constant regardless of contention, but the guarantee is considered satisfied if the cost is not \emph{too high} with or without contention. How much is an acceptable cost is obviously highly variable. For this document the performance experimentation will attempt to show that the cost of scheduling is not a major factor in application performance. This demonstration can be made by comparing application built in \CFA to applications built with other languages or other models. If the performance of an application built in \CFA is not meaningfully different than one built with a different runtime, then the scheduler has a negigeable impact on performance, \ie its impact can be ignored. Recall from a few paragraphs ago that the expectation of programmers is that the impact of the scheduler can be ignored. Therefore, if the cost of scheduling is not a significant portion of the runtime of several different application, I will consider the guarantee achieved.
     27
     28\todo{This paragraph should be moved later}
     29% The next step is then to decide what is considered a \emph{fair share}, \ie what metric is used to measure fairness. Since \CFA is intended to allow numerous short lived threads, I decided to avoid total CPU time as the measure of fairness. Total CPU time inherently favors new \glspl{thrd} over older ones which isn't necessarily a good thing. Instead, fairness is measured in terms of opportunities to run. This metric is more appropriate for a mix of short and long lived \glspl{thrd}.
    1130
    1231\section{Design}
    13 While avoiding the pitfalls of Feedback Scheduling is fairly easy, scheduling does not innately require feedback, avoiding prioritization of \glspl{thrd} is more difficult because of implicitly priorities, see Subsection~\ref{priority}.
     32While avoiding the pitfalls of Feedback Scheduling is fairly easy, scheduling does not innately require feedback, avoiding prioritization of \glspl{thrd} is more difficult because of implicitly priorities, see Subsection~\ref{priority}. A strictly \glsxtrshort{fifo} rea
    1433
    1534\subsection{Sharding}
     
    1938                \input{base.pstex_t}
    2039        \end{center}
    21         \caption{Relaxed FIFO list at the base of the scheduler: an array of strictly FIFO lists.
    22         The timestamp is in all nodes and cell arrays.}
     40        \caption{Relaxed FIFO list}
    2341        \label{fig:base}
     42        List at the base of the scheduler: an array of strictly FIFO lists.
     43        The timestamp is in all nodes and cell arrays.
    2444\end{figure}
    2545
     
    2848Indeed, if the number of \glspl{thrd} does not far exceed the number of queues, it is probable that several of these queues are empty.
    2949Figure~\ref{fig:empty} shows an example with 2 \glspl{thrd} running on 8 queues, where the chances of getting an empty queue is 75\% per pick, meaning two random picks yield a \gls{thrd} only half the time.
    30 This can lead to performance problems since picks that do not yield a \gls{thrd} are not useful and do not necessarily help make more informed guesses.
    31 
    32 Solutions to this problem can take many forms, but they ultimately all have to encode where the threads are in some form. My results show that the density and locality of this encoding is generally the dominating factor in these scheme.
    33 
    34 \paragraph{Dense Information}
    35 
    36 
    37 
    3850
    3951
     
    4254                \input{empty.pstex_t}
    4355        \end{center}
    44         \caption{``More empty'' state of the queue: the array contains many empty cells.}
     56        \caption{``More empty'' Relaxed FIFO list}
    4557        \label{fig:empty}
     58        Emptier state of the queue: the array contains many empty cells, that is strictly FIFO lists containing no elements.
    4659\end{figure}
     60
     61This can lead to performance problems since picks that do not yield a \gls{thrd} are not useful and do not necessarily help make more informed guesses.
     62
     63Solutions to this problem can take many forms, but they ultimately all have to encode where the threads are in some form. My results show that the density and locality of this encoding is generally the dominating factor in these scheme.
     64
     65\paragraph{Dense Information}
  • doc/theses/thierry_delisle_PhD/thesis/text/front.tex

    r99581ee r0ab3b73  
    184184\phantomsection         % allows hyperref to link to the correct page
    185185
     186% TODOs and missing citations
     187% -----------------------------
    186188\listofcits
    187189\listoftodos
     190\cleardoublepage
     191\phantomsection         % allows hyperref to link to the correct page
    188192
    189193
  • doc/theses/thierry_delisle_PhD/thesis/text/intro.tex

    r99581ee r0ab3b73  
    1 \chapter{Introduction}
     1\chapter*{Introduction}\label{intro}
     2\todo{A proper intro}
     3
     4The C programming language\cit{C}
     5
     6The \CFA programming language\cite{cfa:frontpage,cfa:typesystem} which extends the C programming language to add modern safety and productiviy features while maintaining backwards compatibility. Among it's productiviy features, \CFA introduces support for threading\cit{CFA Concurrency}, to allow programmers to write modern concurrent and parallel programming.
     7While previous work on the concurrent package of \CFA focused on features and interfaces, this thesis focuses on performance, introducing \glsxtrshort{api} changes only when required by performance considerations. More specifically, this thesis concentrates on scheduling and \glsxtrshort{io}. Prior to this work, the \CFA runtime used a strictly \glsxtrshort{fifo} \gls{rQ}.
     8
     9This work exclusively concentrates on Linux as it's operating system since the existing \CFA runtime and compiler does not already support other operating systems. Furthermore, as \CFA is yet to be released, supporting version of Linux older that the latest version is not a goal of this work.
  • doc/theses/thierry_delisle_PhD/thesis/text/io.tex

    r99581ee r0ab3b73  
    1 \chapter{I/O}
     1\chapter{User Level \glsxtrshort{io}}
     2As mentionned in Section~\ref{prev:io}, User-Level \glsxtrshort{io} requires multiplexing the \glsxtrshort{io} operations of many \glspl{thrd} onto fewer \glspl{proc} using asynchronous \glsxtrshort{io} operations. Various operating systems offer various forms of asynchronous operations and as mentioned in Chapter~\ref{intro}, this work is exclusively focuesd on Linux.
    23
    34\section{Existing options}
     5Since \glsxtrshort{io} operations are generally handled by the
    46
    57\subsection{\texttt{epoll}, \texttt{poll} and \texttt{select}}
     
    79\subsection{Linux's AIO}
    810
     11
     12
     13\begin{displayquote}
     14        AIO is a horrible ad-hoc design, with the main excuse being "other,
     15        less gifted people, made that design, and we are implementing it for
     16        compatibility because database people - who seldom have any shred of
     17        taste - actually use it".
     18
     19        But AIO was always really really ugly.
     20
     21        \begin{flushright}
     22                -- Linus Torvalds\cit{https://lwn.net/Articles/671657/}
     23        \end{flushright}
     24\end{displayquote}
     25
     26Interestingly, in this e-mail answer, Linus goes on to describe
     27``a true \textit{asynchronous system call} interface''
     28that does
     29``[an] arbitrary system call X with arguments A, B, C, D asynchronously using a kernel thread''
     30in
     31``some kind of arbitrary \textit{queue up asynchronous system call} model''.
     32This description is actually quite close to the interface of the interface described in the next section.
     33
    934\subsection{\texttt{io\_uring}}
     35A very recent addition to Linux, \texttt{io\_uring}\cit{io\_uring} is a framework that aims to solve many of the problems listed with the above mentioned solutions.
    1036
    11 \subsection{Extra Kernel Threads}
     37\subsection{Extra Kernel Threads}\label{io:morethreads}
     38Finally, if the operating system does not offer any satisfying forms of asynchronous \glsxtrshort{io} operations, a solution is to fake it by creating a pool of \glspl{kthrd} and delegating operations to them in order to avoid blocking \glspl{proc}.
    1239
    1340\subsection{Discussion}
  • doc/theses/thierry_delisle_PhD/thesis/text/practice.tex

    r99581ee r0ab3b73  
    22The scheduling algorithm discribed in Chapter~\ref{core} addresses scheduling in a stable state.
    33However, it does not address problems that occur when the system changes state.
    4 Indeed the \CFA runtime, supports expanding and shrinking
    5 
    6 the number of KTHREAD\_place
    7 
    8 , both manually and, to some extent automatically.
     4Indeed the \CFA runtime, supports expanding and shrinking the number of KTHREAD\_place \todo{add kthrd to glossary}, both manually and, to some extent automatically.
    95This entails that the scheduling algorithm must support these transitions.
    106
  • doc/theses/thierry_delisle_PhD/thesis/text/runtime.tex

    r99581ee r0ab3b73  
    11\chapter{\CFA Runtime}
     2This chapter offers an overview of the capabilities of the \CFA runtime prior to this work.
    23
    3 \section{M:N Threading}
     4Threading in \CFA offers is based on \Gls{uthrding}, where \glspl{thrd} are the representation of a unit of work. As such, \CFA programmers should expect these units to be fairly inexpensive, that is: programmers should be able to create a large number of \glspl{thrd} and switch between \glspl{thrd} liberally without many concerns for performance.
     5
     6\section{M:N Threading}\label{prev:model}
     7
     8C traditionnally uses a 1:1 threading model. This model uses \glspl{kthrd} to achive parallelism and concurrency. In this model, every thread of computation maps to an object in the kernel. The kernel then has the responsibility of managing these threads, \eg creating, scheduling, blocking. This also entails that the kernel has a perfect view of every thread executing in the system\footnote{This is not completly true due to primitives like \texttt{futex}es, which have a significant portion of their logic in user space.}.
     9
     10By contrast \CFA uses an M:N threading models, where concurrency is achieved using many user-level threads mapped onto fewer \glspl{kthrd}. The user-level threads have the same semantic meaning as a \glspl{kthrd} in the 1:1 model, they represent an independant thread of execution with it's on stack. The difference is that user-level threads do not have a corresponding object in the kernel, they are handled by the runtime in user space and scheduled onto \glspl{kthrd}, referred to as \glspl{proc} in this document. \Glspl{proc} run a \gls{thrd} until it context switches out, it then choses a different \gls{thrd} to run.
    411
    512\section{Clusters}
     13\begin{figure}
     14        \begin{center}
     15                \input{system.pstex_t}
     16        \end{center}
     17        \caption{Overview of the \CFA runtime}
     18        \label{fig:system}
     19        \Glspl{thrd} are scheduled inside a particular cluster, where it only runs on the \glspl{proc} which belong to the cluster. The discrete-event manager, which handles preemption and timeout, is a \gls{kthrd} which lives outside any cluster and does not run \glspl{thrd}.
     20\end{figure}
     21\CFA allows the option to group user-level threading, in the form of clusters. Both \glspl{thrd} and \glspl{proc} belong to a specific cluster. \Glspl{thrd} will only be scheduled onto \glspl{proc} in the same cluster and scheduling is done independantly of other clusters. Figure~\ref{fig:system} shows an overview if this system. This allows programmers to control more tightly parallelism. It also opens the door to handling effects like NUMA, by pining clusters to specific NUMA node\footnote{This is not currently implemented in \CFA, but the only hurdle left is creating a generic interface for cpu masks.}.
     22
     23\section{Scheduling}
     24The \CFA runtime was previously using a strictly \glsxtrshort{fifo} ready queue with a single lock. This setup offers perfect fairness in terms of opportunities to run/ However, it offers poor scalability, since the performance of the ready queue can never be improved by adding more \glspl{hthrd}, but the contention can cause significant performance degradation.
     25
     26\section{\glsxtrshort{io}}\label{prev:io}
     27Prior to this work, the \CFA runtime did not add any particular support for \glsxtrshort{io} operations. \CFA being built on C, this means that, while all the operations available in C are available in \CFA, \glsxtrshort{io} operations are designed for the POSIX threading model\cit{pthreads}. Using these operations in a M:N threading model, when they are built for 1:1 threading, means that operations block \glspl{proc} instead of \glspl{thrd}. While this can work in certain cases, it limits the number of concurrent operations to the number of \glspl{proc} rather than \glspl{thrd}. This also means that deadlocks can occur because all \glspl{proc} are blocked even if at least one \gls{thrd} is ready to run. A simple example of this type of deadlock would be as follows:
     28
     29Given a simple network program with 2 \glspl{thrd} and a single \gls{proc}, one \gls{thrd} sends network requests to a server and the other \gls{thrd} waits for response from the server. If the second \gls{thrd} races ahead, it may wait for responses to requests that have not been sent yet. In theory, this should not be a problem, even if the second \gls{thrd} waits, the first \gls{thrd} is still ready to run and should just be able to get CPU time and send the request. In practice with M:N threading, while the first \gls{thrd} is ready, the lone \gls{proc} in this example will \emph{not} try to run the first \gls{thrd} if it is blocked in the \glsxtrshort{io} operation of the second \gls{thrd}. If this happen, the system is effectively deadlocked\footnote{In this example, the deadlocked could be resolved if the server sends unprompted messages to the client. However, this solution is not general and may not be appropriate even in this simple case.}.
     30
     31One of the objective of this work, is to introduce \emph{User-Level \glsxtrshort{io}} which, as a parallel to \glslink{uthrding}{User-Level \emph{Threading}}, blocks \glspl{thrd} rather than \glspl{proc} when doing \glsxtrshort{io} operations. This entails multiplexing the \glsxtrshort{io} operations of many \glspl{thrd} onto fewer \glspl{proc}. This multiplexing requires that a single \gls{proc} be able to execute multiple operations in parallel. This cannot be done with operations that block \glspl{proc}, \ie \glspl{kthrd}, since the first operation would prevent starting new operations for its duration. Executing operations in parallel requires \emph{asynchronous} \glsxtrshort{io}, sometimes referred to as \emph{non-blocking}, since the \gls{kthrd} is not blocked.
    632
    733\section{Interoperating with \texttt{C}}
     34While \glsxtrshort{io} operations are the classical example of operations that block \glspl{kthrd}, the challenges mentioned in the previous section do not require \glsxtrshort{io} to be involved. These challenges are a product of blocking system calls rather than \glsxtrshort{io}. C offers no tools to identify whether or not a librairy function will lead to a blocking system call. This fact means interoperatability with C becomes a challenge in a M:N threading model.
     35
     36Languages like Go and Java, which have strict interoperatability with C\cit{JNI, GoLang with C}, can control operations in C by ``sandboxing'' them. They can, for example, delegate C operations to \glspl{kthrd} that are not \glspl{proc}. Sandboxing may help towards guaranteeing that the deadlocks mentioned in the previous section do not occur.
     37
     38As mentioned in Section~\cit{\CFA intro}, \CFA is binary compatible with C and, as such, trivially supports calls to and from C librairies. Furthermore, interoperatability can happen within a single library, through inline code or simply C and \CFA translation units archived together. The fine-grained interoperatability between C and \CFA has two consequences:
     39\begin{enumerate}
     40        \item Precisely identifying C calls that could block is difficult.
     41        \item Introducing code where interoperatability occurs could have a significant impact on general performance.
     42\end{enumerate}
     43
     44Because of these consequences, this work does not attempt to ``sandbox'' calls to C. It is possible that conflicting calls to C could lead to deadlocks on \CFA's M:N threading model where they would not in the traditionnal 1:1 threading model. However, I judge that solving this problem in general, in a way that is composable and flexible, is too complex in itself and would add too much work to this thesis. Therefore it is outside the scope of this thesis.
  • doc/theses/thierry_delisle_PhD/thesis/thesis.tex

    r99581ee r0ab3b73  
    121121% installation instructions there.
    122122
     123\usepackage{csquotes}
     124\usepackage{indentfirst} % as any self-respecting frenchman would
     125
    123126% Setting up the page margins...
    124127% uWaterloo thesis requirements specify a minimum of 1 inch (72pt) margin at the
     
    218221% separate documents, they would each start with the \chapter command, i.e,
    219222% do not contain \documentclass or \begin{document} and \end{document} commands.
     223\part{Introduction}
    220224\input{text/intro.tex}
     225\input{text/existing.tex}
    221226\input{text/runtime.tex}
     227\part{Design}
    222228\input{text/core.tex}
    223229\input{text/practice.tex}
    224230\input{text/io.tex}
     231\part{Evaluation}
     232\chapter{Theoretical and Existance Proofs}
     233\chapter{Micro-Benchmarks}
     234\chapter{Larger-Scale applications}
     235\part{Conclusion \& Annexes}
    225236
    226237%----------------------------------------------------------------------
     
    245256\addcontentsline{toc}{chapter}{\textbf{References}}
    246257
    247 \bibliography{uw-ethesis}
     258\bibliography{local}
    248259% Tip 5: You can create multiple .bib files to organize your references.
    249260% Just list them all in the \bibliogaphy command, separated by commas (no spaces).
    250261
    251 % The following statement causes the specified references to be added to the bibliography% even if they were not
    252 % cited in the text. The asterisk is a wildcard that causes all entries in the bibliographic database to be included (optional).
    253 \nocite{*}
     262% % The following statement causes the specified references to be added to the bibliography% even if they were not
     263% % cited in the text. The asterisk is a wildcard that causes all entries in the bibliographic database to be included (optional).
     264% \nocite{*}
    254265
    255266% The \appendix statement indicates the beginning of the appendices.
Note: See TracChangeset for help on using the changeset viewer.