Changeset 70cab43


Timestamp:
Sep 1, 2020, 8:06:36 PM (4 years ago)
Author:
m3zulfiq <m3zulfiq@…>
Branches:
ADT, arm-eh, ast-experimental, enum, forall-pointer-decay, jacob/cs343-translation, master, new-ast-unique-expr, pthread-emulation, qualifiedEnum
Children:
4557bcf7
Parents:
68f0c4e (diff), a77496cb (diff)
Note: this is a merge changeset, the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.
Message:

Merge branch 'master' of plg.uwaterloo.ca:software/cfa/cfa-cc

Files:
5 added
6 edited

  • doc/theses/thierry_delisle_PhD/comp_II/Makefile

    r68f0c4e r70cab43  
    1212
    1313## Define the text source files.
    14 
    15 SOURCES = ${addsuffix .tex, \
    16 comp_II \
    17 }
    18 
    1914FIGURES = ${addsuffix .tex, \
    20         base \
    21         empty \
    2215        emptybit \
    2316        emptytree \
    2417        emptytls \
    2518        resize \
    26         system \
    2719}
    2820
    2921PICTURES = ${addsuffix .pstex, \
     22        base \
     23        empty \
     24        system \
    3025}
    3126
     
    3732
    3833## Define the documents that need to be made.
     34all: comp_II.pdf presentation.pdf
     35comp_II.pdf: ${FIGURES} ${PICTURES}
     36presentation.pdf: presentationstyle.sty base.dark.pstex empty.dark.pstex system.dark.pstex
    3937
    40 DOCUMENT = comp_II.pdf
     38DOCUMENT = comp_II.pdf presentation.pdf
    4139BASE = ${basename ${DOCUMENT}}
    4240
     
    5250# File Dependencies #
    5351
    54 ${DOCUMENT} : ${BASE}.ps
     52%.pdf : build/%.ps | ${Build}
    5553        ps2pdf $<
    5654
    57 ${BASE}.ps : ${BASE}.dvi
    58         dvips ${Build}/$< -o $@
     55build/%.ps : build/%.dvi | ${Build}
     56        dvips $< -o $@
    5957
    60 ${BASE}.dvi : Makefile ${GRAPHS} ${PROGRAMS} ${PICTURES} ${FIGURES} ${SOURCES} \
    61                 ${Macros}/common.tex ${Macros}/indexstyle ../../../bibliography/pl.bib \
    62                 local.bib glossary.tex | ${Build}
     58build/%.dvi : %.tex Makefile | ${Build}
    6359        # Must have *.aux file containing citations for bibtex
    64         if [ ! -r ${basename $@}.aux ] ; then ${LaTeX} ${basename $@}.tex ; fi
    65         -${BibTeX} ${Build}/${basename $@}
     60        if [ ! -r ${basename $@}.aux ] ; then ${LaTeX} $< ; fi
     61        -${BibTeX} ${basename $@}
    6662        # Some citations reference others so run again to resolve these citations
    67         ${LaTeX} ${basename $@}.tex
    68         -${BibTeX} ${Build}/${basename $@}
     63        ${LaTeX} $<
     64        -${BibTeX} ${basename $@}
    6965        # Make index from *.aux entries and input index at end of document
    70         makeglossaries -q -s ${Build}/${basename $@}.ist ${Build}/${basename $@}
     66        -makeglossaries -q -s ${basename $@}.ist ${basename $@}
    7167        # Run again to finish citations
    72         ${LaTeX} ${basename $@}.tex
     68        ${LaTeX} $<
    7369
    7470## Define the default recipes.
     
    7773        mkdir -p ${Build}
    7874
    79 %.tex : img/%.fig ${Build}
     75%.tex : img/%.fig | ${Build}
    8076        fig2dev -L eepic $< > ${Build}/$@
    8177
     
    8783        fig2dev -L pstex_t -p ${Build}/$@ $< > ${Build}/$@_t
    8884
     85## pstex with inverted colors
     86%.dark.pstex : img/%.fig Makefile | ${Build}
     87        fig2dev -L pstex $< > ${Build}/$@
     88        sed -i 's/\/col-1 {0 setgray} bind def/\/col-1 {1 setgray} bind def/g' ${Build}/$@
     89        sed -i 's/\/col0 {0.000 0.000 0.000 srgb} bind def/\/col0 {1.000 1.000 1.000 srgb} bind def/g' ${Build}/$@
     90        sed -i 's/\/col7 {1.000 1.000 1.000 srgb} bind def/\/col7 {0.000 0.000 0.000 srgb} bind def/g' ${Build}/$@
     91        fig2dev -L pstex_t -p ${Build}/$@ $< > ${Build}/$@_t
     92
    8993# Local Variables: #
    9094# compile-command: "make" #
  • doc/theses/thierry_delisle_PhD/comp_II/comp_II.tex

    r68f0c4e r70cab43  
    6363It aims to add high-productivity features while maintaining the predictable performance of C.
    6464As such, concurrency in \CFA\cite{Delisle19} aims to offer simple and safe high-level tools while still allowing performant code.
    65 \CFA concurrent code is written in the synchronous programming paradigm but uses \glspl{uthrd} in order to achieve the simplicity and maintainability of synchronous programming without sacrificing the efficiency of asynchronous programing.
     65\CFA concurrent code is written in the synchronous programming paradigm but uses \glspl{uthrd} in order to achieve the simplicity and maintainability of synchronous programming without sacrificing the efficiency of asynchronous programming.
    6666As such, the \CFA \newterm{scheduler} is a preemptive user-level scheduler that maps \glspl{uthrd} onto \glspl{kthrd}.
    6767
     
    7575and the cost of scheduling, \ie deciding which thread to run next among all the threads ready to run.
    7676\end{enumerate}
    77 The first cost is generally constant and fixed\footnote{Affecting the constant context-switch cost is whether it is done in one step, after the scheduling, or in two steps, context-switching to a fixed third-thread before scheduling.}, while the scheduling cost can vary based on the system state.
    78 Adding multiple \glspl{kthrd} does not fundamentally change the scheduler semantics or requirements, it simply adds new correctness requirements, \ie \newterm{linearizability}\footnote{Meaning however fast the CPU threads run, there is an equivalent sequential order that gives the same result.}, and a new dimension to performance: scalability, where scheduling cost now also depends on contention.
    79 
     77The first cost is generally constant and fixed\footnote{Affecting the constant context-switch cost is whether it is done in one step, after the scheduling, or in two steps, context-switching to a third fixed thread before scheduling.}, while the scheduling cost can vary based on the system state.
     78Adding multiple \glspl{kthrd} does not fundamentally change the scheduler semantics or requirements, it simply adds new correctness requirements, \ie \newterm{linearizability}\footnote{Meaning, however fast the CPU threads run, there is an equivalent sequential order that gives the same result.}, and a new dimension to performance: scalability, where scheduling cost now also depends on contention.
    8079The more threads switch, the more the administration cost of scheduling becomes noticeable.
    8180It is therefore important to build a scheduler with the lowest possible cost and latency.
     
    8483While the illusion of simultaneity is easier to reason about, it can break down if the scheduler allows too much unfairness.
    8584Therefore, the scheduler should offer as much fairness as needed to guarantee eventual progress, but use unfairness to help performance.
    86 In practice, threads must wait in turn but there can be advantages to unfair scheduling, similar to the the express cash-register at a grocery store.
     85In practice, threads must wait in turn but there can be advantages to unfair scheduling, similar to the express cash register at a grocery store.
    8786
    8887The goal of this research is to produce a scheduler that is simple for programmers to understand and offers good performance.
     
    9392\end{quote}
    9493
    95 For a general purpose scheduler, it is impossible to produce an optimal algorithm as it would require knowledge of the future behaviour of threads.
    96 As such, scheduling performance is generally either defined by the best case scenario, \ie a workload to which the scheduler is tailored, or the worst case scenario, \ie the scheduler behaves no worst than \emph{X}.
     94For a general-purpose scheduler, it is impossible to produce an optimal algorithm as it would require knowledge of the future behaviour of threads.
     95As such, scheduling performance is generally either defined by the best-case scenario, \ie a workload to which the scheduler is tailored, or the worst-case scenario, \ie the scheduler behaves no worse than \emph{X}.
    9796For this proposal, the performance is evaluated using the second approach to allow \CFA programmers to rely on scheduling performance.
    9897Because there is no optimal scheduler, ultimately \CFA may allow programmers to write their own scheduler; but that is not the subject of this proposal, which considers only the default scheduler.
     
    101100Achieving the \CFA scheduling goal includes:
    102101\begin{enumerate}
    103 \item
    104 producing a scheduling strategy with sufficient fairness guarantees,
    105 \item
    106 creating an abstraction layer over the operating system to handle kernel-threads spinning unnecessarily,
    107 \item
    108 scheduling blocking I/O operations,
    109 \item
    110 and writing sufficient library tools to allow developers to indirectly use the scheduler, either through tuning knobs or replacing the default scheduler.
     102        \item producing a scheduling strategy with sufficient fairness guarantees,
     103        \item creating an abstraction layer over the operating system to handle kernel-threads spinning unnecessarily,
     104        \item scheduling blocking I/O operations,
     105        \item and writing sufficient library tools to allow developers to indirectly use the scheduler, either through tuning knobs or replacing the default scheduler.
    111106\end{enumerate}
    112107
     
    119114\paragraph{Correctness} As with any other concurrent data structure or algorithm, the correctness requirement is paramount.
    120115The scheduler cannot allow threads to be dropped from the ready queue, \ie scheduled but never run, or be executed multiple times when only being scheduled once.
    121 Since \CFA concurrency has no spurious wakeup, this definition of correctness also means the scheduler should have no spurious wakeup.
      116Since \CFA concurrency has no spurious wakeups, this definition of correctness also means the scheduler should have no spurious wakeups.
    122117The \CFA scheduler must be correct.
    123118
     
    130125The \CFA scheduler should offer good performance for all three metrics.
    131126
    132 \paragraph{Fairness} Like performance, this requirement has several aspect : eventual progress, predictability and performance reliability.
      127\paragraph{Fairness} Like performance, this requirement has several aspects: eventual progress, predictability, and performance reliability.
    133128\newterm{Eventual progress} guarantees every scheduled thread is eventually run, \ie preventing starvation.
    134 As a hard requirement, the \CFA scheduler must guarantee eventual progress, otherwise the above mentioned illusion of simultaneous execution is broken and the scheduler becomes much more complex to reason about.
    135 \newterm{Predictability} and \newterm{reliability} means similar workloads achieve similar performance and programmer execution intuition is respected.
    136 For example, a thread that yields aggressively should not run more often then other tasks.
     129As a hard requirement, the \CFA scheduler must guarantee eventual progress, otherwise the above-mentioned illusion of simultaneous execution is broken and the scheduler becomes much more complex to reason about.
     130\newterm{Predictability} and \newterm{reliability} mean similar workloads achieve similar performance and programmer execution intuition is respected.
     131For example, a thread that yields aggressively should not run more often than other tasks.
    137132While this is intuitive, it does not hold true for many work-stealing or feedback-based schedulers.
    138133The \CFA scheduler must guarantee eventual progress and should be predictable and offer reliable performance.
     
    150145
    151146\begin{enumerate}
    152         \item Threads live long enough for useful feedback information to be to gathered.
     147        \item Threads live long enough for useful feedback information to be gathered.
    153148        \item Threads belong to multiple users so fairness across threads is insufficient.
    154149\end{enumerate}
     
    157152Since \CFA has the explicit goal of allowing many smaller threads, this can naturally lead to threads with much shorter lifetimes that are only scheduled a few times.
    158153Scheduling strategies based on feedback cannot be effective in these cases because there is no opportunity to measure the metrics that underlie the algorithm.
    159 Note, the problem of \newterm{feedback convergence} (reacting too slowly to scheduling events) is not specific to short lived threads but can also occur with threads that show drastic changes in scheduling, \eg threads running for long periods of time and then suddenly blocking and unblocking quickly and repeatedly.
     154Note, the problem of \newterm{feedback convergence} (reacting too slowly to scheduling events) is not specific to short-lived threads but can also occur with threads that show drastic changes in scheduling, \eg threads running for long periods of time and then suddenly blocking and unblocking quickly and repeatedly.
    160155
    161156In the context of operating systems, these concerns can be overshadowed by a more pressing concern: security.
     
    164159In the case of the \CFA scheduler, every thread runs in the same user space and is controlled by the same user.
    165160Fairness across users is therefore a given and it is then possible to safely ignore the possibility that threads are malevolent.
    166 This approach allows for a much simpler fairness metric and in this proposal \emph{fairness} is defined as: when multiple threads are cycling through the system, the total ordering of threads being scheduled, \ie pushed onto the ready-queue, should not differ much from the total ordering of threads being executed, \ie popped from the ready-queue.
     161This approach allows for a much simpler fairness metric and in this proposal \emph{fairness} is defined as: when multiple threads are cycling through the system, the total ordering of threads being scheduled, \ie pushed onto the ready queue, should not differ much from the total ordering of threads being executed, \ie popped from the ready queue.
    167162
    168163Since feedback is not necessarily feasible within the lifetime of all threads and a simple fairness metric can be used, the scheduling strategy proposed for the \CFA runtime does not use per-thread feedback.
     
    172167Another broad category of schedulers is priority schedulers.
    173168In these scheduling strategies, threads have priorities and the runtime schedules the threads with the highest priority before scheduling other threads.
    174 Threads with equal priority are scheduled using a secondary strategy, often something simple like round-robin or FIFO.
     169Threads with equal priority are scheduled using a secondary strategy, often something simple like round robin or FIFO.
    175170A consequence of priority is that, as long as there is a thread with a higher priority that desires to run, a thread with a lower priority does not run.
    176171This possible starving of threads can dramatically increase programming complexity since starving threads and priority inversion (prioritizing a lower priority thread) can both lead to serious problems.
    177172
    178173An important observation is that threads do not need to have explicit priorities for problems to occur.
    179 Indeed, any system with multiple ready-queues that attempts to exhaust one queue before accessing the other queues, essentially provide implicit priority, which can encounter starvation problems.
      174Indeed, any system with multiple ready queues that attempts to exhaust one queue before accessing the other queues essentially provides implicit priority, which can encounter starvation problems.
    180175For example, a popular scheduling strategy that suffers from implicit priorities is work stealing.
    181176\newterm{Work stealing} is generally presented as follows:
     
    191186
    192187\subsection{Schedulers without feedback or priorities}
    193 This proposal conjectures that is is possible to construct a default scheduler for the \CFA runtime that offers good scalability and a simple fairness guarantee that is easy for programmers to reason about.
     188This proposal conjectures that it is possible to construct a default scheduler for the \CFA runtime that offers good scalability and a simple fairness guarantee that is easy for programmers to reason about.
    194189The simplest fairness guarantee is FIFO ordering, \ie threads scheduled first run first.
    195190However, enforcing FIFO ordering generally conflicts with scalability across multiple processors because of the additional synchronization.
     
    217212Pushing new data is done by selecting one of these underlying queues at random, recording a timestamp for the operation and pushing to the selected queue.
    218213Popping is done by selecting two queues at random and popping from the queue with the oldest timestamp.
    219 A higher number of underlying queues leads to less contention on each queue and therefore better performance.
      214A higher number of underlying queues leads to less contention on each queue and therefore better performance.
    220215In a loaded system, it is highly likely the queues are non-empty, \ie several tasks are on each of the underlying queues.
    221216This means that selecting a queue at random to pop from is highly likely to yield a queue with available items.
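
A sketch of the push and pop operations just described (C-style; spinlock_t, NQUEUES, rdtsc, and the list helpers are hypothetical names, not the actual CFA implementation):

    // Sketch only: all helpers below are hypothetical.
    struct subqueue { spinlock_t lock; struct node * head, * tail; };
    struct subqueue queues[NQUEUES];          // array of strictly FIFO lists

    void push( struct node * n ) {
        n->ts = rdtsc();                      // record a timestamp for the operation
        unsigned i = rand() % NQUEUES;        // select one queue at random
        lock( &queues[i].lock );
        enqueue( &queues[i], n );
        unlock( &queues[i].lock );
    }

    struct node * pop( void ) {
        unsigned i = rand() % NQUEUES;        // select two queues at random
        unsigned j = rand() % NQUEUES;
        // head_ts (hypothetical) returns the head timestamp, UINT64_MAX if empty
        if( head_ts( &queues[j] ) < head_ts( &queues[i] ) ) i = j;   // oldest wins
        lock( &queues[i].lock );
        struct node * n = dequeue( &queues[i] );   // may be NULL if emptied meanwhile
        unlock( &queues[i].lock );
        return n;
    }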
     
    224219\begin{figure}
    225220        \begin{center}
    226                 \input{base}
     221                \input{base.pstex_t}
    227222        \end{center}
    228223        \caption{Relaxed FIFO list at the base of the scheduler: an array of strictly FIFO lists.
    229 The timestamp is in all nodes and cell arrays.}
     224        The timestamp is in all nodes and cell arrays.}
    230225        \label{fig:base}
    231226\end{figure}
     
    233228\begin{figure}
    234229        \begin{center}
    235                 \input{empty}
     230                \input{empty.pstex_t}
    236231        \end{center}
    237232        \caption{``More empty'' state of the queue: the array contains many empty cells.}
     
    245240Overall performance is therefore influenced by the contention on the underlying queues and pop performance is influenced by the item density.
    246241
    247 This leads to four performance cases for the centralized ready-queue, as depicted in Table~\ref{tab:perfcases}.
     242This leads to four performance cases for the centralized ready queue, as depicted in Table~\ref{tab:perfcases}.
    248243The number of processors (many or few) refers to the number of kernel threads \emph{actively} attempting to pop user threads from the queues, not the total number of kernel threads.
    249244The number of threads (many or few) refers to the number of user threads ready to be run.
     
    321316Sparse global information helps high-contention cases but increases latency in zero-contention cases, to read and ``aggregate'' the information\footnote{Hierarchical structures, \eg binary search tree, effectively aggregate information but follow pointer chains, learning information at each node.
    322317Similarly, other sparse schemes need to read multiple cachelines to acquire all the information needed.}.
    323 Finally, dense local information has both the advantages of low latency in zero-contention cases and scalability in high-contention cases, however the information can become stale making it difficult to use to ensure correctness.
    324 The fact that these solutions have these fundamental limits suggest to me a better solution that attempts to combine these properties in an interesting ways.
      318Finally, dense local information has both the advantages of low latency in zero-contention cases and scalability in high-contention cases. However, the information can become stale, making it difficult to use to ensure correctness.
      319The fact that these solutions have these fundamental limits suggests to me that a better solution would attempt to combine these properties in an interesting way.
    325320Also, the lock discussed in Section~\ref{sec:resize} allows for solutions that adapt to the number of processors, which could also prove useful.
    326321
     
    328323
    329324How much scalability is actually needed is highly debatable.
    330 \emph{libfibre}\cite{libfibre} has compared favorably to other schedulers in webserver tests\cite{Karsten20} and uses a single atomic counter in its scheduling algorithm similarly to the proposed bitmask.
     325\emph{libfibre}\cite{libfibre} has compared favourably to other schedulers in webserver tests\cite{Karsten20} and uses a single atomic counter in its scheduling algorithm similarly to the proposed bitmask.
    331326As such, the single atomic instruction on a shared cacheline may be sufficiently performant.
    332327
    333328I have built a prototype of this ready queue in the shape of a data queue, \ie nodes on the queue are structures with a single int representing a thread and intrusive data fields.
    334 Using this prototype I ran preliminary performance experiments that confirm the expected performance in Table~\ref{tab:perfcases}.
     329Using this prototype, I ran preliminary performance experiments that confirm the expected performance in Table~\ref{tab:perfcases}.
    335330However, these experiments only offer a hint at the actual performance of the scheduler since threads perform more complex operations than simple integer nodes, \eg threads are not independent of each other: when a thread blocks, some other thread must intervene to wake it.
    336331
     
    341336\begin{figure}
    342337        \begin{center}
    343                 \input{system}
     338                \input{system.pstex_t}
    344339        \end{center}
    345340        \caption{Global structure of the \CFA runtime system.}
     
    354349This assumption is made both in the design of the proposed scheduler as well as in the original design of the \CFA runtime system.
    355350As such, the proposed scheduler must honour the correctness of this behaviour but does not have any performance objectives with regard to resizing a cluster.
    356 How long adding or removing processors take and how much this disrupts the performance of other threads is considered a secondary concern since it should be amortized over long period of times.
      351How long adding or removing processors takes and how much this disrupts the performance of other threads is considered a secondary concern since it should be amortized over long periods of time.
    357352However, as mentioned in Section~\ref{sec:queue}, contention on the underlying queues can have a direct impact on performance.
    358353The number of underlying queues must therefore be adjusted as the number of processors grows or shrinks.
     
    372367This description effectively matches the description of a reader-writer lock: infrequent but invasive updates among frequent read operations.
    373368In the case of the ready queue described above, read operations are operations that push or pop from the ready queue but do not invalidate any references to the ready queue data structures.
    374 Writes on the other hand would add or remove inner queues, invalidating references to the array of inner queues in a process.
      369Writes, on the other hand, would add or remove inner queues, invalidating references to the array of inner queues in the process.
    375370Therefore, the current proposed approach to this problem is to add a per-cluster reader-writer lock around the ready queue to prevent restructuring of the ready-queue data-structure while threads are being pushed or popped.
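
In sketch form, the reader/writer split looks as follows (the cluster fields and lock helpers are hypothetical; push and pop take the lock for reading, resizing takes it for writing):

    // Sketch only: cluster fields and lock helpers are hypothetical.
    void ready_push( struct cluster * cl, struct thread * t ) {
        read_lock( &cl->queue_lock );         // frequent, runs concurrently
        push( &cl->ready_queue, t );
        read_unlock( &cl->queue_lock );
    }

    void ready_resize( struct cluster * cl, unsigned nqueues ) {
        write_lock( &cl->queue_lock );        // rare, excludes all readers
        resize_array( &cl->ready_queue, nqueues );   // may invalidate references
        write_unlock( &cl->queue_lock );
    }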
    376371
     
    380375
    381376\paragraph{Objectives and Existing Work}
    382 The lock must offer scalability and performance on par with the actual ready-queue in order not to introduce a new bottleneck.
     377The lock must offer scalability and performance on par with the actual ready queue in order not to introduce a new bottleneck.
    383378I have already built a lock that fits the desired requirements and preliminary testing shows scalability and performance that exceed the target.
    384379As such, I do not consider this lock to be a risk for this project.
     
    396391give back unneeded CPU time associated with a process to other user processes executing on the computer,
    397392\item
    398 and reduce energy consumption in cases where more idle kernel-threads translate to idle CPUs, which can cycle down.
     393and reduce energy consumption in cases where more idle kernel-threads translate into idle CPUs, which can cycle down.
    399394\end{enumerate}
    400395Support for idle sleep broadly involves calling the operating system to block the kernel thread, handling the race between a blocking thread and the waking thread, and deciding which kernel thread should sleep or wake up.
     
    403398This operation is equivalent to the classic problem of missing signals when using condition variables: the ``sleepy'' processor indicates its intention to block but has not yet gone to sleep when another processor attempts to wake it up.
    404399The waking-up operation sees the blocked process and signals it, but the blocking process is racing to sleep so the signal is missed.
    405 In cases where kernel threads are managed as processors on the current cluster, loosing signals is not necessarily critical, because at least some processors on the cluster are awake and may check for more processors eventually.
     400In cases where kernel threads are managed as processors on the current cluster, losing signals is not necessarily critical, because at least some processors on the cluster are awake and may check for more processors eventually.
    406401Individual processors always finish scheduling user threads before looking for new work, which means that the last processor to go to sleep cannot miss threads scheduled from inside the cluster (if it does, that demonstrates the ready queue is not linearizable).
    407402However, this guarantee does not hold if threads are scheduled from outside the cluster, either due to an external event like timers and I/O, or due to a user (or kernel) thread migrating from a different cluster.
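
The race and the usual remedy, in sketch form (all helper names are hypothetical; a real implementation also needs the appropriate memory fences):

    // Sketch only: all helpers are hypothetical.
    void idle_sleep( struct processor * p ) {
        announce_sleepy( p );                 // publish the intent to block
        if( !ready_queue_empty( p->cluster ) ) {
            retract_sleepy( p );              // work arrived in the race window
            return;                           // found work, do not block
        }
        os_block( p );                        // e.g., a futex wait
    }

    void wake_one( struct cluster * cl ) {
        struct processor * p = find_sleepy( cl );   // sees the announcement
        if( p ) os_unblock( p );
    }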
     
    413408
    414409Another important issue is avoiding kernel threads sleeping and waking frequently because there is a significant operating-system cost.
    415 This scenario happens when a program oscillates between high and low activity, needing most and then less processors.
      410This scenario happens when a program oscillates between high and low activity, needing more and then fewer processors.
    416411A possible partial solution is to order the processors so that the one which most recently went to sleep is woken up.
    417412This allows other sleeping processors to reach deeper sleep states (when available) while keeping ``hot'' processors warmer.
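
A LIFO list of idle processors realizes this ordering (sketch; a real implementation needs atomics or a lock around the list):

    // Sketch only: field and function names are hypothetical.
    struct processor * idle_list;             // intrusive stack of sleepers

    void record_sleeper( struct processor * p ) {
        p->idle_next = idle_list;
        idle_list = p;
    }

    struct processor * pick_to_wake( void ) {
        struct processor * p = idle_list;     // most recent sleeper first
        if( p ) idle_list = p->idle_next;
        return p;
    }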
     
    420415
    421416A final important aspect of idle sleep is when processors should decide to sleep and when it is appropriate for sleeping processors to be woken up.
    422 Processors that are unnecessarily unblocked lead to unnecessary contention, CPU usage, and power consumption, while too many sleeping processors can lead to sub-optimal throughput.
    423 Furthermore, transitions from sleeping to awake and vice-versa also add unnecessary latency.
     417Processors that are unnecessarily unblocked lead to unnecessary contention, CPU usage, and power consumption, while too many sleeping processors can lead to suboptimal throughput.
     418Furthermore, transitions from sleeping to awake and vice versa also add unnecessary latency.
    424419There is already a wealth of research on the subject\cite{schillings1996engineering, wiki:thunderherd} and I may use an existing approach for the idle-sleep heuristic in this project, \eg\cite{Karsten20}.
    425420
     
    430425It is preferable to block the user thread performing the I/O and reuse the underlying kernel-thread to run other ready user threads.
    431426This approach requires intercepting user-thread calls to I/O operations, redirecting them to an asynchronous I/O interface, and handling the multiplexing/demultiplexing between the synchronous and asynchronous API.
    432 As such, there are three components needed to implemented support for asynchronous I/O:
     427As such, there are three components needed to implement support for asynchronous I/O:
    433428\begin{enumerate}
    434429\item
     
    446441It is sufficient to make one work in the complex context of the \CFA runtime.
    447442\uC uses $select$\cite{select} as its interface, which handles ttys, pipes, and sockets, but not disk I/O.
    448 $select$ entails significant complexity and is being replaced in UNIX operating-systems, which make it a less interesting alternative.
      443$select$ entails significant complexity and is being replaced in UNIX operating systems, which makes it a less interesting alternative.
    449444Another popular interface is $epoll$\cite{epoll}, which is supposed to be cheaper than $select$.
    450 However, $epoll$ also does not handle the file system and anectodal evidence suggest it has problem with linux pipes and $TTY$s.
      445However, $epoll$ also does not handle the file system and anecdotal evidence suggests it has problems with Linux pipes and $TTY$s.
    451446A popular cross-platform alternative is $libuv$\cite{libuv}, which offers asynchronous sockets and asynchronous file system operations (among other features).
    452447However, as a full-featured library it includes much more than I need and could conflict with other features of \CFA unless significant effort is made to merge them together.
    453448A very recent alternative that I am investigating is $io_uring$\cite{io_uring}.
    454 It claims to address some of the issues with $epoll$ and my early investigating suggest that the claim is accurate.
    455 $io_uring$ uses a much more general approach where system calls are register to a queue and later executed by the kernel, rather than relying on system calls to return an error instead of blocking and subsequently waiting for changes on file descriptors.
      449It claims to address some of the issues with $epoll$ and my early investigation suggests that the claim is accurate.
     450$io_uring$ uses a much more general approach where system calls are registered to a queue and later executed by the kernel, rather than relying on system calls to return an error instead of blocking and subsequently waiting for changes on file descriptors.
    456451I believe this approach allows for fewer problems, \eg the manpage for $open$\cite{open} states:
    457452\begin{quote}
    458         Note that [the $O_NONBLOCK$ flag] has no effect for regular files and block devices;
    459         that is, I/O operations will (briefly) block when device activity is required, regardless of whether $O_NONBLOCK$ is set.
    460         Since $O_NONBLOCK$ semantics might eventually be implemented, applications should not depend upon blocking behavior when specifying this flag for regular files and block devices.
     453Note that [the $O_NONBLOCK$ flag] has no effect for regular files and block devices;
     454that is, I/O operations will (briefly) block when device activity is required, regardless of whether $O_NONBLOCK$ is set.
     455Since $O_NONBLOCK$ semantics might eventually be implemented, applications should not depend upon blocking behaviour when specifying this flag for regular files and block devices.
    461456\end{quote}
    462457This makes approaches based on $epoll$/$select$ less reliable since they may not work for every file descriptor.
    463 For this reason, I plan to use $io_uring$ as the OS abstraction for the \CFA runtime, unless further work shows problems I haven't encountered yet.
     458For this reason, I plan to use $io_uring$ as the OS abstraction for the \CFA runtime unless further work shows problems I haven't encountered yet.
    464459However, only a small subset of the features are available in Ubuntu as of April 2020\cite{wiki:ubuntu-linux}, which will limit performance comparisons.
    465460I do not believe this will affect the comparison result.
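
For illustration, the queue-based model described above looks roughly as follows with liburing (a minimal sketch; error handling is omitted and fd and iov are assumed already set up):

    #include <liburing.h>

    // Register a read on the submission queue and reap its completion,
    // instead of waiting for readiness on a file descriptor.
    struct io_uring ring;
    io_uring_queue_init( 64, &ring, 0 );      // create submission/completion rings

    struct io_uring_sqe * sqe = io_uring_get_sqe( &ring );
    io_uring_prep_readv( sqe, fd, &iov, 1, 0 );   // register the system call
    io_uring_submit( &ring );                 // hand it to the kernel

    struct io_uring_cqe * cqe;
    io_uring_wait_cqe( &ring, &cqe );         // completion arrives later, even for disk files
    int result = cqe->res;                    // read(2)-style return value
    io_uring_cqe_seen( &ring, cqe );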
     
    490485\section{Discussion}
    491486I believe that runtime systems and scheduling are still open topics.
    492 Many ``state of the art'' production frameworks still use single threaded event-loops because of performance considerations, \eg \cite{nginx-design}, and, to my knowledge, no wideyl available system language offers modern threading facilities.
     487Many ``state of the art'' production frameworks still use single-threaded event loops because of performance considerations, \eg \cite{nginx-design}, and, to my knowledge, no widely available system language offers modern threading facilities.
    493488I believe the proposed work offers a novel runtime and scheduling package, where existing work only offers fragments that users must assemble themselves when possible.
    494489
     
    501496\hline November 2020 & March 2021   & Completion of the implementation. \\
    502497\hline March 2021 & April 2021  & Final Performance experiments. \\
    503 \hline May 2021 & August 2021 & Thesis writing and defense. \\
     498\hline May 2021 & August 2021 & Thesis writing and defence. \\
    504499\hline
    505500\end{tabular}
  • doc/theses/thierry_delisle_PhD/comp_II/img/base.fig

    r68f0c4e r70cab43  
    1 #FIG 3.2  Produced by xfig version 3.2.5c
     1#FIG 3.2  Produced by xfig version 3.2.7b
    22Landscape
    33Center
    44Inches
    5 Letter 
     5Letter
    66100.00
    77Single
     
    13131 3 0 1 0 0 50 -1 20 0.000 1 0.0000 6975 4200 20 20 6975 4200 6995 4200
    1414-6
    15 6 2400 2100 3000 2700
    16 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2700 2400 300 300 2700 2400 3000 2400
    17 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
    18          2400 2475 3000 2475
    19 4 1 0 50 -1 0 11 0.0000 2 120 210 2700 2650 TS\001
    20 -6
    21 6 2400 3000 3000 3600
    22 1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2700 3300 300 300 2700 3300 3000 3300
    23 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
    24          2400 3375 3000 3375
    25 4 1 0 50 -1 0 11 0.0000 2 120 210 2700 3550 TS\001
    26 -6
    27151 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3900 2400 300 300 3900 2400 4200 2400
    28161 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3900 3300 300 300 3900 3300 4200 3300
     
    33211 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 6300 3300 300 300 6300 3300 6600 3300
    34221 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 4509 3302 300 300 4509 3302 4809 3302
     231 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2700 3300 300 300 2700 3300 3000 3300
     241 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2700 2400 300 300 2700 2400 3000 2400
    35252 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
    3626         3000 3900 3000 4500
     
    7363        1 1 1.00 45.00 90.00
    7464         6300 4200 6300 3600
    75 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
     652 1 0 1 -1 7 50 -1 -1 0.000 0 0 -1 1 0 2
    7666        1 1 1.00 45.00 90.00
    7767         2700 4200 2700 3600
     
    8171        1 1 1.00 45.00 90.00
    8272         4500 4200 4500 3600
    83 4 2 0 50 -1 0 12 0.0000 2 180 660 2100 4200 Array of\001
    84 4 2 0 50 -1 0 12 0.0000 2 165 600 2100 4425 Queues\001
    85 4 2 0 50 -1 0 12 0.0000 2 135 645 2100 3075 Threads\001
    86 4 2 0 50 -1 0 12 0.0000 2 180 525 2100 2850 Ready\001
    87 4 1 0 50 -1 0 11 0.0000 2 120 210 2700 4450 TS\001
     732 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
     74         2400 3375 3000 3375
     752 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
     76         2400 2475 3000 2475
     774 2 -1 50 -1 0 12 0.0000 2 135 630 2100 3075 Threads\001
     784 2 -1 50 -1 0 12 0.0000 2 165 450 2100 2850 Ready\001
     794 1 -1 50 -1 0 11 0.0000 2 135 180 2700 4450 TS\001
     804 2 -1 50 -1 0 12 0.0000 2 165 720 2100 4200 Array of\001
     814 2 -1 50 -1 0 12 0.0000 2 150 540 2100 4425 Queues\001
     824 1 -1 50 -1 0 11 0.0000 2 135 180 2700 3550 TS\001
     834 1 -1 50 -1 0 11 0.0000 2 135 180 2700 2650 TS\001
  • doc/theses/thierry_delisle_PhD/comp_II/img/empty.fig

    r68f0c4e r70cab43  
    1 #FIG 3.2  Produced by xfig version 3.2.5c
     1#FIG 3.2  Produced by xfig version 3.2.7b
    22Landscape
    33Center
    44Inches
    5 Letter 
     5Letter
    66100.00
    77Single
     
    4141        1 1 1.00 45.00 90.00
    4242         2700 4200 2700 3600
    43 4 2 0 50 -1 0 12 0.0000 2 180 660 2100 4200 Array of\001
    44 4 2 0 50 -1 0 12 0.0000 2 165 600 2100 4425 Queues\001
    45 4 2 0 50 -1 0 12 0.0000 2 135 645 2100 3075 Threads\001
    46 4 2 0 50 -1 0 12 0.0000 2 180 525 2100 2850 Ready\001
     434 2 -1 50 -1 0 12 0.0000 2 165 720 2100 4200 Array of\001
     444 2 -1 50 -1 0 12 0.0000 2 150 540 2100 4425 Queues\001
     454 2 -1 50 -1 0 12 0.0000 2 135 630 2100 3075 Threads\001
     464 2 -1 50 -1 0 12 0.0000 2 165 450 2100 2850 Ready\001
  • doc/theses/thierry_delisle_PhD/comp_II/img/system.fig

    r68f0c4e r70cab43  
    33Center
    44Inches
    5 Letter 
     5Letter
    66100.00
    77Single
     
    1651654 1 -1 0 0 0 10 0.0000 2 105 990 2175 3525 Discrete-event\001
    1661664 1 -1 0 0 0 10 0.0000 2 135 795 2175 4350 preemption\001
    167 4 0 -1 0 0 0 10 0.0000 2 150 1290 2325 4875 genrator/coroutine\001
     1674 0 -1 0 0 0 10 0.0000 2 150 1290 2325 4875 generator/coroutine\001
    1681684 0 -1 0 0 0 10 0.0000 2 120 270 4050 4875 task\001
    1691694 0 -1 0 0 0 10 0.0000 2 105 450 7050 4875 cluster\001
  • libcfa/src/bits/locks.hfa

    r68f0c4e r70cab43  
    218218        }
    219219
    220         // Semaphore which only supports a single thread and one post
    221         // Semaphore which only supports a single thread
      220        // Synchronization primitive which only supports a single thread and one post.
      221        // Similar to a binary semaphore with one-shot semantics:
      222        // it is expected to be discarded after each party calls its side.
    222223        struct oneshot {
      224                // Internal state:
      225                //     0p     : initial state (wait will block)
     226                //     1p     : fulfilled (wait won't block)
     227                // any thread : a thread is currently waiting
    223228                struct $thread * volatile ptr;
    224229        };
     
    231236                void ^?{}(oneshot & this) {}
    232237
      238                // Wait for the post, return immediately if it already happened.
     239                // return true if the thread was parked
    233240                bool wait(oneshot & this) {
    234241                        for() {
     
    244251                }
    245252
     253                // Mark as fulfilled, wake thread if needed
     254                // return true if a thread was unparked
    246255                bool post(oneshot & this) {
    247256                        struct $thread * got = __atomic_exchange_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
     
    251260                }
    252261        }
     262
     263        // base types for future to build upon
     264        // It is based on the 'oneshot' type to allow multiple futures
     265        // to block on the same instance, permitting users to block a single
     266        // thread on "any of" [a given set of] futures.
     267        // does not support multiple threads waiting on the same future
     268        struct future_t {
     269                // Internal state :
     270                //     0p      : is initial state (wait will block)
     271                //     1p      : fulfilled (wait won't block)
      272                //     2p      : in progress (being fulfilled, wait context not yet consumed)
     273                //     3p      : abandoned, server should delete
     274                // any oneshot : a context has been setup to wait, a thread could wait on it
     275                struct oneshot * volatile ptr;
     276        };
     277
     278        static inline {
     279                void  ?{}(future_t & this) {
     280                        this.ptr = 0p;
     281                }
     282
     283                void ^?{}(future_t & this) {}
     284
     285                // check if the future is available
     286                bool available( future_t & this ) {
     287                        return this.ptr == 1p;
     288                }
     289
     290                // Prepare the future to be waited on
      291                // intended to be used by wait, wait_any, waitfor, etc. rather than used directly
     292                bool setup( future_t & this, oneshot & wait_ctx ) {
     293                        /* paranoid */ verify( wait_ctx.ptr == 0p );
     294                        // The future needs to set the wait context
     295                        for() {
     296                                struct oneshot * expected = this.ptr;
     297                                // Is the future already fulfilled?
     298                                if(expected == 1p) return false; // Yes, just return false (didn't block)
     299
     300                                // The future is not fulfilled, try to setup the wait context
     301                                /* paranoid */ verify( expected == 0p );
     302                                if(__atomic_compare_exchange_n(&this.ptr, &expected, &wait_ctx, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
     303                                        return true;
     304                                }
     305                        }
     306                }
     307
     308                // Stop waiting on a future
      309                // When multiple futures are waited for together in an "any of" pattern,
     310                // futures that weren't fulfilled before the thread woke up
     311                // should retract the wait ctx
      312                // intended to be used by wait, wait_any, waitfor, etc. rather than used directly
     313                void retract( future_t & this, oneshot & wait_ctx ) {
     314                        // Remove the wait context
     315                        struct oneshot * got = __atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);
     316
     317                        // got == 0p: future was never actually setup, just return
     318                        if( got == 0p ) return;
     319
     320                        // got == wait_ctx: since fulfil does an atomic_swap,
      321                        // if we got back the original then no one else saw the context.
     322                        // It is safe to delete (which could happen after the return)
     323                        if( got == &wait_ctx ) return;
     324
     325                        // got == 1p: the future is ready and the context was fully consumed
     326                        // the server won't use the pointer again
     327                        // It is safe to delete (which could happen after the return)
     328                        if( got == 1p ) return;
     329
     330                        // got == 2p: the future is ready but the context hasn't fully been consumed
     331                        // spin until it is safe to move on
     332                        if( got == 2p ) {
     333                                while( this.ptr != 1p ) Pause();
     334                                return;
     335                        }
     336
      337                        // got == anything else: something went wrong, abort
     338                        abort("Future in unexpected state");
     339                }
     340
     341                // Mark the future as abandoned, meaning it will be deleted by the server
     342                void abandon( future_t & this ) {
     343                        struct oneshot * got = __atomic_exchange_n( &this.ptr, 3p, __ATOMIC_SEQ_CST);
     344
     345                        // got == 2p: the future is ready but the context hasn't fully been consumed
     346                        // spin until it is safe to move on
     347                        if( got == 2p ) {
     348                                while( this.ptr != 1p ) Pause();
     349                        }
     350                        return;
     351                }
     352
     353                // from the server side, mark the future as fulfilled
     354                // delete it if needed
     355                bool fulfil( future_t & this ) {
     356                        for() {
     357                                struct oneshot * expected = this.ptr;
     358                                // was this abandoned?
     359                                if( expected == 3p ) { free( &this ); return false; }
     360
     361                                /* paranoid */ verify( expected != 1p ); // Future is already fulfilled, should not happen
      362                                /* paranoid */ verify( expected != 2p ); // Future is being fulfilled by someone else; this is even less supported than the previous case.
     363
     364                                // If there is a wait context, we need to consume it and mark it as consumed after
     365                                // If there is no context then we can skip the in progress phase
     366                                struct oneshot * want = expected == 0p ? 1p : 2p;
     367                                if(__atomic_compare_exchange_n(&this.ptr, &expected, want, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
     368                                        if( expected == 0p ) { /* paranoid */ verify( this.ptr == 1p); return false; }
     369                                        bool ret = post( *expected );
     370                                        __atomic_store_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
     371                                        return ret;
     372                                }
     373                        }
     374
     375                }
     376
     377                // Wait for the future to be fulfilled
     378                bool wait( future_t & this ) {
     379                        oneshot temp;
     380                        if( !setup(this, temp) ) return false;
     381
     382                        // Wait context is setup, just wait on it
     383                        bool ret = wait( temp );
     384
      385                        // Wait for any in-progress fulfilment to complete
     386                        while( this.ptr == 2p ) Pause();
     387                        // Make sure the state makes sense
     388                        // Should be fulfilled, could be in progress but it's out of date if so
     389                        // since if that is the case, the oneshot was fulfilled (unparking this thread)
     390                        // and the oneshot should not be needed any more
     391                        __attribute__((unused)) struct oneshot * was = this.ptr;
     392                        /* paranoid */ verifyf( was == 1p, "Expected this.ptr to be 1p, was %p\n", was );
     393
     394                        // Mark the future as fulfilled, to be consistent
      395                        // with potential calls to available
     396                        // this.ptr = 1p;
     397                        return ret;
     398                }
     399        }
    253400#endif
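
For reference, the intended two-party handshake over a future_t is (sketch; the client/server framing is an assumption):

    future_t f;
    // Client (user thread): sets up a oneshot and parks until fulfilled;
    // returns false immediately if the future is already fulfilled.
    wait( f );
    // Server (e.g., an I/O poller), on another thread, once the work is done:
    // consumes any wait context and unparks the client.
    fulfil( f );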