Changeset bcd74f3

doc/bibliography/pl.bib

-              re3bc51c
+              rbcd74f3
+}
+@misc{libfibre,
+    key         = {libfibre},
+    author      = {Martin Karsten},
+    title       = {{libfibre:~User-Level Threading Runtime}},
+    howpublished= {\href{https://git.uwaterloo.ca/mkarsten/libfibre}
+                  {https://\-git.uwaterloo.ca/\-mkarsten/\-libfibre}},
+    note        = {[Online; accessed 2020-04-15]},
+}
 @article{Linda,
     keywords    = {Linda, concurrency},
 …
     address     = {Belmont},
     year        = 1967,
+}
+@inproceedings{Fang06,
+    author      = {Fang, Yi and McMillan, Kenneth L. and Pnueli, Amir and Zuck, Lenore D.},
+    editor      = {Najm, Elie and Pradat-Peyre, Jean-Fran{\c{c}}ois and Donzeau-Gouge, V{\'e}ronique Vigui{\'e}},
+    title       = {Liveness by Invisible Invariants},
+    booktitle   = {Formal Techniques for Networked and Distributed Systems - FORTE 2006},
+    year        = 2006,
+    publisher   = {Springer Berlin Heidelberg},
+    address     = {Berlin, Heidelberg},
+    pages       = {356--371},
+}
 …
     contributer = {pabuhr@plg},
     author      = {Gregory R. Andrews},
     title       = {A Method for Solving Synronization Problems},
+    title       = {A Method for Solving Synchronization Problems},
     journal     = scp,
     volume      = 13,
 …
     title       = {Multiple Inheritance for {C}{\kern-.1em\hbox{\large\texttt{+\kern-.25em+}}}},
     booktitle   = {Proceedings of the Spring '87 EUUG Conference},
+    month       = may, year = 1987
+    month       = may,
+    year        = 1987,
+}
 …
+}
+@article{Aravind09,
+    author      = {Alex A. Aravind and Wim H. Hesselink},
+    title       = {A Queue Based Mutual Exclusion Algorithm},
+    journal     = acta,
+    volume      = 46,
+    pages       = {73--86},
+    year        = 2009,
+}
 % R
 …
+}
+@article{Karsten20,
+    author      = {Karsten, Martin and Barghi, Saman},
+    title       = {{User-level Threading: Have Your Cake and Eat It Too}},
+    year        = {2020},
+    issue_date  = {March 2020},
+    publisher   = {Association for Computing Machinery},
+    address     = {New York, NY, USA},
+    volume      = {4},
+    number      = {1},
+    url         = {https://doi.org/10.1145/3379483},
+    doi         = {10.1145/3379483},
+    journal     = {Proc. ACM Meas. Anal. Comput. Syst.},
+    month       = mar,
+    numpages    = {30},
+}
 @techreport{Harmony,
     keywords    = {messages, concurrency},
 …
     contributer = {gjditchfield@plg},
     author      = {Henry Lieverman},
+    title       = {Using Prototypical Objects to Implement Shared Behavior in
+                  Object Oriented Systems},
+    title       = {Using Prototypical Objects to Implement Shared Behavior in Object Oriented Systems},
     journal     = sigplan,
+    month       = nov, year = 1986,
+    volume      = 21, number = 11, pages = {214-223}
+    month       = nov,
+    year        = 1986,
+    volume      = 21,
+    number      = 11,
+    pages       = {214-223}
+}

doc/theses/thierry_delisle_PhD/comp_II/Makefile

re3bc51c	rbcd74f3
2	2
3	3	Build = build
4		Figures = ~~figures~~
	4	Figures = img
5	5	Macros = ../../../LaTeXmacros
6	6	TeXLIB = .:${Macros}:${Build}:../../../bibliography:

doc/theses/thierry_delisle_PhD/comp_II/comp_II.tex

-              re3bc51c
+              rbcd74f3
 \usepackage[T1]{fontenc}
 \usepackage[utf8]{inputenc}
-\usepackage{listings}           % for code listings
 \usepackage{xspace}
 \usepackage{xcolor}
 \usepackage{graphicx}
 \usepackage{epic,eepic}
+\usepackage{listings}                   % for code listings
 \usepackage{glossaries}
 \usepackage{textcomp}
+% cfa macros used in the document
+\input{common}
+\setlist{topsep=6pt,parsep=0pt}         % global reduce spacing between points
+\newcommand{\uC}{$\mu$\CC}
 \usepackage[hidelinks]{hyperref}
+\setlength{\abovecaptionskip}{5pt plus 3pt minus 2pt}
+\lstMakeShortInline$%                   % single-character for \lstinline
 %\usepackage[margin=1in]{geometry}
 %\usepackage{float}
-% cfa macros used in the document
-\input{common}
 \input{glossary}
 …
 \author{
         \huge Thierry Delisle \\
         \Large \vspace*{0.1in} \texttt{tdelisle@uwaterloo.ca} \\
+        \huge Thierry Delisle \vspace*{5pt} \\
+        \Large \texttt{tdelisle@uwaterloo.ca} \vspace*{5pt} \\
         \Large Cheriton School of Computer Science \\
         \Large University of Waterloo
 …
 \newcommand{\cit}{\textsuperscript{[Citation Needed]}\xspace}
 \newcommand{\TODO}{~\newline{\large\bf\color{red} TODO :}\xspace}
+\newcommand{\TODO}{{\large\bf\color{red} TODO: }\xspace}
 % ===============================================================================
 …
 \section{Introduction}
 \subsection{\CFA and the \CFA concurrency package}
+\CFA\cit is a modern, polymorphic, non-object-oriented, backwards-compatible extension of the C programming language. It aims to add high-productivity features while maintaning the predictible performance of C. As such, concurrency in \CFA\cit aims to offer simple and safe high-level tools while still allowing performant code. \CFA concurrrent code is written in the synchronous programming paradigm but uses \glspl{uthrd} in order to achieve the simplicity and maintainability of synchronous programming without sacrificing the efficiency of asynchronous programing. As such, the \CFA \emph{scheduler} is a preemptive user-level scheduler that maps \glspl{uthrd} onto \glspl{kthrd}.
+Scheduling occurs when execution switches from one thread to another, where the second thread is implicitly chosen by the scheduler. This scheduling is an indirect handoff, as opposed to generators and coroutines which explicitly switch to the next generator and coroutine respectively. The cost of switching between two threads for an indirect handoff has two components : the cost of actually context-switching, i.e., changing the relevant registers to move execution from one thread to the other, and the cost of scheduling, i.e., deciding which thread to run next among all the threads ready to run. The first cost is generally constant and fixed\footnote{Affecting the context-switch cost is whether it is done in one step, after the scheduling, or in two steps, context-switching to a fixed third-thread before scheduling.}, while the scheduling cost can vary based on the system state. Adding multiple \glspl{kthrd} does not fundamentally change the scheduler semantics or requirements, it simply adds new correctness requirements, i.e. \textit{linearizability}, and a new dimension to performance: scalability, where scheduling cost now also depends on contention.
+The more threads switch, the more the administration cost of scheduling becomes noticeable. It is therefore important to build a scheduler with the lowest possible cost and latency. Another important consideration is \emph{fairness}. In principle, scheduling should give the illusion of perfect fairness, where all threads ready to run are running \emph{simultaneously}. While the illusion of simultaneity is easier to reason about, it can break down if the scheduler allows to much unfairness. Therefore, the scheduler should offer as much fairness as needed to guarantee eventual progress, but use unfairness to help performance. In practice, threads must wait in turn but there can be advantages to unfair scheduling, similar to the the express cash register at a grocery store.
+The goal of this research is to produce a scheduler that is simple for programmers to understand and offers good performance. Here understandability does not refer to the API but to how much scheduling concerns programmers need to take into account when writing a \CFA concurrent package. Therefore, the main goal of this proposal is :
+\CFA\cite{Moss18} is a modern, polymorphic, non-object-oriented, concurrent, backwards-compatible extension of the C programming language.
+It aims to add high-productivity features while maintaining the predictable performance of C.
+As such, concurrency in \CFA\cite{Delisle19} aims to offer simple and safe high-level tools while still allowing performant code.
+\CFA concurrent code is written in the synchronous programming paradigm but uses \glspl{uthrd} in order to achieve the simplicity and maintainability of synchronous programming without sacrificing the efficiency of asynchronous programing.
+As such, the \CFA \newterm{scheduler} is a preemptive user-level scheduler that maps \glspl{uthrd} onto \glspl{kthrd}.
+\newterm{Scheduling} occurs when execution switches from one thread to another, where the second thread is implicitly chosen by the scheduler.
+This scheduling is an indirect handoff, as opposed to generators and coroutines which explicitly switch to the next generator and coroutine respectively.
+The cost of switching between two threads for an indirect handoff has two components:
+\begin{enumerate}
+\item
+the cost of actually context-switching, \ie changing the relevant registers to move execution from one thread to the other,
+\item
+and the cost of scheduling, \ie deciding which thread to run next among all the threads ready to run.
+\end{enumerate}
+The first cost is generally constant and fixed\footnote{Affecting the constant context-switch cost is whether it is done in one step, after the scheduling, or in two steps, context-switching to a fixed third-thread before scheduling.}, while the scheduling cost can vary based on the system state.
+Adding multiple \glspl{kthrd} does not fundamentally change the scheduler semantics or requirements, it simply adds new correctness requirements, \ie \newterm{linearizability}\footnote{Meaning however fast the CPU threads run, there is an equivalent sequential order that gives the same result.}, and a new dimension to performance: scalability, where scheduling cost now also depends on contention.
+The more threads switch, the more the administration cost of scheduling becomes noticeable.
+It is therefore important to build a scheduler with the lowest possible cost and latency.
+Another important consideration is \newterm{fairness}.
+In principle, scheduling should give the illusion of perfect fairness, where all threads ready to run are running \emph{simultaneously}.
+While the illusion of simultaneity is easier to reason about, it can break down if the scheduler allows too much unfairness.
+Therefore, the scheduler should offer as much fairness as needed to guarantee eventual progress, but use unfairness to help performance.
+In practice, threads must wait in turn but there can be advantages to unfair scheduling, similar to the the express cash-register at a grocery store.
+The goal of this research is to produce a scheduler that is simple for programmers to understand and offers good performance.
+Here understandability does not refer to the API but to how much scheduling concerns programmers need to take into account when writing a \CFA concurrent package.
+Therefore, the main goal of this proposal is :
 \begin{quote}
 The \CFA scheduler should be \emph{viable} for \emph{any} workload.
 \end{quote}
+For a general purpose scheduler, it is impossible to produce an optimal algorithm as it would require knowledge of the future behaviour of threads. As such, scheduling performance is generally either defined by the best case scenario, i.e., a workload to which the scheduler is tailored, or the worst case scenario, i.e., the scheduler behaves no worst than \emph{X}. For this proposal, the performance is evaluated using the second approach to allow \CFA programmers to rely on scheduling performance. Be cause there is no optimal scheduler, ultimately \CFA may allow programmers to write their own scheduler; but that is not the subject of this proposal, which considers only the default scheduler. As such, it is important that only programmers with exceptionally high performance requirements should need to write their own scheduler and replace the scheduler in this proposal.
+Finally, the scheduling objective includes producing a scheduling strategy with sufficient fairness guarantees, creating an abstraction layer over the operating system to handle kernel-threads spinning unnecessarily, scheduling blocking I/O operations, and writing sufficient library tools to allow developers to indirectly use the scheduler.
+For a general purpose scheduler, it is impossible to produce an optimal algorithm as it would require knowledge of the future behaviour of threads.
+As such, scheduling performance is generally either defined by the best case scenario, \ie a workload to which the scheduler is tailored, or the worst case scenario, \ie the scheduler behaves no worst than \emph{X}.
+For this proposal, the performance is evaluated using the second approach to allow \CFA programmers to rely on scheduling performance.
+Because there is no optimal scheduler, ultimately \CFA may allow programmers to write their own scheduler; but that is not the subject of this proposal, which considers only the default scheduler.
+As such, it is important that only programmers with exceptionally high performance requirements should need to write their own scheduler and replace the scheduler in this proposal.
+To achieve the \CFA scheduling goal includes:
+\begin{enumerate}
+\item
+producing a scheduling strategy with sufficient fairness guarantees,
+\item
+creating an abstraction layer over the operating system to handle kernel-threads spinning unnecessarily,
+\item
+scheduling blocking I/O operations,
+\item
+and writing sufficient library tools to allow developers to indirectly use the scheduler, either through tuning knobs or replacing the default scheduler.
+\end{enumerate}
 % ===============================================================================
 …
 \section{\CFA Scheduling}
+To scheduler user-level threads across all workloads, the scheduler has a number of requirements:
+\paragraph{Correctness} As with any other concurrent data structure or algorithm, the correctness requirement is paramount. The scheduler cannot allow threads to be dropped from the ready-queue, i.e., scheduled but never run, or be executed multiple times when only being scheduled once. Since \CFA concurrency has no spurious wakeup, this definition of correctness also means the scheduler should have no spurious wakeup. The \CFA scheduler must be correct.
+\paragraph{Performance} The performance of a scheduler can generally be mesured in terms of scheduling cost, scalability and latency. Scheduling cost is the cost to switch from one thread to another, as mentioned above. For simple applications where a single kernel thread does most of the scheduling, it is generally the dominating cost. When adding many kernel threads, scalability becomes an issue, effectively increasing the cost of context-switching when contention is high. Finally, a third axis of performance is tail latency. This measurement is related to fairness and mesures how long is needed for a thread to be run once scheduled and is evaluated in the worst cases. The \CFA scheduler should offer good performance in all three metrics.
+\paragraph{Fairness} Like performance, this requirements has several aspect : eventual progress, predictability and performance reliablility. As a hard requirement, the \CFA scheduler must guarantee eventual progress, i.e., prevent starvation, otherwise the above mentioned illusion of simultaneous execution is broken and the scheduler becomes much more complex to reason about. Beyond this requirement, performance should be predictible and reliable, which means similar workloads achieve similar performance and programmer intuition is respected. An example of this is : a thread that yields agressively should not run more often then other tasks. While this is intuitive, it does not hold true for many work-stealing or feedback based schedulers. The \CFA scheduler must guarantee eventual progress and should be predictible and offer reliable performance.
+\paragraph{Efficiency} Finally, efficient usage of CPU resources is also an important requirement. This issue is discussed more in depth towards the end of this proposal. It effectively refers to avoiding using CPU power when there are no threads to run, and conversely, use all CPUs available when the workload can benefit from it. Balancing these two states is where the complexity lies. The \CFA scheduler should be efficient with respect to the underlying (shared) computer.
+To schedule user-level threads across all workloads, the scheduler has a number of requirements:
+\paragraph{Correctness} As with any other concurrent data structure or algorithm, the correctness requirement is paramount.
+The scheduler cannot allow threads to be dropped from the ready queue, \ie scheduled but never run, or be executed multiple times when only being scheduled once.
+Since \CFA concurrency has no spurious wakeup, this definition of correctness also means the scheduler should have no spurious wakeup.
+The \CFA scheduler must be correct.
+\paragraph{Performance} The performance of a scheduler can generally be measured in terms of scheduling cost, scalability and latency.
+\newterm{Scheduling cost} is the cost to switch from one thread to another, as mentioned above.
+For simple applications, where a single kernel thread does most of the scheduling, it is generally the dominating cost.
+\newterm{Scalability} is the cost of adding multiple kernel threads because it increases the time for context switching because of contention by multiple threads accessing shared resources, \eg the ready queue.
+Finally, \newterm{tail latency} is service delay and relates to thread fairness.
+Specifically, latency measures how long a thread waits to run once scheduled and is evaluated in the worst case.
+The \CFA scheduler should offer good performance for all three metrics.
+\paragraph{Fairness} Like performance, this requirement has several aspect : eventual progress, predictability and performance reliability.
+\newterm{Eventual progress} guarantees every scheduled thread is eventually run, \ie prevent starvation.
+As a hard requirement, the \CFA scheduler must guarantee eventual progress, otherwise the above mentioned illusion of simultaneous execution is broken and the scheduler becomes much more complex to reason about.
+\newterm{Predictability} and \newterm{reliability} means similar workloads achieve similar performance and programmer execution intuition is respected.
+For example, a thread that yields aggressively should not run more often then other tasks.
+While this is intuitive, it does not hold true for many work-stealing or feedback based schedulers.
+The \CFA scheduler must guarantee eventual progress and should be predictable and offer reliable performance.
+\paragraph{Efficiency} Finally, efficient usage of CPU resources is also an important requirement and is discussed in depth towards the end of the proposal.
+\newterm{Efficiency} means avoiding using CPU cycles when there are no threads to run, and conversely, use all CPUs available when the workload can benefit from it.
+Balancing these two states is where the complexity lies.
+The \CFA scheduler should be efficient with respect to the underlying (shared) computer.
 \bigskip To achieve these requirements, I can reject two broad types of scheduling strategies : feedback-based and priority schedulers.
 \subsection{Feedback-Based Schedulers}
+Many operating systems use schedulers based on feedback in some form, e.g., measuring how much CPU a particular thread has used\footnote{Different metrics can measured here but it is not relevant to the discussion.} and schedule threads based on this metric. These strategies are sensible for operating systems but rely on two assumptions on the workload :
+Many operating systems use schedulers based on feedback in some form, \eg measuring how much CPU a particular thread has used\footnote{Different metrics can be measured but it is not relevant to the discussion.} and schedule threads based on this metric.
+These strategies are sensible for operating systems but rely on two assumptions for the workload:
 \begin{enumerate}
 …
 \end{enumerate}
+While these two assumptions generally hold for operating systems, they may not for user-level threading. Since \CFA has the explicit goal of allowing many smaller threads, this can naturally lead to threads with much shorter lifetime, which are only scheduled a few times. Scheduling strategies based on feedback cannot be effective in these cases because they do not have the opportunity to measure the metrics that underlie the algorithm. Note that the problem of feedback convergence (reacting too slowly to scheduling events) is not specific to short lived threads but can also occur with threads that show drastic changes in scheduling, e.g., threads running for long periods of time and then suddenly blocking and unblocking quickly and repeatedly.
+In the context of operating systems, these concerns can be overshadowed by a more pressing concern : security. When multiple users are involved, it is possible that some users are malevolent and try to exploit the scheduling strategy in order to achieve some nefarious objective. Security concerns mean that more precise and robust fairness metrics must be used to guarantee fairness across processes created by users as well as threads created within a process. In the case of the \CFA scheduler, every thread runs in the same user-space and is controlled by the same user. Fairness across users is therefore a given and it is then possible to safely ignore the possibility that threads are malevolent. This approach allows for a much simpler fairness metric and in this proposal ``fairness'' is considered as follows : when multiple threads are cycling through the system, the total ordering of threads being scheduled, i.e., pushed onto the ready-queue, should not differ much from the total ordering of threads being executed, i.e., popped from the ready-queue.
+Since feedback is not necessarily feasible within the lifetime of all threads and a simple fairness metric can be used, the scheduling strategy proposed for the \CFA runtime does not use per-threads feedback. Feedback in general is not rejected for secondary concerns like idle sleep for kernel threads, but no feedback is used to decide which thread to run next.
+While these two assumptions generally hold for operating systems, they may not for user-level threading.
+Since \CFA has the explicit goal of allowing many smaller threads, this can naturally lead to threads with much shorter lifetimes that are only scheduled a few times.
+Scheduling strategies based on feedback cannot be effective in these cases because there is no opportunity to measure the metrics that underlie the algorithm.
+Note, the problem of \newterm{feedback convergence} (reacting too slowly to scheduling events) is not specific to short lived threads but can also occur with threads that show drastic changes in scheduling, \eg threads running for long periods of time and then suddenly blocking and unblocking quickly and repeatedly.
+In the context of operating systems, these concerns can be overshadowed by a more pressing concern : security.
+When multiple users are involved, it is possible some users are malevolent and try to exploit the scheduling strategy to achieve some nefarious objective.
+Security concerns mean more precise and robust fairness metrics must be used to guarantee fairness across processes created by users as well as threads created within a process.
+In the case of the \CFA scheduler, every thread runs in the same user space and is controlled by the same user.
+Fairness across users is therefore a given and it is then possible to safely ignore the possibility that threads are malevolent.
+This approach allows for a much simpler fairness metric and in this proposal \emph{fairness} is defined as: when multiple threads are cycling through the system, the total ordering of threads being scheduled, \ie pushed onto the ready-queue, should not differ much from the total ordering of threads being executed, \ie popped from the ready-queue.
+Since feedback is not necessarily feasible within the lifetime of all threads and a simple fairness metric can be used, the scheduling strategy proposed for the \CFA runtime does not use per-threads feedback.
+Feedback in general is not rejected for secondary concerns like idle sleep for kernel threads, but no feedback is used to decide which thread to run next.
 \subsection{Priority Schedulers}
+Another broad category of schedulers are priority schedulers. In these scheduling strategies, threads have priorities and the runtime schedules the threads with the highest priority before scheduling other threads. Threads with equal priority are scheduled using a secondary strategy, often something simple like round-robin or FIFO. These priority mean that, as long as there is a thread with a higher priority that desires to run, a thread with a lower priority does not run. This possible starving of threads can dramatically increase programming complexity since starving threads and priority inversion (prioritizing a lower priority thread) can both lead to serious problems.
+An important observation to make is that threads do not need to have explicit priorities for problems to occur. Indeed, any system with multiple ready-queues and attempts to exhaust one queue before accessing the other queues, can encounter starvation problems. A popular scheduling strategy that suffers from implicit priorities is work-stealing. Work-stealing is generally presented as follows, each processor has a list of ready threads.
+\begin{enumerate}
+        \item Run threads from ``this'' processor's list first.
+        \item If ``this'' processor's list is empty, run threads from some other processor's list.
+\end{enumerate}
+In a loaded system\footnote{A loaded system is a system where threads are being run at the same rate they are scheduled.}, if a thread does not yield, block or preempt for an extended period of time, threads on the same processor's list starve if no other processors exhaust their list.
+Since priorities can be complex for programmers to handle, the scheduling strategy proposed for the \CFA runtime does not use a strategy with either implicit or explicit thread priorities.
+Another broad category of schedulers are priority schedulers.
+In these scheduling strategies, threads have priorities and the runtime schedules the threads with the highest priority before scheduling other threads.
+Threads with equal priority are scheduled using a secondary strategy, often something simple like round-robin or FIFO.
+A consequence of priority is that, as long as there is a thread with a higher priority that desires to run, a thread with a lower priority does not run.
+This possible starving of threads can dramatically increase programming complexity since starving threads and priority inversion (prioritizing a lower priority thread) can both lead to serious problems.
+An important observation is that threads do not need to have explicit priorities for problems to occur.
+Indeed, any system with multiple ready-queues that attempts to exhaust one queue before accessing the other queues, essentially provide implicit priority, which can encounter starvation problems.
+For example, a popular scheduling strategy that suffers from implicit priorities is work stealing.
+\newterm{Work stealing} is generally presented as follows:
+\begin{enumerate}
+        \item Each processor has a list of ready threads.
+        \item Each processor runs threads from its ready queue first.
+        \item If a processor's ready queue is empty, attempt to run threads from some other processor's ready queue.
+\end{enumerate}
+In a loaded system\footnote{A \newterm{loaded system} is a system where threads are being run at the same rate they are scheduled.}, if a thread does not yield, block, or preempt for an extended period of time, threads on the same processor's list starve if no other processors exhaust their list.
+Since priorities can be complex for programmers to incorporate into their execution intuition, the scheduling strategy proposed for the \CFA runtime does not use a strategy with either implicit or explicit thread priorities.
 \subsection{Schedulers without feedback or priorities}
+This proposal conjectures that is is possible to construct a default scheduler for the \CFA runtime that offers good scalability and a simple fairness guarantee that is easy for programmers to reason about. The simplest fairness guarantee is FIFO ordering, i.e., threads scheduled first run first. However, enforcing FIFO ordering generally conflicts with scalability across multiple processors because of the additionnal synchronization. Thankfully, strict FIFO is not needed for sufficient fairness. Since concurrency is inherently non-deterministic, fairness concerns in scheduling are only a problem if a thread repeatedly runs before another thread can run. This relaxation is possible because the non-determinism means that programmers must already handle ordering problems in order to produce correct code and already must rely on weak guarantees, for example that a specific thread will \emph{eventually} run. Since some reordering does not break correctness, the FIFO fairness guarantee can be significantly relaxed without causing problems. For this proposal, the target guarantee is that the \CFA scheduler provides \emph{probable} FIFO ordering, which allows reordering but makes it improbable that threads are reordered far from their position in total ordering.
+Scheduling is defined as follows :
+This proposal conjectures that is is possible to construct a default scheduler for the \CFA runtime that offers good scalability and a simple fairness guarantee that is easy for programmers to reason about.
+The simplest fairness guarantee is FIFO ordering, \ie threads scheduled first run first.
+However, enforcing FIFO ordering generally conflicts with scalability across multiple processors because of the additional synchronization.
+Thankfully, strict FIFO is not needed for sufficient fairness.
+Since concurrency is inherently non-deterministic, fairness concerns in scheduling are only a problem if a thread repeatedly runs before another thread can run.
+Some relaxation is possible because non-determinism means programmers already handle ordering problems to produce correct code and hence rely on weak guarantees, \eg that a specific thread will \emph{eventually} run.
+Since some reordering does not break correctness, the FIFO fairness guarantee can be significantly relaxed without causing problems.
+For this proposal, the target guarantee is that the \CFA scheduler provides \emph{probable} FIFO ordering, which allows reordering but makes it improbable that threads are reordered far from their position in total ordering.
+The \CFA scheduler fairness is defined as follows:
 \begin{itemize}
         \item Given two threads $X$ and $Y$, the odds that thread $X$ runs $N$ times \emph{after} thread $Y$ is scheduled but \emph{before} it is run, decreases exponentially with regard to $N$.
 \end{itemize}
 While this is not a bounded guarantee, the probability that unfairness persist for long periods of times decreases exponentially, making persisting unfairness virtually impossible.
 % ===============================================================================
 % ===============================================================================
+\section{Proposal}
+\subsection{Ready-Queue} \label{sec:queue}
+A simple ready-queue can be built from a FIFO queue, where user-threads are pushed onto the queue when they are ready to run, and processors (kernel-threads acting as virtual processors) pop the user-threads from the queue and execute them. Using the paper\cite{alistarh2018relaxed} as a basis, it is simple to build a relaxed FIFO list that is fast and scalable for loaded or overloaded systems. The described queue uses an array of underlying strictly FIFO queues as shown in Figure~\ref{fig:base}\footnote{For this section, the number of underlying queues is assumed to be constant. Section~\ref{sec:resize} discusses resizing the array.}. Pushing new data is done by selecting one of these underlying queues at random, recording a timestamp for the operation and pushing to the selected queue. Popping is done by selecting two queues at random and popping from the queue with the oldest timestamp. A higher number of underlying queues leads to less contention on each queue and therefore better performance. In a loaded system, it is highly likely the queues are non-empty, i.e., several tasks are on each of the underlying queues. This means that selecting a queue at random to pop from is highly likely to yield a queue with available items. In Figure~\ref{fig:base}, ignoring the ellipsis, the chances of getting an empty queue is 2/7 per pick, meaning two random picks yield an item approximately 9 times out of 10.
+\section{Proposal Details}
+\subsection{Central Ready Queue} \label{sec:queue}
+A central ready queue can be built from a FIFO queue, where user threads are pushed onto the queue when they are ready to run, and processors (kernel-threads acting as virtual processors) pop the user threads from the queue and execute them.
+Alistarh \etal~\cite{alistarh2018relaxed} show it is straightforward to build a relaxed FIFO list that is fast and scalable for loaded or overloaded systems.
+The described queue uses an array of underlying strictly FIFO queues as shown in Figure~\ref{fig:base}\footnote{For this section, the number of underlying queues is assumed to be constant.
+Section~\ref{sec:resize} discusses resizing the array.}.
+Pushing new data is done by selecting one of these underlying queues at random, recording a timestamp for the operation and pushing to the selected queue.
+Popping is done by selecting two queues at random and popping from the queue with the oldest timestamp.
+A higher number of underlying queues leads to less contention on each queue and therefore better performance.
+In a loaded system, it is highly likely the queues are non-empty, \ie several tasks are on each of the underlying queues.
+This means that selecting a queue at random to pop from is highly likely to yield a queue with available items.
+In Figure~\ref{fig:base}, ignoring the ellipsis, the chances of getting an empty queue is 2/7 per pick, meaning two random picks yield an item approximately 9 times out of 10.
 \begin{figure}
 …
                 \input{base}
         \end{center}
+        \caption{Relaxed FIFO list at the base of the scheduler: an array of strictly FIFO lists. The timestamp is in all nodes and cell arrays.}
+        \caption{Relaxed FIFO list at the base of the scheduler: an array of strictly FIFO lists.
+The timestamp is in all nodes and cell arrays.}
         \label{fig:base}
 \end{figure}
 …
 \end{figure}
+When the ready queue is \emph{more empty}, i.e., several of the queues are empty, selecting a random queue for popping is less likely to yield a valid selection and more attempts need to be made, resulting in a performance degradation. Figure~\ref{fig:empty} shows an example with fewer elements where the chances of getting an empty queue is 5/7 per pick, meaning two random picks yield an item only half the time. Since the ready queue is not empty, the pop operation \emph{must} find an element before returning and therefore must retry. Overall performance is therefore influenced by the contention on the underlying queues and pop performance is influenced by the item density. This leads to four performance cases, as depicted in Table~\ref{tab:perfcases}.
+When the ready queue is \emph{more empty}, \ie several of the queues are empty, selecting a random queue for popping is less likely to yield a successful selection and more attempts are needed, resulting in a performance degradation.
+Figure~\ref{fig:empty} shows an example with fewer elements, where the chances of getting an empty queue is 5/7 per pick, meaning two random picks yield an item only half the time.
+Since the ready queue is not empty, the pop operation \emph{must} find an element before returning and therefore must retry.
+Note, the popping kernel thread has no work to do, but CPU cycles are wasted both for available user and kernel threads during the pop operation as the popping thread is using a CPU.
+Overall performance is therefore influenced by the contention on the underlying queues and pop performance is influenced by the item density.
+This leads to four performance cases for the centralized ready-queue, as depicted in Table~\ref{tab:perfcases}.
+The number of processors (many or few) refers to the number of kernel threads \emph{actively} attempting to pop user threads from the queues, not the total number of kernel threads.
+The number of threads (many or few) refers to the number of user threads ready to be run.
+Many threads means they outnumber processors significantly and most underlying queues have items, few threads mean there are barely more threads than processors and most underlying queues are empty.
+Cases with fewer threads than processors are discussed in Section~\ref{sec:sleep}.
 \begin{table}
 …
                         Many Threads & A: good performance & B: good performance \\
                         \hline
                         Few Threads  & C: poor performance & D: poor performance \\
+                        Few Threads  & C: worst performance & D: poor performance \\
                         \hline
                 \end{tabular}
         \end{center}
         \caption{Performance of the relaxed FIFO list in different cases. The number of processors (many or few) refers to the number of kernel-threads \emph{actively} attempting to pop user-threads from the queues, not the total number of kernel-threads. The number of threads (many or few) refers to the number of user-threads ready to be run. Many threads means they outnumber processors significantly and most underlying queues have items, few threads mean there are barely more threads than processors and most underlying queues are empty. Cases with fewer threads than processors are discussed in Section~\ref{sec:sleep}.}
+        \caption{Expected performance of the relaxed FIFO list in different cases.}
         \label{tab:perfcases}
 \end{table}
+Table~\ref{tab:perfcases}
+Performance can be improved in case~D (Table~\ref{tab:perfcases}) by adding information to help processors find which inner queues are used. This addition aims to avoid the cost of retrying the pop operation but does not affect contention on the underlying queues and can incur some management cost for both push and pop operations. The approach used to encode this information can vary in density and be either global or local, where density means the information is either packed in few cachelines or spread across several cachelines, and local information means each thread uses an independent copy instead of a single global, i.e., common, source of information.
+For example, bitmask can be used to identify which inner queues are currently in use, as shown in Figure~\ref{fig:emptybit}. This means that processors can often find user-threads in constant time, regardless of how many underlying queues are empty. Furthermore, modern x86 CPUs have extended bit manipulation instructions (BMI2) which allow using the bitmask with very little overhead compared to the randomized selection approach for a filled readyqueue, offerring decent performance even in cases with many empty inner queues. However, this technique has its limits: with a single word\footnote{Word refers here to however many bits can be written atomicly.} bitmask, the total number of underlying queues in the ready queue is limited to the number of bits in the word. With a multi-word bitmask, this maximum limit can be increased arbitrarily, but it is not possible to check if the queue is empty by reading the bitmask atomicly.
+Finally, a dense bitmap, either single or multi-word, causes additional problems
+in case C (Table 1), because many processors are continuously scanning the
+bitmask to find the few available threads. This increased contention on the
+bitmask(s) reduces performance because of cache misses and the bitmask is
+updated more frequently by the scanning processors racing to read and/or update
+that information. This increased update frequency means the information in the
+bitmask will more often be stale before a processor can use it to find an item.
+Performance can be improved in case~D (Table~\ref{tab:perfcases}) by adding information to help processors find which inner queues are used.
+This addition aims to avoid the cost of retrying the pop operation but does not affect contention on the underlying queues and can incur some management cost for both push and pop operations.
+The approach used to encode this information can vary in density and be either global or local.
+\newterm{Density} means the information is either packed in a few cachelines or spread across several cachelines, and \newterm{local information} means each thread uses an independent copy instead of a single global, \ie common, source of information.
+For example, Figure~\ref{fig:emptybit} shows a dense bitmask to identify which inner queues are currently in use.
+This approach means processors can often find user threads in constant time, regardless of how many underlying queues are empty.
+Furthermore, modern x86 CPUs have extended bit manipulation instructions (BMI2) that allow using the bitmask with very little overhead compared to the randomized selection approach for a filled ready queue, offering good performance even in cases with many empty inner queues.
+However, this technique has its limits: with a single word\footnote{Word refers here to however many bits can be written atomically.} bitmask, the total number of underlying queues in the ready queue is limited to the number of bits in the word.
+With a multi-word bitmask, this maximum limit can be increased arbitrarily, but it is not possible to check if the queue is empty by reading the bitmask atomically.
+Finally, a dense bitmap, either single or multi-word, causes additional problems in case C (Table 1), because many processors are continuously scanning the bitmask to find the few available threads.
+This increased contention on the bitmask(s) reduces performance because of cache misses after updates and the bitmask is updated more frequently by the scanning processors racing to read and/or update that information.
+This increased update frequency means the information in the bitmask is more often stale before a processor can use it to find an item, \ie mask read says there are available user threads but none on queue.
 \begin{figure}
 …
 \end{figure}
+Another approach is to use a hiearchical data structure, for example Figure~\ref{fig:emptytree}. Creating a tree of nodes to reduce contention has been shown to work in similar cases\cite{ellen2007snzi}\footnote{This particular paper seems to be patented in the US. How does that affect \CFA? Can I use it in my work?}. However, this approach may lead to poorer performance in case~B (Table~\ref{tab:perfcases}) due to the inherent pointer chasing cost and already low contention cost in that case.
+Figure~\ref{fig:emptytree} shows another approach using a hierarchical tree data-structure to reduce contention and has been shown to work in similar cases~\cite{ellen2007snzi}\footnote{This particular paper seems to be patented in the US.
+How does that affect \CFA? Can I use it in my work?}.
+However, this approach may lead to poorer performance in case~B (Table~\ref{tab:perfcases}) due to the inherent pointer chasing cost and already low contention cost in that case.
 \begin{figure}
 …
 \end{figure}
+Finally, a third approach is to use dense information, similar to the bitmap, but have each thread keep its own independant copies of it. While this approach can offer good scalability \emph{and} low latency, the livelyness of the information can become a problem. In the simple cases, local copies of which underlying queues are empty can become stale and end-up not being useful for the pop operation. A more serious problem is that reliable information is necessary for some parts of this algorithm to be correct. As mentionned in this section, processors must know \emph{reliably} whether the list is empty or not to decide if they can return \texttt{NULL} or if they must keep looking during a pop operation. Section~\ref{sec:sleep} discusses another case where reliable information is required for the algorithm to be correct.
+\begin{figure}
+        \begin{center}
+                {\resizebox{0.8\textwidth}{!}{\input{emptytls}}}
+Finally, a third approach is to use dense information, similar to the bitmap, but have each thread keep its own independent copy of it.
+While this approach can offer good scalability \emph{and} low latency, the liveliness of the information can become a problem.
+In the simple cases, local copies of which underlying queues are empty can become stale and end-up not being useful for the pop operation.
+A more serious problem is that reliable information is necessary for some parts of this algorithm to be correct.
+As mentioned in this section, processors must know \emph{reliably} whether the list is empty or not to decide if they can return \texttt{NULL} or if they must keep looking during a pop operation.
+Section~\ref{sec:sleep} discusses another case where reliable information is required for the algorithm to be correct.
+\begin{figure}
+        \begin{center}
+                \input{emptytls}
         \end{center}
         \caption{``More empty'' queue with added per processor bitmask to indicate which array cells have items.}
 …
 \end{figure}
+There is a fundamental tradeoff among these approach. Dense global information about empty underlying queues helps zero-contention cases at the cost of high-contention case. Sparse global information helps high-contention cases but increases latency in zero-contention-cases, to read and ``aggregate'' the information\footnote{Hiearchical structures, e.g., binary search tree, effectively aggregate information but following pointer chains, learning information for each node. Similarly, other sparse schemes would need to read multiple cachelines to acquire all the information needed.}. Finally, dense local information has both the advantages of low latency in zero-contention cases and scalability in high-contention cases, however the information can become stale making it difficult to use to ensure correctness. The fact that these solutions have these fundamental limits suggest to me that a better solution combines these properties in an interesting ways. The lock discussed in Section~\ref{sec:resize} also allows for solutions that adapt to the number of processors, which could also prove useful.
+There is a fundamental tradeoff among these approach.
+Dense global information about empty underlying queues helps zero-contention cases at the cost of high-contention case.
+Sparse global information helps high-contention cases but increases latency in zero-contention-cases, to read and ``aggregate'' the information\footnote{Hierarchical structures, \eg binary search tree, effectively aggregate information but follow pointer chains, learning information at each node.
+Similarly, other sparse schemes need to read multiple cachelines to acquire all the information needed.}.
+Finally, dense local information has both the advantages of low latency in zero-contention cases and scalability in high-contention cases, however the information can become stale making it difficult to use to ensure correctness.
+The fact that these solutions have these fundamental limits suggest to me a better solution that attempts to combine these properties in an interesting ways.
+Also, the lock discussed in Section~\ref{sec:resize} allows for solutions that adapt to the number of processors, which could also prove useful.
 \paragraph{Objectives and Existing Work}
+How much scalability is actually needed is highly debatable \emph{libfibre}\cit has compared favorably to other schedulers in webserver tests\cit and uses a single atomic counter in its scheduling algorithm similarly to the proposed bitmask. As such, the single atomic instruction on a shared cacheline may be sufficiently performant.
+I have built a prototype of this ready-queue in the shape of a data-queue, i.e., nodes on the queue are structures with a single int and the intrusive data fields. Using this prototype I ran preliminary performance experiments which confirm the expected performance in Table~\ref{tab:perfcases}. However, these experiments only offer a hint at the actual performance of the scheduler since threads form more complex operations than simple integer nodes, e.g., threads are not independant of each other, when a thread blocks some other thread must intervene to wake it.
+I have also integrated this prototype into the \CFA runtime, but have not yet created performance experiments to compare results. As creating one-to-one comparisons with the prototype will be complex.
+How much scalability is actually needed is highly debatable.
+\emph{libfibre}\cite{libfibre} has compared favorably to other schedulers in webserver tests\cite{karstenuser} and uses a single atomic counter in its scheduling algorithm similarly to the proposed bitmask.
+As such, the single atomic instruction on a shared cacheline may be sufficiently performant.
+I have built a prototype of this ready queue in the shape of a data queue, \ie nodes on the queue are structures with a single int representing a thread and intrusive data fields.
+Using this prototype I ran preliminary performance experiments that confirm the expected performance in Table~\ref{tab:perfcases}.
+However, these experiments only offer a hint at the actual performance of the scheduler since threads form more complex operations than simple integer nodes, \eg threads are not independent of each other, when a thread blocks some other thread must intervene to wake it.
+I have also integrated this prototype into the \CFA runtime, but have not yet created performance experiments to compare results, as creating one-to-one comparisons between the prototype and the \CFA runtime will be complex.
 \subsection{Dynamic Resizing} \label{sec:resize}
 \begin{figure}
         \begin{center}
 …
 \end{figure}
+The \CFA runtime system groups processors together as clusters. Threads on a cluster are always scheduled on one of the processors of the cluster. Currently, the runtime handles dynamically adding and removing processors from clusters at any time. Since this is part of the existing design, the proposed scheduler must also support this behaviour. However, dynamicaly resizing a cluster is considered a rare event associated with setup, teardown and major configuration changes. This assumption is made both in the design of the proposed scheduler as well as in the original design of the \CFA runtime system. As such, the proposed scheduler must honor the correctness of these behaviour but does not have any performance objectives with regard to resizing a cluster. How long adding or removing processors take and how much this disrupts the performance of other threads is considered a secondary concern since it should be amortized over long period of times. However, as mentionned in Section~\ref{sec:queue}, contention on the underlying queues can have a direct impact on performance. The number of underlying queues must therefore be adjusted as the number of processors grows or shrinks. Since the underlying queues are stored in a dense array, changing the number of queues requires resizing the array and expanding the array requires moving it. This can introduce memory reclamation problems if not done correctly.
+The \CFA runtime system groups processors together as \newterm{clusters}, as shown in Figure~\ref{fig:system}.
+Threads on a cluster are always scheduled on one of the processors of the cluster.
+Currently, the runtime handles dynamically adding and removing processors from clusters at any time.
+Since this is part of the existing design, the proposed scheduler must also support this behaviour.
+However, dynamically resizing a cluster is considered a rare event associated with setup, tear down and major configuration changes.
+This assumption is made both in the design of the proposed scheduler as well as in the original design of the \CFA runtime system.
+As such, the proposed scheduler must honour the correctness of this behaviour but does not have any performance objectives with regard to resizing a cluster.
+How long adding or removing processors take and how much this disrupts the performance of other threads is considered a secondary concern since it should be amortized over long period of times.
+However, as mentioned in Section~\ref{sec:queue}, contention on the underlying queues can have a direct impact on performance.
+The number of underlying queues must therefore be adjusted as the number of processors grows or shrinks.
+Since the underlying queues are stored in a dense array, changing the number of queues requires resizing the array and expanding the array requires moving it, which can introduce memory reclamation problems if not done correctly.
 \begin{figure}
 …
                 \input{resize}
         \end{center}
         \caption{Copy of data structure shown in Figure~\ref{fig:base}. }
+        \caption{Copy of data structure shown in Figure~\ref{fig:base}.}
         \label{fig:base2}
 \end{figure}
+It is important to note how the array is used in this case. While the array cells are modified by every push and pop operation, the array itself, i.e., the pointer that would change when resized, is only read during these operations. Therefore the use of this pointer can be described as frequent reads and infrequent writes. This description effectively matches with the description of a Reader-Writer lock, infrequent but invasive updates among frequent read operations. In the case of the Ready-Queue described above, read operations are operations that push or pop from the ready-queue but do not invalidate any references to the ready queue data structures. Writes on the other-hand would add or remove inner queues, invalidating references to the array of inner queues in the process. Therefore, the current proposed approach to this problem is to add a per-cluster Reader Writer lock around the ready queue to prevent restructuring of the ready-queue data structure while threads are being pushed or popped.
+There are possible alternatives to the Reader Writer lock solution. This problem is effectively a memory reclamation problem and as such there is a large body of research on the subject\cit. However, the RWlock solution is simple and can be leveraged to solve other problems (e.g., processor ordering and memory reclamation of threads), which makes it an attractive solution.
+It is important to note how the array is used in this case.
+While the array cells are modified by every push and pop operation, the array itself, \ie the pointer that would change when resized, is only read during these operations.
+Therefore the use of this pointer can be described as frequent reads and infrequent writes.
+This description effectively matches with the description of a reader-writer lock, infrequent but invasive updates among frequent read operations.
+In the case of the ready queue described above, read operations are operations that push or pop from the ready queue but do not invalidate any references to the ready queue data structures.
+Writes on the other hand would add or remove inner queues, invalidating references to the array of inner queues in a process.
+Therefore, the current proposed approach to this problem is to add a per-cluster reader-writer lock around the ready queue to prevent restructuring of the ready-queue data-structure while threads are being pushed or popped.
+There are possible alternatives to the reader-writer lock solution.
+This problem is effectively a memory reclamation problem and as such there is a large body of research on the subject\cite{michael2004hazard, brown2015reclaiming}.
+However, the reader-write lock-solution is simple and can be leveraged to solve other problems (\eg processor ordering and memory reclamation of threads), which makes it an attractive solution.
 \paragraph{Objectives and Existing Work}
+The lock must offer scalability and performance on par with the actual ready-queue in order not to introduce a new bottleneck. I have already built a lock that fits the desired requirements and preliminary testing show scalability and performance that exceed the target. As such, I do not consider this lock to be a risk on this project.
+The lock must offer scalability and performance on par with the actual ready-queue in order not to introduce a new bottleneck.
+I have already built a lock that fits the desired requirements and preliminary testing show scalability and performance that exceed the target.
+As such, I do not consider this lock to be a risk for this project.
 \subsection{Idle Sleep} \label{sec:sleep}
+As mentioned, idle sleep is the process of putting processors to sleep when they have no threads to execute. In this context, processors are kernel-threads and sleeping refers to asking the kernel to block a thread. This benefit can be achieved with either thread synchronization operations like pthread\_cond\_wait or using signal operations like sigsuspend. The goal of putting idle processors to sleep is two-fold, it reduces energy consumption in cases where more idle kernel-threads translate to idle hardware threads, and reduces contention on the ready queue, since the otherwise idle processors generally contend trying to pop items from the queue.
+Support for idle sleep broadly involves calling the operating system to block the kernel thread and handling the race between a blocking thread  and the waking thread, and handling which kernel thread should sleep or wake-up.
+When a processor decides to sleep, there is a race that occurs between it signalling that is going to sleep (so other processors can find sleeping processors) and actually blocking the kernel thread. This is equivalent to the classic problem of missing signals when using condition variables: the ``sleepy'' processor indicates that it will sleep but has not yet gone to sleep, when another processor attempts to wake it up, the waking-up operation may claim nothing needs to be done and the signal is missed. In cases where threads are scheduled from processors on the current cluster, loosing signals is not necessarily critical, because at least some processors on the cluster are awake and may check for more processors eventually. Individual processors always finish scheduling threads before looking for new work, which means that the last processor to go to sleep cannot miss threads scheduled from inside the cluster (if they do, that demonstrates the ready-queue is not linearizable). However, this guarantee does not hold if threads are scheduled from outside the cluster, either due to an external event like timers and I/O, or due to a thread migrating from a different cluster. In this case, missed signals can lead to the cluster deadlocking where it should not\footnote{Clusters ``should'' never deadlock, but for this proposal, cases where \CFA users \emph{actually} write \CFA code that leads to a deadlock is considered as a deadlock that ``should'' happen. }. Therefore, it is important that the scheduling of threads include a mechanism where signals \emph{cannot} be missed. For performance reasons, it can be advantageous to have a secondary mechanism that allows signals to be missed in cases where it cannot lead to a deadlock. To be safe, this process must include a ``handshake'' where it is guaranteed that either~: the sleepy processor notices that a thread is scheduled after it signalled its intent to block or code scheduling threads sees the intent to sleep before scheduling and be able to wake-up the processor. This matter is complicated by the fact that pthreads offers few tools to implement this solution and offers no guarantee of ordering of threads waking up for most of these tools.
+Another issues is trying to avoid kernel threads sleeping and waking frequently. A possible partial solution is to order the processors so that the one which most recently went to sleep is woken up. This allows other sleeping processors to reach deeper sleep state (when these are available) while keeping ``hot'' processors warmer. Note that while this generally means organising the processors in a stack, I believe that the unique index provided by the ReaderWriter lock can be reused to strictly order the waking order of processors, causing a LIFO like waking order. While a strict LIFO stack is probably better, using the processor index could prove useful and offer a sufficiently LIFO ordering.
+Finally, another important aspect of Idle Sleep is when should processors make the decision to sleep and when is it appropriate for sleeping processors to be woken up. Processors that are unnecessarily unblocked lead to unnecessary contention and power consumption, while too many sleeping processors can lead to sub-optimal throughput. Furthermore, transitions from sleeping to awake and vice-versa also add unnecessary latency. There is already a wealth of research on the subject and I may use an existing approach for the Idle Sleep heuristic in this project.
+\newterm{Idle sleep} is the process of putting processors to sleep when they have no threads to execute.
+In this context, processors are kernel threads and sleeping refers to asking the kernel to block a thread.
+This operation can be achieved with either thread synchronization operations like $pthread_cond_wait$ or using signal operations like $sigsuspend$.
+The goal of putting idle processors to sleep is:
+\begin{enumerate}
+\item
+reduce contention on the ready queue, since the otherwise idle processors generally contend trying to pop items from the queue,
+\item
+give back unneeded CPU time associated with a process to other user processors executing on the computer,
+\item
+and reduce energy consumption in cases where more idle kernel-threads translate to idle CPUs, which can cycle down.
+\end{enumerate}
+Support for idle sleep broadly involves calling the operating system to block the kernel thread and handling the race between a blocking thread and the waking thread, and handling which kernel thread should sleep or wake up.
+When a processor decides to sleep, there is a race that occurs between it signalling that is going to sleep (so other processors can find sleeping processors) and actually blocking the kernel thread.
+This operation is equivalent to the classic problem of missing signals when using condition variables: the ``sleepy'' processor indicates its intention to block but has not yet gone to sleep when another processor attempts to wake it up.
+The waking-up operation sees the blocked process and signals it, but the blocking process is racing to sleep so the signal is missed.
+In cases where kernel threads are managed as processors on the current cluster, loosing signals is not necessarily critical, because at least some processors on the cluster are awake and may check for more processors eventually.
+Individual processors always finish scheduling user threads before looking for new work, which means that the last processor to go to sleep cannot miss threads scheduled from inside the cluster (if they do, that demonstrates the ready queue is not linearizable).
+However, this guarantee does not hold if threads are scheduled from outside the cluster, either due to an external event like timers and I/O, or due to a user (or kernel) thread migrating from a different cluster.
+In this case, missed signals can lead to the cluster deadlocking\footnote{Clusters should only deadlock in cases where a \CFA programmer \emph{actually} write \CFA code that leads to a deadlock.}.
+Therefore, it is important that the scheduling of threads include a mechanism where signals \emph{cannot} be missed.
+For performance reasons, it can be advantageous to have a secondary mechanism that allows signals to be missed in cases where it cannot lead to a deadlock.
+To be safe, this process must include a ``handshake'' where it is guaranteed that either~: the sleeping processor notices that a user thread is scheduled after the sleeping processor signalled its intent to block or code scheduling threads sees the intent to sleep before scheduling and be able to wake-up the processor.
+This matter is complicated by the fact that pthreads and Linux offer few tools to implement this solution and no guarantee of ordering of threads waking up for most of these tools.
+Another important issue is avoiding kernel threads sleeping and waking frequently because there is a significant operating-system cost.
+This scenario happens when a program oscillates between high and low activity, needing most and then less processors.
+A possible partial solution is to order the processors so that the one which most recently went to sleep is woken up.
+This allows other sleeping processors to reach deeper sleep state (when these are available) while keeping ``hot'' processors warmer.
+Note that while this generally means organizing the processors in a stack, I believe that the unique index provided in my reader-writer lock can be reused to strictly order the waking processors, causing a mostly LIFO order.
+While a strict LIFO stack is probably better, the processor index could prove useful for other reasons, while still offering a sufficiently LIFO ordering.
+A final important aspect of idle sleep is when should processors make the decision to sleep and when is it appropriate for sleeping processors to be woken up.
+Processors that are unnecessarily unblocked lead to unnecessary contention, CPU usage, and power consumption, while too many sleeping processors can lead to sub-optimal throughput.
+Furthermore, transitions from sleeping to awake and vice-versa also add unnecessary latency.
+There is already a wealth of research on the subject\cite{schillings1996engineering, wiki:thunderherd} and I may use an existing approach for the idle-sleep heuristic in this project, \eg\cite{karstenuser}.
 \subsection{Asynchronous I/O}
+The final aspect of this proposal is asynchronous I/O. Without it, user threads that execute I/O operations block the underlying kernel thread, which leads to poor throughput. It would be preferrable to block the user-thread and reuse the underlying kernel-thread to run other ready threads. This approach requires intercepting the user-threads' calls to I/O operations, redirecting them to an asynchronous I/O interface and handling the multiplexing between the synchronous and asynchronous API. As such, these are the three components needed to implemented support for asynchronous I/O : an OS abstraction layer over the asynchronous interface, an event-engine to (de)multiplex the operations and a synchronous interface for users to use. None of these components currently exist in \CFA and I will need to build all three for this project.
+The final aspect of this proposal is asynchronous I/O.
+Without it, user threads that execute I/O operations block the underlying kernel thread, which leads to poor throughput.
+It is preferable to block the user thread performing the I/O and reuse the underlying kernel-thread to run other ready user threads.
+This approach requires intercepting user-thread calls to I/O operations, redirecting them to an asynchronous I/O interface, and handling the multiplexing/demultiplexing between the synchronous and asynchronous API.
+As such, there are three components needed to implemented support for asynchronous I/O:
+\begin{enumerate}
+\item
+an OS abstraction layer over the asynchronous interface,
+\item
+an event-engine to (de)multiplex the operations,
+\item
+and a synchronous interface for users to use.
+\end{enumerate}
+None of these components currently exist in \CFA and I will need to build all three for this project.
 \paragraph{OS Abstraction}
+One fundamental part for converting blocking I/O operations into non-blocking ones is having an underlying asynchronous I/O interface to direct the I/O operations. While there exists many different APIs for asynchronous I/O, it is not part of this proposal to create a novel API. I plan to use an existing one that is sufficient. uC++ uses the \texttt{select} as its interface, which handles ttys, pipes and sockets, but not disk. It entails significant complexity and is being replaced, which make it a less interesting alternative. Another popular interface is \texttt{epoll}\cit, which is supposed to be cheaper than \texttt{select}. However, epoll also does not handle the file system and seems to have problem to linux pipes and \texttt{TTY}s\cit. A very recent alternative that must still be investigated is \texttt{io\_uring}. It claims to address some of the issues with \texttt{epoll} but is too recent to be confident that it does. Finally, a popular cross-platform alternative is \texttt{libuv}, which offers asynchronous sockets and asynchronous file system operations (among other features). However, as a full-featured library it includes much more than what is needed and could conflict with other features of \CFA unless significant effort is made to merge them together.
+\paragraph{Event-Engine}
+Laying on top of the asynchronous interface layer is the event-engine. This engine is responsible for multiplexing (batching) the synchronous I/O requests into an asynchronous I/O request and demultiplexing the results onto appropriate blocked threads. This step can be straightforward for the simple cases, but can become quite complex. Decisions that need to be made include : whether to poll from a seperate kernel thread or a regularly scheduled user thread, what should be the ordering used when results satisfy many requests, how to handle threads waiting for multiple operations, etc.
+One fundamental part for converting blocking I/O operations into non-blocking ones is having an underlying asynchronous I/O interface to direct the I/O operations.
+While there exists many different APIs for asynchronous I/O, it is not part of this proposal to create a novel API.
+It is sufficient to make one work in the complex context of the \CFA runtime.
+\uC uses the $select$\cite{select} as its interface, which handles ttys, pipes and sockets, but not disk.
+$select$ entails significant complexity and is being replaced in UNIX operating-systems, which make it a less interesting alternative.
+Another popular interface is $epoll$\cite{epoll}, which is supposed to be cheaper than $select$.
+However, $epoll$ also does not handle the file system and anectodal evidence suggest it has problem with linux pipes and $TTY$s.
+A popular cross-platform alternative is $libuv$\cite{libuv}, which offers asynchronous sockets and asynchronous file system operations (among other features).
+However, as a full-featured library it includes much more than I need and could conflict with other features of \CFA unless significant effort is made to merge them together.
+A very recent alternative that I am investigating is $io_uring$\cite{io_uring}.
+It claims to address some of the issues with $epoll$ and my early investigating suggest that the claim is accurate.
+$io_uring$ uses a much more general approach where system calls are register to a queue and later executed by the kernel, rather than relying on system calls to return an error instead of blocking and subsequently waiting for changes on file descriptors.
+I believe this approach allows for fewer problems, \eg the manpage for $open$\cite{open} states:
+\begin{quote}
+        Note that [the $O_NONBLOCK$ flag] has no effect for regular files and block devices;
+        that is, I/O operations will (briefly) block when device activity is required, regardless of whether $O_NONBLOCK$ is set.
+        Since $O_NONBLOCK$ semantics might eventually be implemented, applications should not depend upon blocking behavior when specifying this flag for regular files and block devices.
+\end{quote}
+This makes approach based on $epoll$/$select$ less reliable since they may not work for every file descriptors.
+For this reason, I plan to use $io_uring$ as the OS abstraction for the \CFA runtime, unless further work shows problems I haven't encountered yet.
+However, only a small subset of the features are available in Ubuntu as of April 2020\cite{wiki:ubuntu-linux}, which will limit performance comparisons.
+I do not believe this will affect the comparison result.
+\paragraph{Event Engine}
+Laying on top of the asynchronous interface layer is the event engine.
+This engine is responsible for multiplexing (batching) the synchronous I/O requests into asynchronous I/O requests and demultiplexing the results to appropriate blocked user threads.
+This step can be straightforward for simple cases, but becomes quite complex when there are thousands of user threads performing both reads and writes, possibly on overlapping file descriptors.
+Decisions that need to be made include:
+\begin{enumerate}
+\item
+whether to poll from a separate kernel thread or a regularly scheduled user thread,
+\item
+what should be the ordering used when results satisfy many requests,
+\item
+how to handle threads waiting for multiple operations, etc.
+\end{enumerate}
 \paragraph{Interface}
+Finally, for these components to be available, it is necessary to expose them through a synchronous interface. The interface can be novel but it is preferrable to match the existing POSIX interface in order to be compatible with existing code. Matching allows C programs written using this interface to be transparently converted to \CFA with minimal effeort. Where this is not applicable, a novel interface will be created to fill the gaps.
+Finally, for these non-blocking I/O components to be available, it is necessary to expose them through a synchronous interface because that is the \CFA concurrent programming style.
+The interface can be novel but it is preferable to match the existing POSIX interface when possible to be compatible with existing code.
+Matching allows C programs written using this interface to be transparently converted to \CFA with minimal effort.
+Where new functionality is needed, I will create a novel interface to fill gaps and provide advanced features.
 …
 % ===============================================================================
 \section{Discussion}
+I believe that runtime system and scheduling are still open topics.
+Many ``state of the art'' production frameworks still use single threaded event-loops because of performance considerations, \eg \cite{nginx-design}, and, to my knowledge, no wideyl available system language offers modern threading facilities.
+I believe the proposed work offers a novel runtime and scheduling package, where existing work only offers fragments that users must assemble themselves when possible.
 % ===============================================================================
 % ===============================================================================
 \section{Timeline}
+\begin{center}
+\begin{tabular}{ | r @{--} l | p{4in} | }
+\hline May 2020 & October 2020   & Creation of the performance benchmark. \\
+\hline November 2020 & March 2021   & Completion of the implementation. \\
+\hline March 2021 & April 2021  & Final Performance experiments. \\
+\hline May 2021 & August 2021 & Thesis writing and defense. \\
+\hline
+\end{tabular}
+\end{center}
 % B I B L I O G R A P H Y

doc/theses/thierry_delisle_PhD/comp_II/img/base.fig

-              re3bc51c
+              rbcd74f3
 #FIG 3.2  Produced by xfig version 3.2.7a
+#FIG 3.2  Produced by xfig version 3.2.5c
 Landscape
 Center
 Inches
 Letter
+Letter
 .00
 Single
 -2
 2
-2400 3075 3000 4200
-3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
-3335 2850 3075 2550 3075 2400 3335 2550 3595 2850 3595
-3335
-1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
-1 1.00 45.00 90.00
-4200 2700 3600
--6
-2400 2175 3000 3375
-3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
-2435 2850 2175 2550 2175 2400 2435 2550 2695 2850 2695
-2435
-1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
-1 1.00 45.00 90.00
-3375 2700 2700
--6
-3600 2175 4200 3375
-3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
-2435 4050 2175 3750 2175 3600 2435 3750 2695 4050 2695
-2435
-1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
-1 1.00 45.00 90.00
-3375 3900 2700
--6
-3600 3075 4200 4200
-3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
-3335 4050 3075 3750 3075 3600 3335 3750 3595 4050 3595
-3335
-1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
-1 1.00 45.00 90.00
-4200 3900 3600
--6
-4200 3075 4800 4200
-3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
-3335 4650 3075 4350 3075 4200 3335 4350 3595 4650 3595
-3335
-1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
-1 1.00 45.00 90.00
-4200 4500 3600
--6
-4800 3075 5400 4200
-3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
-3335 5250 3075 4950 3075 4800 3335 4950 3595 5250 3595
-3335
-1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
-1 1.00 45.00 90.00
-4200 5100 3600
--6
-4800 2175 5400 3375
-3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
-2435 5250 2175 4950 2175 4800 2435 4950 2695 5250 2695
-2435
-1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
-1 1.00 45.00 90.00
-3375 5100 2700
--6
-4800 1275 5400 2475
-3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
-1535 5250 1275 4950 1275 4800 1535 4950 1795 5250 1795
-1535
-1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
-1 1.00 45.00 90.00
-2475 5100 1800
--6
-6000 2175 6600 3375
-3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
-2435 6450 2175 6150 2175 6000 2435 6150 2695 6450 2695
-2435
-1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
-1 1.00 45.00 90.00
-3375 6300 2700
--6
-6000 3075 6600 4200
-3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
-3335 6450 3075 6150 3075 6000 3335 6150 3595 6450 3595
-3335
-1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
-1 1.00 45.00 90.00
-4200 6300 3600
--6
 6750 4125 7050 4275
 3 0 1 0 0 50 -1 20 0.000 1 0.0000 6825 4200 20 20 6825 4200 6845 4200
 …
 3 0 1 0 0 50 -1 20 0.000 1 0.0000 6975 4200 20 20 6975 4200 6995 4200
 -6
+2400 2100 3000 2700
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2700 2400 300 300 2700 2400 3000 2400
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+2475 3000 2475
+1 0 50 -1 0 11 0.0000 2 120 210 2700 2650 TS\001
+-6
+2400 3000 3000 3600
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2700 3300 300 300 2700 3300 3000 3300
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+3375 3000 3375
+1 0 50 -1 0 11 0.0000 2 120 210 2700 3550 TS\001
+-6
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3900 2400 300 300 3900 2400 4200 2400
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3900 3300 300 300 3900 3300 4200 3300
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 5100 1500 300 300 5100 1500 5400 1500
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 5100 2400 300 300 5100 2400 5400 2400
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 5100 3300 300 300 5100 3300 5400 3300
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 6300 2400 300 300 6300 2400 6600 2400
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 6300 3300 300 300 6300 3300 6600 3300
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 4509 3302 300 300 4509 3302 4809 3302
 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
 3900 3000 4500
 …
 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
 3900 7200 3900 7200 4500 2400 4500 2400 3900
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+3300 2700 2700
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+3300 3900 2700
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+4200 3900 3600
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+2475 5100 1800
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+3300 5100 2700
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+4200 5100 3600
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+3300 6300 2700
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+4200 6300 3600
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+4200 2700 3600
 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+2520 2957 2520
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+3417 2957 3417
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+4350 3000 4350
+2 0 50 -1 0 12 0.0000 2 165 720 2100 4200 Array of\001
+2 0 50 -1 0 12 0.0000 2 150 540 2100 4425 Queues\001
+2 0 50 -1 0 12 0.0000 2 135 630 2100 3075 Threads\001
+2 0 50 -1 0 12 0.0000 2 165 450 2100 2850 Ready\001
+0 0 50 -1 0 11 0.0000 2 135 180 2595 3561 TS\001
+0 0 50 -1 0 11 0.0000 2 135 180 2595 2665 TS\001
+0 0 50 -1 0 11 0.0000 2 135 180 2595 4479 TS\001
+4275 3000 4275
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+4200 4500 3600
+2 0 50 -1 0 12 0.0000 2 180 660 2100 4200 Array of\001
+2 0 50 -1 0 12 0.0000 2 165 600 2100 4425 Queues\001
+2 0 50 -1 0 12 0.0000 2 135 645 2100 3075 Threads\001
+2 0 50 -1 0 12 0.0000 2 180 525 2100 2850 Ready\001
+1 0 50 -1 0 11 0.0000 2 120 210 2700 4450 TS\001

doc/theses/thierry_delisle_PhD/comp_II/img/empty.fig

-              re3bc51c
+              rbcd74f3
 -2
 2
-4800 3075 5400 4200
-1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
-1 1.00 45.00 90.00
-4200 5100 3600
-3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
-3335 5250 3075 4950 3075 4800 3335 4950 3595 5250 3595
-3335
--6
-2400 2175 3000 3375
-3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
-2435 2850 2175 2550 2175 2400 2435 2550 2695 2850 2695
-2435
-1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
-1 1.00 45.00 90.00
-3375 2700 2700
--6
-2400 3075 3000 4200
-3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
-3335 2850 3075 2550 3075 2400 3335 2550 3595 2850 3595
-3335
-1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
-1 1.00 45.00 90.00
-4200 2700 3600
--6
 6750 4125 7050 4275
 3 0 1 0 0 50 -1 20 0.000 1 0.0000 6825 4200 20 20 6825 4200 6845 4200
 …
 3 0 1 0 0 50 -1 20 0.000 1 0.0000 6975 4200 20 20 6975 4200 6995 4200
 -6
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 5100 3300 300 300 5100 3300 5400 3300
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2700 2400 300 300 2700 2400 3000 2400
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2700 3300 300 300 2700 3300 3000 3300
 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
 3900 3000 4500
 …
 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
 3900 7200 3900 7200 4500 2400 4500 2400 3900
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+3300 2700 2700
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+4200 5100 3600
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+4200 2700 3600
 2 0 50 -1 0 12 0.0000 2 180 660 2100 4200 Array of\001
 2 0 50 -1 0 12 0.0000 2 165 600 2100 4425 Queues\001

doc/theses/thierry_delisle_PhD/comp_II/img/emptybit.fig

-              re3bc51c
+              rbcd74f3
 -2
 2
-4800 3075 5400 4200
-1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
-1 1.00 45.00 90.00
-4200 5100 3600
-3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
-3335 5250 3075 4950 3075 4800 3335 4950 3595 5250 3595
-3335
--6
-2400 2175 3000 3375
-3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
-2435 2850 2175 2550 2175 2400 2435 2550 2695 2850 2695
-2435
-1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
-1 1.00 45.00 90.00
-3375 2700 2700
--6
-2400 3075 3000 4200
-3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
-3335 2850 3075 2550 3075 2400 3335 2550 3595 2850 3595
-3335
-1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
-1 1.00 45.00 90.00
-4200 2700 3600
--6
 6750 4125 7050 4275
 3 0 1 0 0 50 -1 20 0.000 1 0.0000 6825 4200 20 20 6825 4200 6845 4200
 …
 3 0 1 0 0 50 -1 20 0.000 1 0.0000 6975 4200 20 20 6975 4200 6995 4200
 -6
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 5100 3300 300 300 5100 3300 5400 3300
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2700 2400 300 300 2700 2400 3000 2400
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2700 3300 300 300 2700 3300 3000 3300
 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
 3900 3000 4500
 …
 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
 3900 7200 3900 7200 4500 2400 4500 2400 3900
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+3300 2700 2700
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+4200 5100 3600
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+4200 2700 3600
+0 0 50 -1 5 14 0.0000 2 180 1800 3750 4800 [1000100...]\001
 2 0 50 -1 0 12 0.0000 2 180 660 2100 4200 Array of\001
 2 0 50 -1 0 12 0.0000 2 165 600 2100 4425 Queues\001
 2 0 50 -1 0 12 0.0000 2 135 645 2100 3075 Threads\001
 2 0 50 -1 0 12 0.0000 2 180 525 2100 2850 Ready\001
-0 0 50 -1 5 14 0.0000 2 180 1800 3750 4800 [1000100...]\001

doc/theses/thierry_delisle_PhD/comp_II/img/emptytls.fig

-              re3bc51c
+              rbcd74f3
 #FIG 3.2  Produced by xfig version 3.2.7a
+#FIG 3.2  Produced by xfig version 3.2.5c
 Landscape
 Center
 Inches
 Letter
+Letter
 .00
 Single
 -2
 2
-4800 3075 5400 4200
-1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
-1 1.00 45.00 90.00
-4200 5100 3600
-3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
-3335 5250 3075 4950 3075 4800 3335 4950 3595 5250 3595
-3335
--6
-2400 2175 3000 3375
-3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
-2435 2850 2175 2550 2175 2400 2435 2550 2695 2850 2695
-2435
-1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
-1 1.00 45.00 90.00
-3375 2700 2700
--6
-2400 3075 3000 4200
-3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
-3335 2850 3075 2550 3075 2400 3335 2550 3595 2850 3595
-3335
-1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
-1 1.00 45.00 90.00
-4200 2700 3600
--6
 6750 4125 7050 4275
 3 0 1 0 0 50 -1 20 0.000 1 0.0000 6825 4200 20 20 6825 4200 6845 4200
 …
 3 0 1 0 0 50 -1 20 0.000 1 0.0000 6975 4200 20 20 6975 4200 6995 4200
 -6
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 5100 3300 300 300 5100 3300 5400 3300
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2700 2400 300 300 2700 2400 3000 2400
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2700 3300 300 300 2700 3300 3000 3300
 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
 3900 3000 4500
 …
 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
 3900 7200 3900 7200 4500 2400 4500 2400 3900
+2 0 50 -1 0 12 0.0000 2 165 720 2100 4200 Array of\001
+2 0 50 -1 0 12 0.0000 2 150 540 2100 4425 Queues\001
+2 0 50 -1 0 12 0.0000 2 135 630 2100 3075 Threads\001
+2 0 50 -1 0 12 0.0000 2 165 450 2100 2850 Ready\001
+0 0 50 -1 5 14 0.0000 2 165 1080 2400 5100 [1000100...]\001
+0 0 50 -1 5 14 0.0000 2 165 1080 4425 5100 [1000100...]\001
+0 0 50 -1 5 14 0.0000 2 165 1080 6450 5100 [1000100...]\001
+0 0 50 -1 0 13 0.0000 2 135 630 1500 5175 Bitmask\001
+0 0 50 -1 0 13 0.0000 2 135 1080 1050 4950 Thread-Local\001
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+3300 2700 2700
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+4200 5100 3600
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+4200 2700 3600
+0 0 50 -1 5 14 0.0000 2 180 1800 2400 5100 [1000100...]\001
+0 0 50 -1 5 14 0.0000 2 180 1800 4425 5100 [1000100...]\001
+0 0 50 -1 5 14 0.0000 2 180 1800 6450 5100 [1000100...]\001
+0 0 50 -1 0 13 0.0000 2 135 690 1500 5175 Bitmask\001
+0 0 50 -1 0 13 0.0000 2 150 1155 1050 4950 Thread-Local\001
+2 0 50 -1 0 12 0.0000 2 180 660 2100 4200 Array of\001
+2 0 50 -1 0 12 0.0000 2 165 600 2100 4425 Queues\001
+2 0 50 -1 0 12 0.0000 2 135 645 2100 3075 Threads\001
+2 0 50 -1 0 12 0.0000 2 180 525 2100 2850 Ready\001

doc/theses/thierry_delisle_PhD/comp_II/img/emptytree.fig

-              re3bc51c
+              rbcd74f3
 -2
 2
-4800 3075 5400 4200
-1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
-1 1.00 45.00 90.00
-4200 5100 3600
-3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
-3335 5250 3075 4950 3075 4800 3335 4950 3595 5250 3595
-3335
--6
-2400 2175 3000 3375
-3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
-2435 2850 2175 2550 2175 2400 2435 2550 2695 2850 2695
-2435
-1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
-1 1.00 45.00 90.00
-3375 2700 2700
--6
-2400 3075 3000 4200
-3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
-3335 2850 3075 2550 3075 2400 3335 2550 3595 2850 3595
-3335
-1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
-1 1.00 45.00 90.00
-4200 2700 3600
--6
 6750 4125 7050 4275
 3 0 1 0 0 50 -1 20 0.000 1 0.0000 6825 4200 20 20 6825 4200 6845 4200
 …
 3 0 1 0 0 50 -1 20 0.000 1 0.0000 6975 4200 20 20 6975 4200 6995 4200
 -6
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+3900 3000 4500
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+3900 3600 4500
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+3900 4200 4500
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+3900 4800 4500
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+3900 5400 4500
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+3900 6000 4500
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+3900 6600 4500
+2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
+3900 7200 3900 7200 4500 2400 4500 2400 3900
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 5100 3300 300 300 5100 3300 5400 3300
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2700 2400 300 300 2700 2400 3000 2400
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2700 3300 300 300 2700 3300 3000 3300
 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
 1 1.00 45.00 90.00
 …
 1 1.00 45.00 90.00
 5400 6300 5100
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+3900 3000 4500
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+3900 3600 4500
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+3900 4200 4500
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+3900 4800 4500
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+3900 5400 4500
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+3900 6000 4500
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+3900 6600 4500
+2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
+3900 7200 3900 7200 4500 2400 4500 2400 3900
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+3300 2700 2700
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+4200 5100 3600
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+4200 2700 3600
+1 0 50 -1 0 12 0.0000 2 135 135 3300 4725 X\001
+1 0 50 -1 0 12 0.0000 2 135 135 3900 5025 X\001
+1 0 50 -1 0 12 0.0000 2 135 135 5700 4725 X\001
+1 0 50 -1 0 12 0.0000 2 135 135 6300 5025 X\001
 2 0 50 -1 0 12 0.0000 2 180 660 2100 4200 Array of\001
 2 0 50 -1 0 12 0.0000 2 165 600 2100 4425 Queues\001
 2 0 50 -1 0 12 0.0000 2 135 645 2100 3075 Threads\001
 2 0 50 -1 0 12 0.0000 2 180 525 2100 2850 Ready\001
-1 0 50 -1 0 12 0.0000 2 135 135 3300 4725 X\001
-1 0 50 -1 0 12 0.0000 2 135 135 3900 5025 X\001
-1 0 50 -1 0 12 0.0000 2 135 135 5700 4725 X\001
-1 0 50 -1 0 12 0.0000 2 135 135 6300 5025 X\001

doc/theses/thierry_delisle_PhD/comp_II/img/resize.fig

-              re3bc51c
+              rbcd74f3
 #FIG 3.2  Produced by xfig version 3.2.7a
+#FIG 3.2  Produced by xfig version 3.2.5c
 Landscape
 Center
 Inches
 Letter
+Letter
 .00
 Single
 -2
 2
+2400 3075 3000 4200
+3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
+3335 2850 3075 2550 3075 2400 3335 2550 3595 2850 3595
+3335
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+7500 3675 8475 4500
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
 1 1.00 45.00 90.00
-4200 2700 3600
--6
-2400 2175 3000 3375
-3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
-2435 2850 2175 2550 2175 2400 2435 2550 2695 2850 2695
-2435
-1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
 1 1.00 45.00 90.00
+3375 2700 2700
+-6
+3600 2175 4200 3375
+3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
+2435 4050 2175 3750 2175 3600 2435 3750 2695 4050 2695
+2435
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+3375 3900 2700
+-6
+3600 3075 4200 4200
+3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
+3335 4050 3075 3750 3075 3600 3335 3750 3595 4050 3595
+3335
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+4200 3900 3600
+-6
+4200 3075 4800 4200
+3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
+3335 4650 3075 4350 3075 4200 3335 4350 3595 4650 3595
+3335
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+4200 4500 3600
+-6
+4800 3075 5400 4200
+3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
+3335 5250 3075 4950 3075 4800 3335 4950 3595 5250 3595
+3335
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+4200 5100 3600
+-6
+4800 2175 5400 3375
+3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
+2435 5250 2175 4950 2175 4800 2435 4950 2695 5250 2695
+2435
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+3375 5100 2700
+-6
+4800 1275 5400 2475
+3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
+1535 5250 1275 4950 1275 4800 1535 4950 1795 5250 1795
+1535
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+2475 5100 1800
+-6
+6000 2175 6600 3375
+3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
+2435 6450 2175 6150 2175 6000 2435 6150 2695 6450 2695
+2435
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+3375 6300 2700
+-6
+6000 3075 6600 4200
+3 0 1 0 7 50 -1 -1 0.000 0 0 0 0 0 7
+3335 6450 3075 6150 3075 6000 3335 6150 3595 6450 3595
+3335
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+4200 6300 3600
+4200 7950 4200
+0 0 50 -1 0 12 0.0000 2 135 915 7500 3825 Grows with\001
+0 0 50 -1 0 12 0.0000 2 135 840 7500 4050 additional\001
+0 0 50 -1 0 12 0.0000 2 135 840 7500 4425 processors\001
 -6
 6750 4125 7050 4275
 …
 3 0 1 0 0 50 -1 20 0.000 1 0.0000 6975 4200 20 20 6975 4200 6995 4200
 -6
+7500 3675 8475 4500
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
+1 1.00 45.00 90.00
+1 1.00 45.00 90.00
+4200 7950 4200
+0 0 50 -1 0 12 0.0000 2 135 900 7500 3825 Grows with\001
+0 0 50 -1 0 12 0.0000 2 135 900 7500 4050 additional\001
+0 0 50 -1 0 12 0.0000 2 120 900 7500 4425 processors\001
+-6
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3900 2400 300 300 3900 2400 4200 2400
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3900 3300 300 300 3900 3300 4200 3300
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 5100 1500 300 300 5100 1500 5400 1500
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 5100 2400 300 300 5100 2400 5400 2400
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 5100 3300 300 300 5100 3300 5400 3300
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 6300 2400 300 300 6300 2400 6600 2400
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 6300 3300 300 300 6300 3300 6600 3300
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 4509 3302 300 300 4509 3302 4809 3302
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2700 2400 300 300 2700 2400 3000 2400
+3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2700 3300 300 300 2700 3300 3000 3300
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 3
+1 1.00 60.00 120.00
+5550 2400 5550 2400 4500
+2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
+5100 4500 5100 4500 6000 3600 6000 3600 5100
 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
 3900 3000 4500
 …
 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
 3900 7200 3900 7200 4500 2400 4500 2400 3900
+2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
+5100 4500 5100 4500 6300 3600 6300 3600 5100
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 3
+1 1.00 60.00 120.00
+5550 2400 5550 2400 4500
+2 0 50 -1 0 12 0.0000 2 165 720 2100 4200 Array of\001
+2 0 50 -1 0 12 0.0000 2 150 540 2100 4425 Queues\001
+2 0 50 -1 0 12 0.0000 2 135 630 2100 3075 Threads\001
+2 0 50 -1 0 12 0.0000 2 165 450 2100 2850 Ready\001
+0 0 50 -1 0 13 0.0000 2 135 1980 3600 5025 Cluster Data Strcuture\001
+0 0 50 -1 0 13 0.0000 2 165 1170 2400 5775 Array Pointer\001
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+3300 2700 2700
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+3300 3900 2700
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+4200 3900 3600
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+2475 5100 1800
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+3300 5100 2700
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+4200 5100 3600
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+3300 6300 2700
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+4200 6300 3600
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+4200 2700 3600
+1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2
+1 1.00 45.00 90.00
+4200 4500 3600
+0 0 50 -1 0 13 0.0000 2 180 1170 2400 5775 Array Pointer\001
+2 0 50 -1 0 12 0.0000 2 180 660 2100 4200 Array of\001
+2 0 50 -1 0 12 0.0000 2 165 600 2100 4425 Queues\001
+2 0 50 -1 0 12 0.0000 2 135 645 2100 3075 Threads\001
+2 0 50 -1 0 12 0.0000 2 180 525 2100 2850 Ready\001
+1 0 50 -1 0 13 0.0000 2 150 1890 4050 5025 Cluster Data Structure\001

doc/theses/thierry_delisle_PhD/comp_II/local.bib

-              re3bc51c
+              rbcd74f3
 @article{finkel1987dib,
   title={DIB—a distributed implementation of backtracking},
+  title={DIB-a distributed implementation of backtracking},
   author={Finkel, Raphael and Manber, Udi},
   journal={ACM Transactions on Programming Languages and Systems (TOPLAS)},
 …
 % ===============================================================================
+% MISC
+% ===============================================================================
+% Algorithms
+% ===============================================================================
+@article{michael2004hazard,
+  title={Hazard pointers: Safe memory reclamation for lock-free objects},
+  author={Michael, Maged M},
+  journal={IEEE Transactions on Parallel and Distributed Systems},
+  volume={15},
+  number={6},
+  pages={491--504},
+  year={2004},
+  publisher={IEEE}
+}
+@inproceedings{brown2015reclaiming,
+  title={Reclaiming memory for lock-free data structures: There has to be a better way},
+  author={Brown, Trevor Alexander},
+  booktitle={Proceedings of the 2015 ACM Symposium on Principles of Distributed Computing},
+  pages={261--270},
+  year={2015}
+}
 % Trevor's relaxed FIFO list
 @inproceedings{alistarh2018relaxed,
 …
   year={2007}
+}
+% ===============================================================================
+% Linux Man Pages
+% ===============================================================================
+@manual{open,
+  key        = "open",
+  title      = "open(2) Linux User's Manual",
+  year       = "2020",
+  month      = "February",
+}
+@manual{epoll,
+  key        = "epoll",
+  title      = "epoll(7) Linux User's Manual",
+  year       = "2019",
+  month      = "March",
+}
+@manual{select,
+  key        = "select",
+  title      = "select(7) Linux User's Manual",
+  year       = "2019",
+  month      = "March",
+}
+@misc{io_uring,
+  title   = {Efficient IO with io\_uring},
+  author  = {Axboe, Jens},
+  year    = "2019",
+  month   = "March",
+  version = {0,4},
+  howpublished = {\url{https://kernel.dk/io_uring.pdf}}
+}
+@misc{libuv,
+  key   = "libuv",
+  title = {libuv},
+  howpublished = {\url{https://github.com/libuv/libuv}}
+}
+% ===============================================================================
+% MISC
+% ===============================================================================
+@misc{nginx-design,
+  key   = "nginx",
+  title={Inside {NGINX}: How We Designed for Performance \& Scale},
+  howpublished= {\href{https://www.nginx.com/blog/inside-nginx-how-we-designed-for-performance-scale}
+                {https://\-www.nginx.com/\-blog/\-inside\--nginx\--how\--we\--designed\--for\--performance\--scale}},
+}
+@article{schillings1996engineering,
+  title={Be engineering insights: Benaphores},
+  author={Schillings, Benoit},
+  journal={Be Newsletters},
+  volume={1},
+  number={26},
+  year={1996}
+}
+@misc{wiki:thunderherd,
+   author = "{Wikipedia contributors}",
+   title = "Thundering herd problem --- {W}ikipedia{,} The Free Encyclopedia",
+   year = "2020",
+   howpublished = {\href{https://en.wikipedia.org/wiki/Thundering_herd_problem}
+                  {https://\-en.wikipedia.org/\-wiki/\-Thundering\_herd\_problem}},},
+   note = "[Online; accessed 14-April-2020]"
+}
+@misc{wiki:ubuntu-linux,
+   author = "{Wikipedia contributors}",
+   title = "Ubuntu version history : Table of versions --- {W}ikipedia{,} The Free Encyclopedia",
+   year = "2020",
+   howpublished = {\href{https://en.wikipedia.org/wiki/Ubuntu_version_history\#Table_of_versions}
+                  {https://\-en.wikipedia.org/\-wiki/\-Ubuntu\_version\_history\#Table\_of\_versions}},
+   note = "[Online; accessed 15-April-2020]"
+}

driver/cfa.cc

-              re3bc51c
+              rbcd74f3
         } // if
+        string preludedir;
         switch(path) {
         case Installed   : Putenv( argv, "--prelude-dir=" + libdir ); break;
         case BuildTree   : Putenv( argv, "--prelude-dir=" + libdir + "/prelude" ); break;
         case Distributed : Putenv( argv, "--prelude-dir=" + dir(argv[0]) ); break;
+        case Installed   : preludedir = libdir; break;
+        case BuildTree   : preludedir = libdir + "/prelude"; break;
+        case Distributed : preludedir = dir(argv[0]); break;
+        }
+        Putenv( argv, "--prelude-dir=" + preludedir );
+        args[nargs++] = "-include";
+        args[nargs++] = (*new string(preludedir + "/defines.hfa")).c_str();
         for ( int i = 0; i < nlibs; i += 1 ) {                          // copy non-user libraries after all user libraries

libcfa/Makefile.in

re3bc51c	rbcd74f3
106	106	configure.lineno config.status.lineno
107	107	mkinstalldirs = $(install_sh) -d
	108	CONFIG_HEADER = $(top_builddir)/prelude/defines.hfa
108	109	CONFIG_CLEAN_FILES =
109	110	CONFIG_CLEAN_VPATH_FILES =

libcfa/configure

-              re3bc51c
+              rbcd74f3
 enable_distcc
 with_cfa_name
+enable_static
 enable_shared
-enable_static
 with_pic
 enable_fast_install
 …
   --disable-silent-rules  verbose build output (undo: "make V=0")
   --enable-distcc     whether or not to enable distributed compilation
+  --enable-static[=PKGS]  build static libraries [default=no]
   --enable-shared[=PKGS]  build shared libraries [default=yes]
-  --enable-static[=PKGS]  build static libraries [default=yes]
   --enable-fast-install[=PKGS]
                           optimize for fast installation [default=yes]
 …
 } # ac_fn_cxx_try_link
+# ac_fn_c_check_header_mongrel LINENO HEADER VAR INCLUDES
+# -------------------------------------------------------
+# Tests whether HEADER exists, giving a warning if it cannot be compiled using
+# the include files in INCLUDES and setting the cache variable VAR
+# accordingly.
+ac_fn_c_check_header_mongrel ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  if eval \${$3+:} false; then :
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
+$as_echo_n "checking for $2... " >&6; }
+if eval \${$3+:} false; then :
+  $as_echo_n "(cached) " >&6
+fi
+eval ac_res=\$$3
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+else
+  # Is the header compilable?
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking $2 usability" >&5
+$as_echo_n "checking $2 usability... " >&6; }
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$4
+#include <$2>
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  ac_header_compiler=yes
+else
+  ac_header_compiler=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_header_compiler" >&5
+$as_echo "$ac_header_compiler" >&6; }
+# Is the header present?
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking $2 presence" >&5
+$as_echo_n "checking $2 presence... " >&6; }
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <$2>
+_ACEOF
+if ac_fn_c_try_cpp "$LINENO"; then :
+  ac_header_preproc=yes
+else
+  ac_header_preproc=no
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_header_preproc" >&5
+$as_echo "$ac_header_preproc" >&6; }
+# So?  What about this header?
+case $ac_header_compiler:$ac_header_preproc:$ac_c_preproc_warn_flag in #((
+  yes:no: )
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: accepted by the compiler, rejected by the preprocessor!" >&5
+$as_echo "$as_me: WARNING: $2: accepted by the compiler, rejected by the preprocessor!" >&2;}
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: proceeding with the compiler's result" >&5
+$as_echo "$as_me: WARNING: $2: proceeding with the compiler's result" >&2;}
+    ;;
+  no:yes:* )
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: present but cannot be compiled" >&5
+$as_echo "$as_me: WARNING: $2: present but cannot be compiled" >&2;}
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2:     check for missing prerequisite headers?" >&5
+$as_echo "$as_me: WARNING: $2:     check for missing prerequisite headers?" >&2;}
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: see the Autoconf documentation" >&5
+$as_echo "$as_me: WARNING: $2: see the Autoconf documentation" >&2;}
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2:     section \"Present But Cannot Be Compiled\"" >&5
+$as_echo "$as_me: WARNING: $2:     section \"Present But Cannot Be Compiled\"" >&2;}
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: proceeding with the compiler's result" >&5
+$as_echo "$as_me: WARNING: $2: proceeding with the compiler's result" >&2;}
+( $as_echo "## --------------------------------------- ##
+## Report this to cforall@plg.uwaterloo.ca ##
+## --------------------------------------- ##"
+     ) | sed "s/^/$as_me: WARNING:     /" >&2
+    ;;
+esac
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
+$as_echo_n "checking for $2... " >&6; }
+if eval \${$3+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  eval "$3=\$ac_header_compiler"
+fi
+eval ac_res=\$$3
+               { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+fi
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+} # ac_fn_c_check_header_mongrel
 cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 …
 # Set options
+# Check whether --enable-static was given.
+if test "${enable_static+set}" = set; then :
+  enableval=$enable_static; p=${PACKAGE-default}
+    case $enableval in
+    yes) enable_static=yes ;;
+    no) enable_static=no ;;
+    *)
+     enable_static=no
+      # Look at the argument we got.  We use all the common list separators.
+      lt_save_ifs=$IFS; IFS=$IFS$PATH_SEPARATOR,
+      for pkg in $enableval; do
+        IFS=$lt_save_ifs
+        if test "X$pkg" = "X$p"; then
+          enable_static=yes
+        fi
+      done
+      IFS=$lt_save_ifs
+      ;;
+    esac
+else
+  enable_static=no
+fi
 …
 fi
-  # Check whether --enable-static was given.
-if test "${enable_static+set}" = set; then :
-  enableval=$enable_static; p=${PACKAGE-default}
-    case $enableval in
-    yes) enable_static=yes ;;
-    no) enable_static=no ;;
-    *)
-     enable_static=no
-      # Look at the argument we got.  We use all the common list separators.
-      lt_save_ifs=$IFS; IFS=$IFS$PATH_SEPARATOR,
-      for pkg in $enableval; do
-        IFS=$lt_save_ifs
-        if test "X$pkg" = "X$p"; then
-          enable_static=yes
-        fi
-      done
-      IFS=$lt_save_ifs
-      ;;
-    esac
-else
-  enable_static=yes
-fi
 …
+for ac_header in linux/io_uring.h
+do :
+  ac_fn_c_check_header_mongrel "$LINENO" "linux/io_uring.h" "ac_cv_header_linux_io_uring_h" "$ac_includes_default"
+if test "x$ac_cv_header_linux_io_uring_h" = xyes; then :
+  cat >>confdefs.h <<_ACEOF
+#define HAVE_LINUX_IO_URING_H 1
+_ACEOF
+fi
+done
+for ac_func in preadv2 pwritev2
+do :
+  as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
+ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
+if eval test \"x\$"$as_ac_var"\" = x"yes"; then :
+  cat >>confdefs.h <<_ACEOF
+#define `$as_echo "HAVE_$ac_func" | $as_tr_cpp` 1
+_ACEOF
+fi
+done
 ac_config_files="$ac_config_files Makefile src/Makefile prelude/Makefile"
+ac_config_headers="$ac_config_headers prelude/defines.hfa"
 …
 test "x$exec_prefix" = xNONE && exec_prefix='${prefix}'
+# Transform confdefs.h into DEFS.
+# Protect against shell expansion while executing Makefile rules.
+# Protect against Makefile macro expansion.
+#
+# If the first sed substitution is executed (which looks for macros that
+# take arguments), then branch to the quote section.  Otherwise,
+# look for a macro that doesn't take arguments.
+ac_script='
+:mline
+/\\$/{
+ N
+ s,\\\n,,
+ b mline
+}
+t clear
+:clear
+s/^[     ]*#[    ]*define[       ][      ]*\([^  (][^    (]*([^)]*)\)[   ]*\(.*\)/-D\1=\2/g
+t quote
+s/^[     ]*#[    ]*define[       ][      ]*\([^  ][^     ]*\)[   ]*\(.*\)/-D\1=\2/g
+t quote
+b any
+:quote
+s/[      `~#$^&*(){}\\|;'\''"<>?]/\\&/g
+s/\[/\\&/g
+s/\]/\\&/g
+s/\$/$$/g
+H
+:any
+${
+        g
+        s/^\n//
+        s/\n/ /g
+        p
+}
+'
+DEFS=`sed -n "$ac_script" confdefs.h`
+DEFS=-DHAVE_CONFIG_H
 ac_libobjs=
 …
 esac
+case $ac_config_headers in *"
+"*) set x $ac_config_headers; shift; ac_config_headers=$*;;
+esac
 …
 # Files that config.status was made for.
 config_files="$ac_config_files"
+config_headers="$ac_config_headers"
 config_commands="$ac_config_commands"
 …
       --file=FILE[:TEMPLATE]
                    instantiate the configuration file FILE
+      --header=FILE[:TEMPLATE]
+                   instantiate the configuration header FILE
 Configuration files:
 $config_files
+Configuration headers:
+$config_headers
 Configuration commands:
 …
     as_fn_append CONFIG_FILES " '$ac_optarg'"
     ac_need_defaults=false;;
+  --he | --h |  --help | --hel | -h )
+  --header | --heade | --head | --hea )
+    $ac_shift
+    case $ac_optarg in
+    *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;;
+    esac
+    as_fn_append CONFIG_HEADERS " '$ac_optarg'"
+    ac_need_defaults=false;;
+  --he | --h)
+    # Conflict between --help and --header
+    as_fn_error $? "ambiguous option: \`$1'
+Try \`$0 --help' for more information.";;
+  --help | --hel | -h )
     $as_echo "$ac_cs_usage"; exit ;;
   -q | -quiet | --quiet | --quie | --qui | --qu | --q \
 …
 macro_version='`$ECHO "$macro_version" | $SED "$delay_single_quote_subst"`'
 macro_revision='`$ECHO "$macro_revision" | $SED "$delay_single_quote_subst"`'
+enable_static='`$ECHO "$enable_static" | $SED "$delay_single_quote_subst"`'
 enable_shared='`$ECHO "$enable_shared" | $SED "$delay_single_quote_subst"`'
-enable_static='`$ECHO "$enable_static" | $SED "$delay_single_quote_subst"`'
 pic_mode='`$ECHO "$pic_mode" | $SED "$delay_single_quote_subst"`'
 enable_fast_install='`$ECHO "$enable_fast_install" | $SED "$delay_single_quote_subst"`'
 …
     "src/Makefile") CONFIG_FILES="$CONFIG_FILES src/Makefile" ;;
     "prelude/Makefile") CONFIG_FILES="$CONFIG_FILES prelude/Makefile" ;;
+    "prelude/defines.hfa") CONFIG_HEADERS="$CONFIG_HEADERS prelude/defines.hfa" ;;
   *) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;;
 …
 if $ac_need_defaults; then
   test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files
+  test "${CONFIG_HEADERS+set}" = set || CONFIG_HEADERS=$config_headers
   test "${CONFIG_COMMANDS+set}" = set || CONFIG_COMMANDS=$config_commands
 fi
 …
 fi # test -n "$CONFIG_FILES"
+eval set X "  :F $CONFIG_FILES      :C $CONFIG_COMMANDS"
+# Set up the scripts for CONFIG_HEADERS section.
+# No need to generate them if there are no CONFIG_HEADERS.
+# This happens for instance with `./config.status Makefile'.
+if test -n "$CONFIG_HEADERS"; then
+cat >"$ac_tmp/defines.awk" <<\_ACAWK ||
+BEGIN {
+_ACEOF
+# Transform confdefs.h into an awk script `defines.awk', embedded as
+# here-document in config.status, that substitutes the proper values into
+# config.h.in to produce config.h.
+# Create a delimiter string that does not exist in confdefs.h, to ease
+# handling of long lines.
+ac_delim='%!_!# '
+for ac_last_try in false false :; do
+  ac_tt=`sed -n "/$ac_delim/p" confdefs.h`
+  if test -z "$ac_tt"; then
+    break
+  elif $ac_last_try; then
+    as_fn_error $? "could not make $CONFIG_HEADERS" "$LINENO" 5
+  else
+    ac_delim="$ac_delim!$ac_delim _$ac_delim!! "
+  fi
+done
+# For the awk script, D is an array of macro values keyed by name,
+# likewise P contains macro parameters if any.  Preserve backslash
+# newline sequences.
+ac_word_re=[_$as_cr_Letters][_$as_cr_alnum]*
+sed -n '
+s/.\{148\}/&'"$ac_delim"'/g
+t rset
+:rset
+s/^[     ]*#[    ]*define[       ][      ]*/ /
+t def
+d
+:def
+s/\\$//
+t bsnl
+s/["\\]/\\&/g
+s/^ \('"$ac_word_re"'\)\(([^()]*)\)[     ]*\(.*\)/P["\1"]="\2"\
+D["\1"]=" \3"/p
+s/^ \('"$ac_word_re"'\)[         ]*\(.*\)/D["\1"]=" \2"/p
+d
+:bsnl
+s/["\\]/\\&/g
+s/^ \('"$ac_word_re"'\)\(([^()]*)\)[     ]*\(.*\)/P["\1"]="\2"\
+D["\1"]=" \3\\\\\\n"\\/p
+t cont
+s/^ \('"$ac_word_re"'\)[         ]*\(.*\)/D["\1"]=" \2\\\\\\n"\\/p
+t cont
+d
+:cont
+n
+s/.\{148\}/&'"$ac_delim"'/g
+t clear
+:clear
+s/\\$//
+t bsnlc
+s/["\\]/\\&/g; s/^/"/; s/$/"/p
+d
+:bsnlc
+s/["\\]/\\&/g; s/^/"/; s/$/\\\\\\n"\\/p
+b cont
+' <confdefs.h | sed '
+s/'"$ac_delim"'/"\\\
+"/g' >>$CONFIG_STATUS || ac_write_fail=1
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+  for (key in D) D_is_set[key] = 1
+  FS = ""
+}
+/^[\t ]*#[\t ]*(define|undef)[\t ]+$ac_word_re([\t (]|\$)/ {
+  line = \$ 0
+  split(line, arg, " ")
+  if (arg[1] == "#") {
+    defundef = arg[2]
+    mac1 = arg[3]
+  } else {
+    defundef = substr(arg[1], 2)
+    mac1 = arg[2]
+  }
+  split(mac1, mac2, "(") #)
+  macro = mac2[1]
+  prefix = substr(line, 1, index(line, defundef) - 1)
+  if (D_is_set[macro]) {
+    # Preserve the white space surrounding the "#".
+    print prefix "define", macro P[macro] D[macro]
+    next
+  } else {
+    # Replace #undef with comments.  This is necessary, for example,
+    # in the case of _POSIX_SOURCE, which is predefined and required
+    # on some systems where configure will not decide to define it.
+    if (defundef == "undef") {
+      print "/*", prefix defundef, macro, "*/"
+      next
+    }
+  }
+}
+{ print }
+_ACAWK
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+  as_fn_error $? "could not setup config headers machinery" "$LINENO" 5
+fi # test -n "$CONFIG_HEADERS"
+eval set X "  :F $CONFIG_FILES  :H $CONFIG_HEADERS    :C $CONFIG_COMMANDS"
 shift
 for ac_tag
 …
   || as_fn_error $? "could not create $ac_file" "$LINENO" 5
  ;;
+  :H)
+  #
+  # CONFIG_HEADER
+  #
+  if test x"$ac_file" != x-; then
+    {
+      $as_echo "/* $configure_input  */" \
+      && eval '$AWK -f "$ac_tmp/defines.awk"' "$ac_file_inputs"
+    } >"$ac_tmp/config.h" \
+      || as_fn_error $? "could not create $ac_file" "$LINENO" 5
+    if diff "$ac_file" "$ac_tmp/config.h" >/dev/null 2>&1; then
+      { $as_echo "$as_me:${as_lineno-$LINENO}: $ac_file is unchanged" >&5
+$as_echo "$as_me: $ac_file is unchanged" >&6;}
+    else
+      rm -f "$ac_file"
+      mv "$ac_tmp/config.h" "$ac_file" \
+        || as_fn_error $? "could not create $ac_file" "$LINENO" 5
+    fi
+  else
+    $as_echo "/* $configure_input  */" \
+      && eval '$AWK -f "$ac_tmp/defines.awk"' "$ac_file_inputs" \
+      || as_fn_error $? "could not create -" "$LINENO" 5
+  fi
+# Compute "$ac_file"'s index in $config_headers.
+_am_arg="$ac_file"
+_am_stamp_count=1
+for _am_header in $config_headers :; do
+  case $_am_header in
+    $_am_arg | $_am_arg:* )
+      break ;;
+    * )
+      _am_stamp_count=`expr $_am_stamp_count + 1` ;;
+  esac
+done
+echo "timestamp for $_am_arg" >`$as_dirname -- "$_am_arg" ||
+$as_expr X"$_am_arg" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+         X"$_am_arg" : 'X\(//\)[^/]' \| \
+         X"$_am_arg" : 'X\(//\)$' \| \
+         X"$_am_arg" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$_am_arg" |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+            s//\1/
+            q
+          }
+          /^X\(\/\/\)[^/].*/{
+            s//\1/
+            q
+          }
+          /^X\(\/\/\)$/{
+            s//\1/
+            q
+          }
+          /^X\(\/\).*/{
+            s//\1/
+            q
+          }
+          s/.*/./; q'`/stamp-h$_am_stamp_count
+ ;;
   :C)  { $as_echo "$as_me:${as_lineno-$LINENO}: executing $ac_file commands" >&5
 …
 macro_revision=$macro_revision
+# Whether or not to build static libraries.
+build_old_libs=$enable_static
 # Whether or not to build shared libraries.
 build_libtool_libs=$enable_shared
-# Whether or not to build static libraries.
-build_old_libs=$enable_static
 # What type of objects to build.

libcfa/configure.ac

-              re3bc51c
+              rbcd74f3
 # Checks for programs.
 LT_INIT
+LT_INIT([disable-static])
 AC_PROG_CXX
 …
 AC_PROG_MAKE_SET
+AC_CHECK_HEADERS([linux/io_uring.h])
+AC_CHECK_FUNCS([preadv2 pwritev2])
 AC_CONFIG_FILES([
         Makefile
 …
         ])
+AC_CONFIG_HEADERS(prelude/defines.hfa)
 AC_OUTPUT()

libcfa/prelude/Makefile.am

re3bc51c	rbcd74f3
21	21	# put into lib for now
22	22	cfalibdir = ${CFA_LIBDIR}
23		cfalib_DATA = gcc-builtins.cf builtins.cf extras.cf prelude.cfa bootloader.c
	23	cfalib_DATA = gcc-builtins.cf builtins.cf extras.cf prelude.cfa bootloader.c defines.hfa
24	24
25	25	CC = @LOCAL_CFACC@

libcfa/prelude/Makefile.in

-              re3bc51c
+              rbcd74f3
 # Makefile.in generated by automake 1.16.1 from Makefile.am.
+# Makefile.in generated by automake 1.15 from Makefile.am.
 # @configure_input@
 # Copyright (C) 1994-2018 Free Software Foundation, Inc.
+# Copyright (C) 1994-2014 Free Software Foundation, Inc.
 # This Makefile.in is free software; the Free Software Foundation
 …
 DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
 mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = defines.hfa
 CONFIG_CLEAN_FILES =
 CONFIG_CLEAN_VPATH_FILES =
 …
 am__installdirs = "$(DESTDIR)$(cfalibdir)"
 DATA = $(cfalib_DATA)
+am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
+am__DIST_COMMON = $(srcdir)/Makefile.in
+am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) \
+        $(LISP)defines.hfa.in
+# Read a list of newline-separated strings from the standard input,
+# and print each of them once, without duplicates.  Input order is
+# *not* preserved.
+am__uniquify_input = $(AWK) '\
+  BEGIN { nonempty = 0; } \
+  { items[$$0] = 1; nonempty = 1; } \
+  END { if (nonempty) { for (i in items) print i; }; } \
+'
+# Make sure the list of sources is unique.  This is necessary because,
+# e.g., the same source file might be shared among _SOURCES variables
+# for different programs/libraries.
+am__define_uniq_tagged_files = \
+  list='$(am__tagged_files)'; \
+  unique=`for i in $$list; do \
+    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+  done | $(am__uniquify_input)`
+ETAGS = etags
+CTAGS = ctags
+am__DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/defines.hfa.in
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 ACLOCAL = @ACLOCAL@
 …
 # put into lib for now
 cfalibdir = ${CFA_LIBDIR}
 cfalib_DATA = gcc-builtins.cf builtins.cf extras.cf prelude.cfa bootloader.c
+cfalib_DATA = gcc-builtins.cf builtins.cf extras.cf prelude.cfa bootloader.c defines.hfa
 AM_CFLAGS = -g -Wall -Wno-unused-function -fPIC @ARCH_FLAGS@ @CONFIG_CFLAGS@
 AM_CFAFLAGS = @CONFIG_CFAFLAGS@
 MOSTLYCLEANFILES = bootloader.c builtins.cf extras.cf gcc-builtins.c gcc-builtins.cf prelude.cfa
 MAINTAINERCLEANFILES = ${addprefix ${libdir}/,${cfalib_DATA}} ${addprefix ${libdir}/,${lib_LIBRARIES}}
+all: all-am
+all: defines.hfa
+        $(MAKE) $(AM_MAKEFLAGS) all-am
 .SUFFIXES:
 …
             cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
           *) \
             echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \
             cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \
+            echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \
+            cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \
         esac;
 …
         cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
 $(am__aclocal_m4_deps):
+defines.hfa: stamp-h1
+        @test -f $@ || rm -f stamp-h1
+        @test -f $@ || $(MAKE) $(AM_MAKEFLAGS) stamp-h1
+stamp-h1: $(srcdir)/defines.hfa.in $(top_builddir)/config.status
+        @rm -f stamp-h1
+        cd $(top_builddir) && $(SHELL) ./config.status prelude/defines.hfa
+$(srcdir)/defines.hfa.in:  $(am__configure_deps)
+        ($(am__cd) $(top_srcdir) && $(AUTOHEADER))
+        rm -f stamp-h1
+        touch $@
+distclean-hdr:
+        -rm -f defines.hfa stamp-h1
 mostlyclean-libtool:
 …
         files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
         dir='$(DESTDIR)$(cfalibdir)'; $(am__uninstall_files_from_dir)
+tags TAGS:
+ctags CTAGS:
+cscope cscopelist:
+distdir: $(BUILT_SOURCES)
+        $(MAKE) $(AM_MAKEFLAGS) distdir-am
+distdir-am: $(DISTFILES)
+ID: $(am__tagged_files)
+        $(am__define_uniq_tagged_files); mkid -fID $$unique
+tags: tags-am
+TAGS: tags
+tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
+        set x; \
+        here=`pwd`; \
+        $(am__define_uniq_tagged_files); \
+        shift; \
+        if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
+          test -n "$$unique" || unique=$$empty_fix; \
+          if test $$# -gt 0; then \
+            $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+              "$$@" $$unique; \
+          else \
+            $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
+              $$unique; \
+          fi; \
+        fi
+ctags: ctags-am
+CTAGS: ctags
+ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
+        $(am__define_uniq_tagged_files); \
+        test -z "$(CTAGS_ARGS)$$unique" \
+          || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
+             $$unique
+GTAGS:
+        here=`$(am__cd) $(top_builddir) && pwd` \
+          && $(am__cd) $(top_srcdir) \
+          && gtags -i $(GTAGS_ARGS) "$$here"
+cscopelist: cscopelist-am
+cscopelist-am: $(am__tagged_files)
+        list='$(am__tagged_files)'; \
+        case "$(srcdir)" in \
+          [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
+          *) sdir=$(subdir)/$(srcdir) ;; \
+        esac; \
+        for i in $$list; do \
+          if test -f "$$i"; then \
+            echo "$(subdir)/$$i"; \
+          else \
+            echo "$$sdir/$$i"; \
+          fi; \
+        done >> $(top_builddir)/cscope.files
+distclean-tags:
+        -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
+distdir: $(DISTFILES)
         @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
         topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
 …
 check-am: all-am
 check: check-am
 all-am: Makefile $(DATA)
+all-am: Makefile $(DATA) defines.hfa
 installdirs:
         for dir in "$(DESTDIR)$(cfalibdir)"; do \
 …
 distclean: distclean-am
         -rm -f Makefile
 distclean-am: clean-am distclean-generic
+distclean-am: clean-am distclean-generic distclean-hdr distclean-tags
 dvi: dvi-am
 …
 uninstall-am: uninstall-cfalibDATA
+.MAKE: install-am install-strip
+.PHONY: all all-am check check-am clean clean-generic clean-libtool \
+        cscopelist-am ctags-am distclean distclean-generic \
+        distclean-libtool distdir dvi dvi-am html html-am info info-am \
+.MAKE: all install-am install-strip
+.PHONY: CTAGS GTAGS TAGS all all-am check check-am clean clean-generic \
+        clean-libtool cscopelist-am ctags ctags-am distclean \
+        distclean-generic distclean-hdr distclean-libtool \
+        distclean-tags distdir dvi dvi-am html html-am info info-am \
         install install-am install-cfalibDATA install-data \
         install-data-am install-dvi install-dvi-am install-exec \
 …
         maintainer-clean-generic maintainer-clean-local mostlyclean \
         mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
         tags-am uninstall uninstall-am uninstall-cfalibDATA
+        tags tags-am uninstall uninstall-am uninstall-cfalibDATA
 .PRECIOUS: Makefile

libcfa/src/Makefile.am

-              re3bc51c
+              rbcd74f3
 # The built sources must not depend on the installed headers
 AM_CFAFLAGS = -quiet -cfalib -I$(srcdir)/stdhdr $(if $(findstring ${gdbwaittarget}, ${@}), -XCFA --gdb) @CONFIG_CFAFLAGS@
 AM_CFLAGS = -g -Wall -Wno-unused-function -fPIC -pthread @ARCH_FLAGS@ @CONFIG_CFLAGS@
+AM_CFLAGS = -g -Wall -Wno-unused-function -fPIC -fexceptions -pthread @ARCH_FLAGS@ @CONFIG_CFLAGS@
 AM_CCASFLAGS = -g -Wall -Wno-unused-function @ARCH_FLAGS@ @CONFIG_CFLAGS@
 CFACC = @CFACC@
 …
 #----------------------------------------------------------------------------------------------------------------
 if BUILDLIB
 headers_nosrc = bitmanip.hfa math.hfa gmp.hfa time_t.hfa bits/align.hfa bits/containers.hfa bits/defs.hfa bits/debug.hfa bits/locks.hfa
+headers_nosrc = bitmanip.hfa math.hfa gmp.hfa time_t.hfa bits/align.hfa bits/containers.hfa bits/defs.hfa bits/debug.hfa bits/locks.hfa containers/list.hfa
 headers = fstream.hfa iostream.hfa iterator.hfa limits.hfa rational.hfa time.hfa stdlib.hfa common.hfa \
           containers/maybe.hfa containers/pair.hfa containers/result.hfa containers/vector.hfa
 …
 thread_headers_nosrc = concurrency/invoke.h
 thread_headers = concurrency/coroutine.hfa concurrency/thread.hfa concurrency/kernel.hfa concurrency/monitor.hfa concurrency/mutex.hfa
 thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa ${thread_headers:.hfa=.cfa}
+thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/io.cfa concurrency/preemption.cfa ${thread_headers:.hfa=.cfa}
 else
 headers =

libcfa/src/Makefile.in

-              re3bc51c
+              rbcd74f3
         $(am__nobase_cfa_include_HEADERS_DIST) $(am__DIST_COMMON)
 mkinstalldirs = $(install_sh) -d
+CONFIG_HEADER = $(top_builddir)/prelude/defines.hfa
 CONFIG_CLEAN_FILES =
 CONFIG_CLEAN_VPATH_FILES =
 …
 am__libcfathread_la_SOURCES_DIST =  \
         concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa \
         concurrency/invoke.c concurrency/preemption.cfa \
         concurrency/coroutine.cfa concurrency/thread.cfa \
         concurrency/kernel.cfa concurrency/monitor.cfa \
         concurrency/mutex.cfa
+        concurrency/invoke.c concurrency/io.cfa \
+        concurrency/preemption.cfa concurrency/coroutine.cfa \
+        concurrency/thread.cfa concurrency/kernel.cfa \
+        concurrency/monitor.cfa concurrency/mutex.cfa
 @BUILDLIB_TRUE@am__objects_3 = concurrency/coroutine.lo \
 @BUILDLIB_TRUE@ concurrency/thread.lo concurrency/kernel.lo \
 …
 @BUILDLIB_TRUE@ concurrency/CtxSwitch-@ARCHITECTURE@.lo \
 @BUILDLIB_TRUE@ concurrency/alarm.lo concurrency/invoke.lo \
+@BUILDLIB_TRUE@ concurrency/preemption.lo $(am__objects_3)
+@BUILDLIB_TRUE@ concurrency/io.lo concurrency/preemption.lo \
+@BUILDLIB_TRUE@ $(am__objects_3)
 am_libcfathread_la_OBJECTS = $(am__objects_4)
 libcfathread_la_OBJECTS = $(am_libcfathread_la_OBJECTS)
 …
 am__v_at_0 = @
 am__v_at_1 =
 DEFAULT_INCLUDES = -I.@am__isrc@
+DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)/prelude
 depcomp = $(SHELL) $(top_srcdir)/automake/depcomp
 am__depfiles_maybe = depfiles
 …
         containers/vector.hfa bitmanip.hfa math.hfa gmp.hfa time_t.hfa \
         bits/align.hfa bits/containers.hfa bits/defs.hfa \
         bits/debug.hfa bits/locks.hfa concurrency/coroutine.hfa \
         concurrency/thread.hfa concurrency/kernel.hfa \
         concurrency/monitor.hfa concurrency/mutex.hfa \
         concurrency/invoke.h
+        bits/debug.hfa bits/locks.hfa containers/list.hfa \
+        concurrency/coroutine.hfa concurrency/thread.hfa \
+        concurrency/kernel.hfa concurrency/monitor.hfa \
+        concurrency/mutex.hfa concurrency/invoke.h
 HEADERS = $(nobase_cfa_include_HEADERS)
 am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
 …
 # The built sources must not depend on the installed headers
 AM_CFAFLAGS = -quiet -cfalib -I$(srcdir)/stdhdr $(if $(findstring ${gdbwaittarget}, ${@}), -XCFA --gdb) @CONFIG_CFAFLAGS@
 AM_CFLAGS = -g -Wall -Wno-unused-function -fPIC -pthread @ARCH_FLAGS@ @CONFIG_CFLAGS@
+AM_CFLAGS = -g -Wall -Wno-unused-function -fPIC -fexceptions -pthread @ARCH_FLAGS@ @CONFIG_CFLAGS@
 AM_CCASFLAGS = -g -Wall -Wno-unused-function @ARCH_FLAGS@ @CONFIG_CFLAGS@
 @BUILDLIB_FALSE@headers_nosrc =
 #----------------------------------------------------------------------------------------------------------------
 @BUILDLIB_TRUE@headers_nosrc = bitmanip.hfa math.hfa gmp.hfa time_t.hfa bits/align.hfa bits/containers.hfa bits/defs.hfa bits/debug.hfa bits/locks.hfa
+@BUILDLIB_TRUE@headers_nosrc = bitmanip.hfa math.hfa gmp.hfa time_t.hfa bits/align.hfa bits/containers.hfa bits/defs.hfa bits/debug.hfa bits/locks.hfa containers/list.hfa
 @BUILDLIB_FALSE@headers =
 @BUILDLIB_TRUE@headers = fstream.hfa iostream.hfa iterator.hfa limits.hfa rational.hfa time.hfa stdlib.hfa common.hfa \
 …
 @BUILDLIB_FALSE@thread_headers =
 @BUILDLIB_TRUE@thread_headers = concurrency/coroutine.hfa concurrency/thread.hfa concurrency/kernel.hfa concurrency/monitor.hfa concurrency/mutex.hfa
 @BUILDLIB_TRUE@thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/preemption.cfa ${thread_headers:.hfa=.cfa}
+@BUILDLIB_TRUE@thread_libsrc = concurrency/CtxSwitch-@ARCHITECTURE@.S concurrency/alarm.cfa concurrency/invoke.c concurrency/io.cfa concurrency/preemption.cfa ${thread_headers:.hfa=.cfa}
 #----------------------------------------------------------------------------------------------------------------
 …
         concurrency/$(DEPDIR)/$(am__dirstamp)
 concurrency/invoke.lo: concurrency/$(am__dirstamp) \
+        concurrency/$(DEPDIR)/$(am__dirstamp)
+concurrency/io.lo: concurrency/$(am__dirstamp) \
         concurrency/$(DEPDIR)/$(am__dirstamp)
 concurrency/preemption.lo: concurrency/$(am__dirstamp) \

libcfa/src/bitmanip.hfa

-              re3bc51c
+              rbcd74f3
 // Created On       : Sat Mar 14 18:12:27 2020
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Mon Mar 16 14:28:46 2020
 // Update Count     : 49
+// Last Modified On : Sun Apr 19 22:29:58 2020
+// Update Count     : 121
 //
 …
 // Bits are numbered 1-N.
+#include <assert.h>
+//#include <assert.h>
+#define __bitsizeof( n ) (sizeof(n) * __CHAR_BIT__)
 static inline {
     // Count leading 0 bits.
     unsigned int cl0( unsigned char n ) { return n != 0 ? __builtin_clz( n ) - (sizeof(unsigned int) * __CHAR_BIT__ - sizeof(n) * __CHAR_BIT__) : sizeof(n) * __CHAR_BIT__; }
     unsigned int cl0( unsigned short int n ) { return n != 0 ? __builtin_clz( n ) - (sizeof(unsigned int) * __CHAR_BIT__ - sizeof(n) * __CHAR_BIT__) : sizeof(n) * __CHAR_BIT__; }
     unsigned int cl0( unsigned int n ) { return n != 0 ? __builtin_clz( n ) : sizeof(n) * __CHAR_BIT__; }
     unsigned int cl0( unsigned long int n ) { return n != 0 ? __builtin_clzl( n ) : sizeof(n) * __CHAR_BIT__; }
     unsigned int cl0( unsigned long long int n ) { return n != 0 ? __builtin_clzll( n ) : sizeof(n) * __CHAR_BIT__; }
+        // Count leading 0 bits.
+        unsigned int leading0s( unsigned char n ) { return n != 0 ? __builtin_clz( n ) - (__bitsizeof(unsigned int) - __bitsizeof(n)) : __bitsizeof(n); }
+        unsigned int leading0s( unsigned short int n ) { return n != 0 ? __builtin_clz( n ) - (__bitsizeof(unsigned int) - __bitsizeof(n)) : __bitsizeof(n); }
+        unsigned int leading0s( unsigned int n ) { return n != 0 ? __builtin_clz( n ) : __bitsizeof(n); }
+        unsigned int leading0s( unsigned long int n ) { return n != 0 ? __builtin_clzl( n ) : __bitsizeof(n); }
+        unsigned int leading0s( unsigned long long int n ) { return n != 0 ? __builtin_clzll( n ) : __bitsizeof(n); }
     // Count trailing 0 bits.
     unsigned int ct0( unsigned char n ) { return n != 0 ? __builtin_ctz( n ) : sizeof(n) * __CHAR_BIT__; }
     unsigned int ct0( unsigned short int n ) { return n != 0 ? __builtin_ctz( n ) : sizeof(n) * __CHAR_BIT__; }
     unsigned int ct0( unsigned int n ) { return n != 0 ? __builtin_ctz( n ) : sizeof(n) * __CHAR_BIT__; }
     unsigned int ct0( unsigned long int n ) { return n != 0 ? __builtin_ctzl( n ) : sizeof(n) * __CHAR_BIT__; }
     unsigned int ct0( unsigned long long int n ) { return n != 0 ? __builtin_ctzll( n ) : sizeof(n) * __CHAR_BIT__; }
+        // Count trailing 0 bits.
+        unsigned int trailing0s( unsigned char n ) { return n != 0 ? __builtin_ctz( n ) : __bitsizeof(n); }
+        unsigned int trailing0s( unsigned short int n ) { return n != 0 ? __builtin_ctz( n ) : __bitsizeof(n); }
+        unsigned int trailing0s( unsigned int n ) { return n != 0 ? __builtin_ctz( n ) : __bitsizeof(n); }
+        unsigned int trailing0s( unsigned long int n ) { return n != 0 ? __builtin_ctzl( n ) : __bitsizeof(n); }
+        unsigned int trailing0s( unsigned long long int n ) { return n != 0 ? __builtin_ctzll( n ) : __bitsizeof(n); }
     // Count all 1 bits.
     unsigned int ca1( unsigned char n ) { return __builtin_popcount( n ); }
     unsigned int ca1( unsigned short int n ) { return __builtin_popcount( n ); }
     unsigned int ca1( unsigned int n ) { return __builtin_popcount( n ); }
     unsigned int ca1( unsigned long int n ) { return __builtin_popcountl( n ); }
     unsigned int ca1( unsigned long long int n ) { return __builtin_popcountll( n ); }
+        // Count all 1 bits.
+        unsigned int all1s( unsigned char n ) { return __builtin_popcount( n ); }
+        unsigned int all1s( unsigned short int n ) { return __builtin_popcount( n ); }
+        unsigned int all1s( unsigned int n ) { return __builtin_popcount( n ); }
+        unsigned int all1s( unsigned long int n ) { return __builtin_popcountl( n ); }
+        unsigned int all1s( unsigned long long int n ) { return __builtin_popcountll( n ); }
     // Count all 0 bits.
     unsigned int ca0( unsigned char n ) { return sizeof(n) * __CHAR_BIT__ - __builtin_popcount( n ); }
     unsigned int ca0( unsigned short int n ) { return sizeof(n) * __CHAR_BIT__ - __builtin_popcount( n ); }
     unsigned int ca0( unsigned int n ) { return sizeof(n) * __CHAR_BIT__ - __builtin_popcount( n ); }
     unsigned int ca0( unsigned long int n ) { return sizeof(n) * __CHAR_BIT__ - __builtin_popcountl( n ); }
     unsigned int ca0( unsigned long long int n ) { return sizeof(n) * __CHAR_BIT__ - __builtin_popcountll( n ); }
+        // Count all 0 bits.
+        unsigned int all0s( unsigned char n ) { return __bitsizeof(n) - __builtin_popcount( n ); }
+        unsigned int all0s( unsigned short int n ) { return __bitsizeof(n) - __builtin_popcount( n ); }
+        unsigned int all0s( unsigned int n ) { return __bitsizeof(n) - __builtin_popcount( n ); }
+        unsigned int all0s( unsigned long int n ) { return __bitsizeof(n) - __builtin_popcountl( n ); }
+        unsigned int all0s( unsigned long long int n ) { return __bitsizeof(n) - __builtin_popcountll( n ); }
+    // Find least significiant set bit. (ffs)
+    unsigned int fls( unsigned int n ) { return __builtin_ffs( n ); }
+    unsigned int fls( unsigned long int n ) { return __builtin_ffsl( n ); }
+    unsigned int fls( unsigned long long int n ) { return __builtin_ffsll( n ); }
+        // Find least significiant zero bit. (ffs)
+        unsigned int low0( unsigned char n ) { return __builtin_ffs( (typeof(n))~n ); }
+        unsigned int low0( unsigned short int n ) { return __builtin_ffs( (typeof(n))~n ); }
+        unsigned int low0( unsigned int n ) { return __builtin_ffs( ~n ); }
+        unsigned int low0( unsigned long int n ) { return __builtin_ffsl( ~n ); }
+        unsigned int low0( unsigned long long int n ) { return __builtin_ffsll( ~n ); }
+    // Find most significiant set bit.
+    unsigned int fms( unsigned char n ) { return n != 0 ? sizeof(unsigned int) * __CHAR_BIT__ - __builtin_clz( n ) : 0; }
+    unsigned int fms( unsigned short int n ) { return n != 0 ? sizeof(unsigned int) * __CHAR_BIT__ - __builtin_clz( n ) : 0; }
+    unsigned int fms( unsigned int n ) { return n != 0 ? sizeof(n) * __CHAR_BIT__ - __builtin_clz( n ) : 0; }
+    unsigned int fms( unsigned long int n ) { return n != 0 ? sizeof(n) * __CHAR_BIT__ - __builtin_clzl( n ) : 0; }
+    unsigned int fms( unsigned long long int n ) { return n != 0 ? sizeof(n) * __CHAR_BIT__ - __builtin_clzll( n ) : 0; }
+        // Find least significiant one bit.
+        unsigned int low1( unsigned int n ) { return __builtin_ffs( n ); }
+        unsigned int low1( unsigned long int n ) { return __builtin_ffsl( n ); }
+        unsigned int low1( unsigned long long int n ) { return __builtin_ffsll( n ); }
+    // Check for power of 2
+    bool pow2( unsigned long int value ) {
+                return (value & (value - 1)) == 0;                              // clears bits below value, rounding down to the next lower multiple of value
+    } // pow2
+        // Find most significiant zero bit.
+        unsigned int high0( unsigned char n ) { return n == (typeof(n))-1 ? 0 : __bitsizeof(unsigned int) - __builtin_clz( (typeof(n))~n ); }
+        unsigned int high0( unsigned short int n ) { return n == (typeof(n))-1 ? 0 : __bitsizeof(unsigned int) - __builtin_clz( (typeof(n))~n ); }
+        unsigned int high0( unsigned int n ) { return n == -1 ? 0 : __bitsizeof(n) - __builtin_clz( ~n ); }
+        unsigned int high0( unsigned long int n ) { return n == -1 ? 0 : __bitsizeof(n) - __builtin_clzl( ~n ); }
+        unsigned int high0( unsigned long long int n ) { return n == -1 ? 0 : __bitsizeof(n) - __builtin_clzll( ~n ); }
+    // Returns value aligned at the floor of align.
+    unsigned long int floor( unsigned long int value, unsigned long int align ) {
+                assert( pow2( align ) );
+                return value & -align;                                                  // clear bits above or equal to align, giving value % align
+    } // floor
+        // Find most significiant one bit.
+        unsigned int high1( unsigned char n ) { return n == 0 ? 0 : __bitsizeof(unsigned int) - __builtin_clz( n ); }
+        unsigned int high1( unsigned short int n ) { return n == 0 ? 0 : __bitsizeof(unsigned int) - __builtin_clz( n ); }
+        unsigned int high1( unsigned int n ) { return n == 0 ? 0 : __bitsizeof(n) - __builtin_clz( n ); }
+        unsigned int high1( unsigned long int n ) { return n == 0 ? 0 : __bitsizeof(n) - __builtin_clzl( n ); }
+        unsigned int high1( unsigned long long int n ) { return n == 0 ? 0 : __bitsizeof(n) - __builtin_clzll( n ); }
+    // Returns value aligned at the ceiling of align.
+    unsigned long int ceiling( unsigned long int value, unsigned long int align ) {
+                assert( pow2( align ) );
+                return -floor( -value, align );                                 // negate, round down, negate is the same as round up
+    } // ceiling
+}
+        // Check for power of 2, clears bits below n, rounding down to the next lower multiple of n.  0 is not a power of 2
+        // but this computation returns true because of the two's complement, so it is a special case.
+        bool is_pow2( unsigned char n ) { return n == 0 ? false : (n & (n - 1)) == 0; }
+        bool is_pow2( unsigned short int n ) { return n == 0 ? false : (n & (n - 1)) == 0; }
+        bool is_pow2( unsigned int n ) { return n == 0 ? false : (n & (n - 1)) == 0; }
+        bool is_pow2( unsigned long int n ) { return n == 0 ? false : (n & (n - 1)) == 0; }
+        bool is_pow2( unsigned long long int n ) { return n == 0 ? false : (n & (n - 1)) == 0; }
+        // Returns n aligned at the floor of align, clear bits above or equal to align, giving n % align.
+        signed char floor2( signed char n, char align ) { /*assert( is_pow2( align ) );*/ return n & -align; }
+        unsigned char floor2( unsigned char n, unsigned char align ) { /*assert( is_pow2( align ) );*/ return n & -align; }
+        short int floor2( short int n, short int align ) { /*assert( is_pow2( align ) );*/ return n & -align; }
+        unsigned short int floor2( unsigned short int n, unsigned short int align ) { /*assert( is_pow2( align ) );*/ return n & -align; }
+        int floor2( int n, int align ) { /*assert( is_pow2( align ) );*/ return n & -align; }
+        unsigned int floor2( unsigned int n, unsigned int align ) { /*assert( is_pow2( align ) );*/ return n & -align; }
+        long int floor2( long int n, long int align ) { /*assert( is_pow2( align ) );*/ return n & -align; }
+        unsigned long int floor2( unsigned long int n, unsigned long int align ) { /*assert( is_pow2( align ) );*/ return n & -align; }
+        long long int floor2( long long int n, long long int align ) { /*assert( is_pow2( align ) );*/ return n & -align; }
+        unsigned long long int floor2( unsigned long long int n, unsigned long long int align ) { /*assert( is_pow2( align ) );*/ return n & -align; }
+        // forall( otype T | { T ?&?( T, T ); T -?( T ); } )
+        // T floor2( T n, T align ) { /* assert( is_pow2( align ) ); */ return n & -align; }
+        signed char floor( signed char n, char align ) { return n / align * align; }
+        unsigned char floor( unsigned char n, unsigned char align ) { return n / align * align; }
+        short int floor( short int n, short int align ) { return n / align * align; }
+        unsigned short int floor( unsigned short int n, unsigned short int align ) { return n / align * align; }
+        int floor( int n, int align ) { return n / align * align; }
+        unsigned int floor( unsigned int n, unsigned int align ) { return n / align * align; }
+        long int floor( long int n, long int align ) { return n / align * align; }
+        unsigned long int floor( unsigned long int n, unsigned long int align ) { return n / align * align; }
+        long long int floor( long long int n, long long int align ) { return n / align * align; }
+        unsigned long long int floor( unsigned long long int n, unsigned long long int align ) { return n / align * align; }
+        // forall( otype T | { T ?/?( T, T ); T ?*?( T, T ); } )
+        // T floor( T n, T align ) { return n / align * align; }
+        // Returns n aligned at the ceiling of align, negate, round down, negate is the same as round up.
+        signed char ceiling2( signed char n, char align ) { /*assert( is_pow2( align ) );*/ return -floor2( -n, align ); }
+        unsigned char ceiling2( unsigned char n, unsigned char align ) { /*assert( is_pow2( align ) );*/ return -floor2( -n, align ); }
+        short int ceiling2( short int n, short int align ) { /*assert( is_pow2( align ) );*/ return -floor2( -n, align ); }
+        unsigned short int ceiling2( unsigned short int n, unsigned short int align ) { /*assert( is_pow2( align ) );*/ return -floor2( -n, align ); }
+        int ceiling2( int n, int align ) { /*assert( is_pow2( align ) );*/ return -floor2( -n, align ); }
+        unsigned int ceiling2( unsigned int n, unsigned int align ) { /*assert( is_pow2( align ) );*/ return -floor2( -n, align ); }
+        long int ceiling2( long int n, long int align ) { /*assert( is_pow2( align ) );*/ return -floor2( -n, align ); }
+        unsigned long int ceiling2( unsigned long int n, unsigned long int align ) { /*assert( is_pow2( align ) );*/ return -floor2( -n, align ); }
+        long long int ceiling2( long long int n, long long int align ) { /*assert( is_pow2( align ) );*/ return -floor2( -n, align ); }
+        unsigned long long int ceiling2( unsigned long long int n, unsigned long long int align ) { /*assert( is_pow2( align ) );*/ return -floor2( -n, align ); }
+        // forall( otype T | { T floor2( T, T ); T -?( T ); } )
+        // T ceiling2( T n, T align ) { /* assert( is_pow2( align ) ); */ return -floor2( -n, align ); }
+        signed char ceiling( signed char n, char align ) { return (n + (align - 1)) / align; }
+        unsigned char ceiling( unsigned char n, unsigned char align ) { return (n + (align - 1)) / align; }
+        short int ceiling( short int n, short int align ) { return (n + (align - 1)) / align; }
+        unsigned short int ceiling( unsigned short int n, unsigned short int align ) { return (n + (align - 1)) / align; }
+        int ceiling( int n, int align ) { return (n + (align - 1)) / align; }
+        unsigned int ceiling( unsigned int n, unsigned int align ) { return (n + (align - 1)) / align; }
+        long int ceiling( long int n, long int align ) { return (n + (align - 1)) / align; }
+        unsigned long int ceiling( unsigned long int n, unsigned long int align ) { return (n + (align - 1)) / align; }
+        long long int ceiling( long long int n, long long int align ) { return (n + (align - 1)) / align; }
+        unsigned long long int ceiling( unsigned long long int n, unsigned long long int align ) { return (n + (align - 1)) / align; }
+        // forall( otype T | { void ?{}( T &, one_t ); T ?+?( T, T ); T ?-?( T, T ); T ?/?( T, T ); } )
+        // T ceiling( T n, T align ) { return (n + (align - (T){1})) / align; }
+} // distribution
 // Local Variables: //

libcfa/src/bits/debug.hfa

-              re3bc51c
+              rbcd74f3
 // Author           : Thierry Delisle
 // Created On       : Mon Nov 28 12:27:26 2016
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Feb  4 12:29:21 2020
 // Update Count     : 9
+// Last Modified By : Andrew Beach
+// Last Modified On : Mon Apr 27 10:15:00 2020
+// Update Count     : 10
 //
 …
 #endif
         #include <stdarg.h>
-        #include <stdio.h>
         extern void __cfaabi_bits_write( int fd, const char buffer[], int len );
 …
         extern void __cfaabi_bits_print_vararg( int fd, const char fmt[], va_list arg );
         extern void __cfaabi_bits_print_buffer( int fd, char buffer[], int buffer_size, const char fmt[], ... ) __attribute__(( format(printf, 4, 5) ));
+#if defined(__CFA_DEBUG_PRINT__) \
+                || defined(__CFA_DEBUG_PRINT_IO__) || defined(__CFA_DEBUG_PRINT_IO_CORE__) \
+                || defined(__CFA_DEBUG_PRINT_MONITOR__) || defined(__CFA_DEBUG_PRINT_PREEMPTION__) \
+                || defined(__CFA_DEBUG_PRINT_RUNTIME_CORE__) || defined(__CFA_DEBUG_PRINT_EXCEPTION__)
+        #include <stdio.h>
+        #include <unistd.h>
+#endif
 #ifdef __cforall
+}
 #endif
+// Deprecated: Use the versions with the new module names.
 #ifdef __CFA_DEBUG_PRINT__
         #define __cfaabi_dbg_write( buffer, len )         __cfaabi_bits_write( STDERR_FILENO, buffer, len )
         #define __cfaabi_dbg_acquire()                    __cfaabi_bits_acquire()
         #define __cfaabi_dbg_release()                    __cfaabi_bits_release()
         #define __cfaabi_dbg_print_safe(...)              __cfaabi_bits_print_safe   (__VA_ARGS__)
         #define __cfaabi_dbg_print_nolock(...)            __cfaabi_bits_print_nolock (__VA_ARGS__)
         #define __cfaabi_dbg_print_buffer(...)            __cfaabi_bits_print_buffer (__VA_ARGS__)
         #define __cfaabi_dbg_print_buffer_decl(...)       char __dbg_text[256]; int __dbg_len = snprintf( __dbg_text, 256, __VA_ARGS__ ); __cfaabi_bits_write( __dbg_text, __dbg_len );
         #define __cfaabi_dbg_print_buffer_local(...)      __dbg_len = snprintf( __dbg_text, 256, __VA_ARGS__ ); __cfaabi_dbg_write( __dbg_text, __dbg_len );
+        #define __cfaabi_dbg_print_safe(...)              __cfaabi_bits_print_safe   ( STDERR_FILENO, __VA_ARGS__ )
+        #define __cfaabi_dbg_print_nolock(...)            __cfaabi_bits_print_nolock ( STDERR_FILENO, __VA_ARGS__ )
+        #define __cfaabi_dbg_print_buffer(...)            __cfaabi_bits_print_buffer ( STDERR_FILENO, __VA_ARGS__ )
+        #define __cfaabi_dbg_print_buffer_decl(...)       char __dbg_text[256]; int __dbg_len = snprintf( __dbg_text, 256, __VA_ARGS__ ); __cfaabi_bits_write( STDERR_FILENO, __dbg_text, __dbg_len );
+        #define __cfaabi_dbg_print_buffer_local(...)      __dbg_len = snprintf( __dbg_text, 256, __VA_ARGS__ ); __cfaabi_dbg_write( STDERR_FILENO, __dbg_text, __dbg_len );
 #else
         #define __cfaabi_dbg_write(...)               ((void)0)
 …
 #endif
+// Debug print functions and statements:
+// Most are wrappers around the bits printing function but are not always used.
+// If they are used depends if the group (first argument) is active or not. The group must be one
+// defined belowe. The other arguments depend on the wrapped function.
+#define __cfadbg_write(group, buffer, len) \
+        __CFADBG_PRINT_GROUP_##group(__cfaabi_bits_write(STDERR_FILENO, buffer, len))
+#define __cfadbg_acquire(group) \
+        __CFADBG_PRINT_GROUP_##group(__cfaabi_bits_acquire())
+#define __cfadbg_release(group) \
+        __CFADBG_PRINT_GROUP_##group(__cfaabi_bits_release())
+#define __cfadbg_print_safe(group, ...) \
+        __CFADBG_PRINT_GROUP_##group(__cfaabi_bits_print_safe(STDERR_FILENO, __VA_ARGS__))
+#define __cfadbg_print_nolock(group, ...) \
+        __CFADBG_PRINT_GROUP_##group(__cfaabi_bits_print_nolock(STDERR_FILENO, __VA_ARGS__))
+#define __cfadbg_print_buffer(group, ...) \
+        __CFADBG_PRINT_GROUP_##group(__cfaabi_bits_print_buffer(STDERR_FILENO, __VA_ARGS__))
+#define __cfadbg_print_buffer_decl(group, ...) \
+        __CFADBG_PRINT_GROUP_##group(char __dbg_text[256]; int __dbg_len = snprintf( __dbg_text, 256, __VA_ARGS__ ); __cfaabi_bits_write( __dbg_text, __dbg_len ))
+#define __cfadbg_print_buffer_local(group, ...) \
+        __CFADBG_PRINT_GROUP_##group(__dbg_len = snprintf( __dbg_text, 256, __VA_ARGS__ ); __cfaabi_bits_write(STDERR_FILENO, __dbg_text, __dbg_len))
+// The debug print groups:
+#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_IO__)
+#       define __CFADBG_PRINT_GROUP_io(...) __VA_ARGS__
+#else
+#       define __CFADBG_PRINT_GROUP_io(...) ((void)0)
+#endif
+#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_IO__) || defined(__CFA_DEBUG_PRINT_IO_CORE__)
+#       define __CFADBG_PRINT_GROUP_io_core(...) __VA_ARGS__
+#else
+#       define __CFADBG_PRINT_GROUP_io_core(...) ((void)0)
+#endif
+#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_MONITOR__)
+#       define __CFADBG_PRINT_GROUP_monitor(...) __VA_ARGS__
+#else
+#       define __CFADBG_PRINT_GROUP_monitor(...) ((void)0)
+#endif
+#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_PREEMPTION__)
+#       define __CFADBG_PRINT_GROUP_preemption(...) __VA_ARGS__
+#else
+#       define __CFADBG_PRINT_GROUP_preemption(...) ((void)0)
+#endif
+#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_RUNTIME_CORE__)
+#       define __CFADBG_PRINT_GROUP_runtime_core(...) __VA_ARGS__
+#else
+#       define __CFADBG_PRINT_GROUP_runtime_core(...) ((void)0)
+#endif
+#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_READY_QUEUE__)
+#       define __CFADBG_PRINT_GROUP_ready_queue(...) __VA_ARGS__
+#else
+#       define __CFADBG_PRINT_GROUP_ready_queue(...) ((void)0)
+#endif
+#if defined(__CFA_DEBUG_PRINT__) || defined(__CFA_DEBUG_PRINT_EXCEPTION__)
+#       define __CFADBG_PRINT_GROUP_exception(...) __VA_ARGS__
+#else
+#       define __CFADBG_PRINT_GROUP_exception(...) ((void)0)
+#endif
 // Local Variables: //
 // mode: c //

libcfa/src/bits/locks.hfa

-              re3bc51c
+              rbcd74f3
         #endif
+        extern "C" {
+                char * strerror(int);
+        }
+        #define CHECKED(x) { int err = x; if( err != 0 ) abort("KERNEL ERROR: Operation \"" #x "\" return error %d - %s\n", err, strerror(err)); }
         struct __bin_sem_t {
-                bool                    signaled;
                 pthread_mutex_t         lock;
                 pthread_cond_t          cond;
+                int                     val;
         };
         static inline void ?{}(__bin_sem_t & this) with( this ) {
+                signaled = false;
+                pthread_mutex_init(&lock, NULL);
+                pthread_cond_init (&cond, NULL);
+                // Create the mutex with error checking
+                pthread_mutexattr_t mattr;
+                pthread_mutexattr_init( &mattr );
+                pthread_mutexattr_settype( &mattr, PTHREAD_MUTEX_ERRORCHECK_NP);
+                pthread_mutex_init(&lock, &mattr);
+                pthread_cond_init (&cond, 0p);
+                val = 0;
+        }
         static inline void ^?{}(__bin_sem_t & this) with( this ) {
                 pthread_mutex_destroy(&lock);
                 pthread_cond_destroy (&cond);
+                CHECKED( pthread_mutex_destroy(&lock) );
+                CHECKED( pthread_cond_destroy (&cond) );
+        }
         static inline void wait(__bin_sem_t & this) with( this ) {
                 verify(__cfaabi_dbg_in_kernel());
                 pthread_mutex_lock(&lock);
                         if(!signaled) {   // this must be a loop, not if!
+                CHECKED( pthread_mutex_lock(&lock) );
+                        while(val < 1) {
                                 pthread_cond_wait(&cond, &lock);
+                        }
                         signaled = false;
                 pthread_mutex_unlock(&lock);
+                        val -= 1;
+                CHECKED( pthread_mutex_unlock(&lock) );
+        }
         static inline bool post(__bin_sem_t & this) with( this ) {
+                pthread_mutex_lock(&lock);
+                        bool needs_signal = !signaled;
+                        signaled = true;
+                pthread_mutex_unlock(&lock);
+                bool needs_signal = false;
+                if (needs_signal) pthread_cond_signal(&cond);
+                CHECKED( pthread_mutex_lock(&lock) );
+                        if(val < 1) {
+                                val += 1;
+                                pthread_cond_signal(&cond);
+                                needs_signal = true;
+                        }
+                CHECKED( pthread_mutex_unlock(&lock) );
                 return needs_signal;
+        }
+        #undef CHECKED
 #endif

libcfa/src/bits/signal.hfa

re3bc51c	rbcd74f3
54	54	sig, handler, flags, errno, strerror( errno )
55	55	);
56		_exit( EXIT_FAILURE );
	56	_Exit( EXIT_FAILURE );
57	57	} // if
58	58	}

libcfa/src/concurrency/alarm.cfa

-              re3bc51c
+              rbcd74f3
         this.alarm = alarm;
         this.period = period;
-        next = 0;
         set = false;
         kernel_alarm = false;
 …
         this.alarm = alarm;
         this.period = period;
-        next = 0;
         set = false;
         kernel_alarm = true;
 …
+}
+#if !defined(NDEBUG) && (defined(__CFA_DEBUG__) || defined(__CFA_VERIFY__))
+bool validate( alarm_list_t * this ) {
+        alarm_node_t ** it = &this->head;
+        while( (*it) ) {
+                it = &(*it)->next;
+void insert( alarm_list_t * this, alarm_node_t * n ) {
+        alarm_node_t * it = & (*this)`first;
+        while( it && (n->alarm > it->alarm) ) {
+                it = & (*it)`next;
+        }
+        if ( it ) {
+                insert_before( *it, *n );
+        } else {
+                insert_last(*this, *n);
+        }
+        return it == this->tail;
+}
+#endif
+static inline void insert_at( alarm_list_t * this, alarm_node_t * n, __alarm_it_t p ) {
+        verify( !n->next );
+        if( p == this->tail ) {
+                this->tail = &n->next;
+        }
+        else {
+                n->next = *p;
+        }
+        *p = n;
+        verify( validate( this ) );
+}
+void insert( alarm_list_t * this, alarm_node_t * n ) {
+        alarm_node_t ** it = &this->head;
+        while( (*it) && (n->alarm > (*it)->alarm) ) {
+                it = &(*it)->next;
+        }
+        insert_at( this, n, it );
+        verify( validate( this ) );
+        verify( validate( *this ) );
+}
 alarm_node_t * pop( alarm_list_t * this ) {
+        alarm_node_t * head = this->head;
+        verify( validate( *this ) );
+        alarm_node_t * head = & (*this)`first;
         if( head ) {
+                this->head = head->next;
+                if( !head->next ) {
+                        this->tail = &this->head;
+                }
+                head->next = 0p;
+                remove(*head);
+        }
         verify( validate( this ) );
+        verify( validate( *this ) );
         return head;
+}
-static inline void remove_at( alarm_list_t * this, alarm_node_t * n, __alarm_it_t it ) {
-        verify( it );
-        verify( (*it) == n );
-        (*it) = n->next;
-        if( !n-> next ) {
-                this->tail = it;
+        }
-        n->next = 0p;
-        verify( validate( this ) );
+}
-static inline void remove( alarm_list_t * this, alarm_node_t * n ) {
-        alarm_node_t ** it = &this->head;
-        while( (*it) && (*it) != n ) {
-                it = &(*it)->next;
+        }
-        verify( validate( this ) );
-        if( *it ) { remove_at( this, n, it ); }
-        verify( validate( this ) );
+}
 void register_self( alarm_node_t * this ) {
         alarm_list_t * alarms = &event_kernel->alarms;
+        alarm_list_t & alarms = event_kernel->alarms;
         disable_interrupts();
 …
+        {
                 verify( validate( alarms ) );
                 bool first = !alarms->head;
+                bool first = ! & alarms`first;
                 insert( alarms, this );
+                insert( &alarms, this );
                 if( first ) {
                         __kernel_set_timer( alarms->head->alarm - __kernel_get_time() );
+                        __kernel_set_timer( alarms`first.alarm - __kernel_get_time() );
+                }
+        }
 …
         lock( event_kernel->lock __cfaabi_dbg_ctx2 );
+        {
                 verify( validate( &event_kernel->alarms ) );
                 remove( &event_kernel->alarms, this );
+                verify( validate( event_kernel->alarms ) );
+                remove( *this );
+        }
         unlock( event_kernel->lock );
 …
+}
+//=============================================================================================
+// Utilities
+//=============================================================================================
+void sleep( Duration duration ) {
+        alarm_node_t node = { active_thread(), __kernel_get_time() + duration, 0`s };
+        register_self( &node );
+        park( __cfaabi_dbg_ctx );
+        /* paranoid */ verify( !node.set );
+        /* paranoid */ verify( & node`next == 0p );
+        /* paranoid */ verify( & node`prev == 0p );
+}
 // Local Variables: //
 // mode: c //

libcfa/src/concurrency/alarm.hfa

-              re3bc51c
+              rbcd74f3
 #include "time.hfa"
+#include <containers/list.hfa>
 struct $thread;
 struct processor;
 …
         Time alarm;                             // time when alarm goes off
         Duration period;                        // if > 0 => period of alarm
+        alarm_node_t * next;            // intrusive link list field
+        DLISTED_MGD_IMPL_IN(alarm_node_t)
         union {
 …
         bool kernel_alarm       :1;             // true if this is not a user defined alarm
 };
+typedef alarm_node_t ** __alarm_it_t;
+DLISTED_MGD_IMPL_OUT(alarm_node_t)
 void ?{}( alarm_node_t & this, $thread * thrd, Time alarm, Duration period );
 …
 void ^?{}( alarm_node_t & this );
+struct alarm_list_t {
+        alarm_node_t * head;
+        __alarm_it_t tail;
+};
+static inline void ?{}( alarm_list_t & this ) with( this ) {
+        head = 0;
+        tail = &head;
+}
+typedef dlist(alarm_node_t, alarm_node_t) alarm_list_t;
 void insert( alarm_list_t * this, alarm_node_t * n );

libcfa/src/concurrency/kernel.cfa

-              re3bc51c
+              rbcd74f3
 #define __cforall_thread__
+// #define __CFA_DEBUG_PRINT_RUNTIME_CORE__
 //C Includes
 …
 #include "invoke.h"
 //-----------------------------------------------------------------------------
 // Some assembly required
 …
         idle{};
         __cfaabi_dbg_print_safe("Kernel : Starting core %p\n", &this);
+        __cfadbg_print_safe(runtime_core, "Kernel : Starting core %p\n", &this);
         this.stack = __create_pthread( &this.kernel_thread, __invoke_processor, (void *)&this );
         __cfaabi_dbg_print_safe("Kernel : core %p started\n", &this);
+        __cfadbg_print_safe(runtime_core, "Kernel : core %p created\n", &this);
+}
 void ^?{}(processor & this) with( this ){
         if( ! __atomic_load_n(&do_terminate, __ATOMIC_ACQUIRE) ) {
                 __cfaabi_dbg_print_safe("Kernel : core %p signaling termination\n", &this);
+                __cfadbg_print_safe(runtime_core, "Kernel : core %p signaling termination\n", &this);
                 __atomic_store_n(&do_terminate, true, __ATOMIC_RELAXED);
 …
+        }
+        pthread_join( kernel_thread, 0p );
+        int err = pthread_join( kernel_thread, 0p );
+        if( err != 0 ) abort("KERNEL ERROR: joining processor %p caused error %s\n", &this, strerror(err));
         free( this.stack );
+}
 void ?{}(cluster & this, const char name[], Duration preemption_rate) with( this ) {
+void ?{}(cluster & this, const char name[], Duration preemption_rate, int io_flags) with( this ) {
         this.name = name;
         this.preemption_rate = preemption_rate;
 …
         ready_queue_lock{};
+        #if !defined(__CFA_NO_STATISTICS__)
+                print_stats = false;
+        #endif
         procs{ __get };
         idles{ __get };
         threads{ __get };
+        __kernel_io_startup( this, io_flags, &this == mainCluster );
         doregister(this);
+}
 void ^?{}(cluster & this) {
+        __kernel_io_shutdown( this, &this == mainCluster );
         unregister(this);
+}
 …
         verify(this);
         __cfaabi_dbg_print_safe("Kernel : core %p starting\n", this);
+        __cfadbg_print_safe(runtime_core, "Kernel : core %p starting\n", this);
         doregister(this->cltr, this);
 …
                 preemption_scope scope = { this };
                 __cfaabi_dbg_print_safe("Kernel : core %p started\n", this);
+                __cfadbg_print_safe(runtime_core, "Kernel : core %p started\n", this);
                 $thread * readyThread = 0p;
 …
+                }
                 __cfaabi_dbg_print_safe("Kernel : core %p stopping\n", this);
+                __cfadbg_print_safe(runtime_core, "Kernel : core %p stopping\n", this);
+        }
 …
         V( this->terminated );
         __cfaabi_dbg_print_safe("Kernel : core %p terminated\n", this);
+        __cfadbg_print_safe(runtime_core, "Kernel : core %p terminated\n", this);
         // HACK : the coroutine context switch expects this_thread to be set
 …
         //We now have a proper context from which to schedule threads
         __cfaabi_dbg_print_safe("Kernel : core %p created (%p, %p)\n", proc, &proc->runner, &ctx);
+        __cfadbg_print_safe(runtime_core, "Kernel : core %p created (%p, %p)\n", proc, &proc->runner, &ctx);
         // SKULLDUGGERY: Since the coroutine doesn't have its own stack, we can't
 …
         // Main routine of the core returned, the core is now fully terminated
         __cfaabi_dbg_print_safe("Kernel : core %p main ended (%p)\n", proc, &proc->runner);
+        __cfadbg_print_safe(runtime_core, "Kernel : core %p main ended (%p)\n", proc, &proc->runner);
         return 0p;
 …
+}
+void unpark( $thread * thrd __cfaabi_dbg_ctx_param2 ) {
+        if( !thrd ) return;
+        disable_interrupts();
+// KERNEL ONLY unpark with out disabling interrupts
+void __unpark( $thread * thrd __cfaabi_dbg_ctx_param2 ) {
         static_assert(sizeof(thrd->state) == sizeof(int));
 …
                         abort();
+        }
+}
+void unpark( $thread * thrd __cfaabi_dbg_ctx_param2 ) {
+        if( !thrd ) return;
+        disable_interrupts();
+        __unpark( thrd __cfaabi_dbg_ctx_fwd2 );
         enable_interrupts( __cfaabi_dbg_ctx );
+}
 …
 static void __kernel_startup(void) {
         verify( ! kernelTLS.preemption_state.enabled );
         __cfaabi_dbg_print_safe("Kernel : Starting\n");
+        __cfadbg_print_safe(runtime_core, "Kernel : Starting\n");
         __page_size = sysconf( _SC_PAGESIZE );
 …
         (*mainCluster){"Main Cluster"};
         __cfaabi_dbg_print_safe("Kernel : Main cluster ready\n");
+        __cfadbg_print_safe(runtime_core, "Kernel : Main cluster ready\n");
         // Start by initializing the main thread
 …
         (*mainThread){ &info };
         __cfaabi_dbg_print_safe("Kernel : Main thread ready\n");
+        __cfadbg_print_safe(runtime_core, "Kernel : Main thread ready\n");
 …
                 runner{ &this };
                 __cfaabi_dbg_print_safe("Kernel : constructed main processor context %p\n", &runner);
+                __cfadbg_print_safe(runtime_core, "Kernel : constructed main processor context %p\n", &runner);
+        }
 …
         // THE SYSTEM IS NOW COMPLETELY RUNNING
+        __cfaabi_dbg_print_safe("Kernel : Started\n--------------------------------------------------\n\n");
+        // Now that the system is up, finish creating systems that need threading
+        __kernel_io_finish_start( *mainCluster );
+        __cfadbg_print_safe(runtime_core, "Kernel : Started\n--------------------------------------------------\n\n");
         verify( ! kernelTLS.preemption_state.enabled );
 …
 static void __kernel_shutdown(void) {
+        __cfaabi_dbg_print_safe("\n--------------------------------------------------\nKernel : Shutting down\n");
+        //Before we start shutting things down, wait for systems that need threading to shutdown
+        __kernel_io_prepare_stop( *mainCluster );
         /* paranoid */ verify( TL_GET( preemption_state.enabled ) );
         disable_interrupts();
         /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+        __cfadbg_print_safe(runtime_core, "\n--------------------------------------------------\nKernel : Shutting down\n");
         // SKULLDUGGERY: Notify the mainProcessor it needs to terminates.
 …
         // Destroy the main processor and its context in reverse order of construction
         // These were manually constructed so we need manually destroy them
+        void ^?{}(processor & this) with( this ){
+                /* paranoid */ verify( this.do_terminate == true );
+        }
         ^(*mainProcessor){};
 …
         ^(*mainThread){};
+        ^(*mainCluster){};
         ^(__cfa_dbg_global_clusters.list){};
         ^(__cfa_dbg_global_clusters.lock){};
         __cfaabi_dbg_print_safe("Kernel : Shutdown complete\n");
+        __cfadbg_print_safe(runtime_core, "Kernel : Shutdown complete\n");
+}
 …
         // We are ready to sleep
         __cfaabi_dbg_print_safe("Kernel : Processor %p ready to sleep\n", this);
+        __cfadbg_print_safe(runtime_core, "Kernel : Processor %p ready to sleep\n", this);
         wait( idle );
         // We have woken up
         __cfaabi_dbg_print_safe("Kernel : Processor %p woke up and ready to run\n", this);
+        __cfadbg_print_safe(runtime_core, "Kernel : Processor %p woke up and ready to run\n", this);
         // Get ourself off the idle list
 …
 static bool __wake_one(cluster * this, __attribute__((unused)) bool force) {
         // if we don't want to force check if we know it's false
         if( !this->idles.head && !force ) return false;
+        // if( !this->idles.head && !force ) return false;
         // First, lock the cluster idle
 …
         // Wake them up
+        __cfadbg_print_safe(runtime_core, "Kernel : waking Processor %p\n", this->idles.head);
+        /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
         post( this->idles.head->idle );
 …
 // Unconditionnaly wake a thread
 static bool __wake_proc(processor * this) {
+        return post( this->idle );
+        __cfadbg_print_safe(runtime_core, "Kernel : waking Processor %p\n", this);
+        disable_interrupts();
+                /* paranoid */ verify( ! kernelTLS.preemption_state.enabled );
+                bool ret = post( this->idle );
+        enable_interrupts( __cfaabi_dbg_ctx );
+        return ret;
+}
 …
 void ^?{}(semaphore & this) {}
 void P(semaphore & this) with( this ){
+bool P(semaphore & this) with( this ){
         lock( lock __cfaabi_dbg_ctx2 );
         count -= 1;
 …
                 unlock( lock );
                 park( __cfaabi_dbg_ctx );
+                return true;
+        }
         else {
             unlock( lock );
+            return false;
+        }
+}
 …
         // make new owner
         unpark( thrd __cfaabi_dbg_ctx2 );
+        return thrd != 0p;
+}
+bool V(semaphore & this, unsigned diff) with( this ) {
+        $thread * thrd = 0p;
+        lock( lock __cfaabi_dbg_ctx2 );
+        int release = max(-count, (int)diff);
+        count += diff;
+        for(release) {
+                unpark( pop_head( waiting ) __cfaabi_dbg_ctx2 );
+        }
+        unlock( lock );
         return thrd != 0p;

libcfa/src/concurrency/kernel.hfa

-              re3bc51c
+              rbcd74f3
 #include <stdbool.h>
+#include <stdint.h>
 #include "invoke.h"
 …
 void  ?{}(semaphore & this, int count = 1);
 void ^?{}(semaphore & this);
 void   P (semaphore & this);
+bool   P (semaphore & this);
 bool   V (semaphore & this);
+bool   V (semaphore & this, unsigned count);
 …
 //-----------------------------------------------------------------------------
+// I/O
+struct __io_data;
+#define CFA_CLUSTER_IO_POLLER_USER_THREAD 1 << 0
+// #define CFA_CLUSTER_IO_POLLER_KERNEL_SIDE 1 << 1
+//-----------------------------------------------------------------------------
 // Cluster
 struct cluster {
 …
                 cluster * prev;
         } node;
+        struct __io_data * io;
+        #if !defined(__CFA_NO_STATISTICS__)
+                bool print_stats;
+        #endif
 };
 extern Duration default_preemption();
 void ?{} (cluster & this, const char name[], Duration preemption_rate);
+void ?{} (cluster & this, const char name[], Duration preemption_rate, int flags);
 void ^?{}(cluster & this);
+static inline void ?{} (cluster & this)                           { this{"Anonymous Cluster", default_preemption()}; }
+static inline void ?{} (cluster & this, Duration preemption_rate) { this{"Anonymous Cluster", preemption_rate}; }
+static inline void ?{} (cluster & this, const char name[])        { this{name, default_preemption()}; }
+static inline void ?{} (cluster & this)                                      { this{"Anonymous Cluster", default_preemption(), 0}; }
+static inline void ?{} (cluster & this, Duration preemption_rate)            { this{"Anonymous Cluster", preemption_rate, 0}; }
+static inline void ?{} (cluster & this, const char name[])                   { this{name, default_preemption(), 0}; }
+static inline void ?{} (cluster & this, int flags)                           { this{"Anonymous Cluster", default_preemption(), flags}; }
+static inline void ?{} (cluster & this, Duration preemption_rate, int flags) { this{"Anonymous Cluster", preemption_rate, flags}; }
+static inline void ?{} (cluster & this, const char name[], int flags)        { this{name, default_preemption(), flags}; }
 static inline [cluster *&, cluster *& ] __get( cluster & this ) __attribute__((const)) { return this.node.[next, prev]; }
 …
 static inline struct cluster   * active_cluster  () { return TL_GET( this_processor )->cltr; }
+#if !defined(__CFA_NO_STATISTICS__)
+        static inline void print_stats_at_exit( cluster & this ) {
+                this.print_stats = true;
+        }
+#endif
 // Local Variables: //
 // mode: c //

libcfa/src/concurrency/kernel_private.hfa

-              re3bc51c
+              rbcd74f3
 extern volatile thread_local __cfa_kernel_preemption_state_t preemption_state __attribute__ ((tls_model ( "initial-exec" )));
+extern cluster * mainCluster;
 //-----------------------------------------------------------------------------
 // Threads
 …
         extern void __cfaabi_dbg_thread_unregister( $thread * thrd );
+)
+// KERNEL ONLY unpark with out disabling interrupts
+void __unpark( $thread * thrd __cfaabi_dbg_ctx_param2 );
+//-----------------------------------------------------------------------------
+// I/O
+void __kernel_io_startup     ( cluster &, int, bool );
+void __kernel_io_finish_start( cluster & );
+void __kernel_io_prepare_stop( cluster & );
+void __kernel_io_shutdown    ( cluster &, bool );
 //-----------------------------------------------------------------------------

libcfa/src/concurrency/preemption.cfa

-              re3bc51c
+              rbcd74f3
 // FwdDeclarations : Signal handlers
 static void sigHandler_ctxSwitch( __CFA_SIGPARMS__ );
+static void sigHandler_alarm    ( __CFA_SIGPARMS__ );
 static void sigHandler_segv     ( __CFA_SIGPARMS__ );
 static void sigHandler_ill      ( __CFA_SIGPARMS__ );
 …
 // Get next expired node
 static inline alarm_node_t * get_expired( alarm_list_t * alarms, Time currtime ) {
         if( !alarms->head ) return 0p;                                          // If no alarms return null
         if( alarms->head->alarm >= currtime ) return 0p;        // If alarms head not expired return null
+        if( ! & (*alarms)`first ) return 0p;                                            // If no alarms return null
+        if( (*alarms)`first.alarm >= currtime ) return 0p;      // If alarms head not expired return null
         return pop(alarms);                                                                     // Otherwise just pop head
+}
 …
         while( node = get_expired( alarms, currtime ) ) {
                 // __cfaabi_dbg_print_buffer_decl( " KERNEL: preemption tick.\n" );
+                Duration period = node->period;
+                if( period == 0) {
+                        node->set = false;                  // Node is one-shot, just mark it as not pending
+                }
                 // Check if this is a kernel
 …
                 // Check if this is a periodic alarm
-                Duration period = node->period;
                 if( period > 0 ) {
                         // __cfaabi_dbg_print_buffer_local( " KERNEL: alarm period is %lu.\n", period.tv );
 …
                         insert( alarms, node );             // Reinsert the node for the next time it triggers
+                }
-                else {
-                        node->set = false;                  // Node is one-shot, just mark it as not pending
+                }
+        }
         // If there are still alarms pending, reset the timer
         if( alarms->head ) {
+        if( & (*alarms)`first ) {
                 __cfaabi_dbg_print_buffer_decl( " KERNEL: @%ju(%ju) resetting alarm to %ju.\n", currtime.tv, __kernel_get_time().tv, (alarms->head->alarm - currtime).tv);
                 Duration delta = alarms->head->alarm - currtime;
                 Duration caped = max(delta, 50`us);
+                Duration delta = (*alarms)`first.alarm - currtime;
+                Duration capped = max(delta, 50`us);
                 // itimerval tim  = { caped };
                 // __cfaabi_dbg_print_buffer_local( "    Values are %lu, %lu, %lu %lu.\n", delta.tv, caped.tv, tim.it_value.tv_sec, tim.it_value.tv_usec);
                 __kernel_set_timer( caped );
+                __kernel_set_timer( capped );
+        }
+}
 …
         if ( pthread_sigmask( SIG_BLOCK, &mask, 0p ) == -1 ) {
             abort( "internal error, pthread_sigmask" );
+                abort( "internal error, pthread_sigmask" );
+        }
+}
 …
 // reserved for future use
 static void timeout( $thread * this ) {
         //TODO : implement waking threads
+        __unpark( this __cfaabi_dbg_ctx2 );
+}
 …
         // Setup proper signal handlers
         __cfaabi_sigaction( SIGUSR1, sigHandler_ctxSwitch, SA_SIGINFO | SA_RESTART ); // __cfactx_switch handler
+        __cfaabi_sigaction( SIGALRM, sigHandler_alarm    , SA_SIGINFO | SA_RESTART ); // debug handler
         signal_block( SIGALRM );
 …
         force_yield( __ALARM_PREEMPTION ); // Do the actual __cfactx_switch
+}
+static void sigHandler_alarm( __CFA_SIGPARMS__ ) {
+        abort("SIGALRM should never reach the signal handler");
+}

libcfa/src/concurrency/thread.hfa

-              re3bc51c
+              rbcd74f3
+}
+//----------
+// sleep: force thread to block and be rescheduled after Duration duration
+void sleep( Duration duration );
 // Local Variables: //
 // mode: c //

libcfa/src/exception.c

-              re3bc51c
+              rbcd74f3
 // Created On       : Mon Jun 26 15:13:00 2017
 // Last Modified By : Andrew Beach
 // Last Modified On : Fri Apr 03 11:57:00 2020
 // Update Count     : 14
+// Last Modified On : Tue Apr 14 12:01:00 2020
+// Update Count     : 18
 //
 …
 #include <unwind.h>
 #include <bits/debug.hfa>
+#include "stdhdr/assert.h"
 // FIX ME: temporary hack to keep ARM build working
 …
 // RESUMPTION ================================================================
+static void reset_top_resume(struct __cfaehm_try_resume_node ** store) {
+        this_exception_context()->top_resume = *store;
+}
 void __cfaehm_throw_resume(exception_t * except) {
         struct exception_context_t * context = this_exception_context();
+        __cfaabi_dbg_print_safe("Throwing resumption exception\n");
+        __cfadbg_print_safe(exception, "Throwing resumption exception\n");
+        __attribute__((cleanup(reset_top_resume)))
         struct __cfaehm_try_resume_node * original_head = context->top_resume;
         struct __cfaehm_try_resume_node * current = context->top_resume;
 …
                 context->top_resume = current->next;
                 if (current->handler(except)) {
-                        context->top_resume = original_head;
                         return;
+                }
+        }
+        __cfaabi_dbg_print_safe("Unhandled exception\n");
+        context->top_resume = original_head;
+        __cfadbg_print_safe(exception, "Unhandled exception\n");
         // Fall back to termination:
 …
         struct exception_context_t * context = this_exception_context();
         __cfaabi_dbg_print_safe("Deleting Exception\n");
+        __cfadbg_print_safe(exception, "Deleting Exception\n");
         // Remove the exception from the list.
 …
                 struct _Unwind_Context * unwind_context,
                 void * stop_param) {
+        if ( actions & _UA_END_OF_STACK  ) exit(1);
+        if ( actions & _UA_CLEANUP_PHASE ) return _URC_NO_REASON;
+        return _URC_FATAL_PHASE2_ERROR;
+        // Verify actions follow the rules we expect.
+        verify((actions & _UA_CLEANUP_PHASE) && (actions & _UA_FORCE_UNWIND));
+        verify(!(actions & (_UA_SEARCH_PHASE | _UA_HANDER_FRAME)));
+        if ( actions & _UA_END_OF_STACK ) {
+                exit(1);
+        } else {
+                return _URC_NO_REASON;
+        }
+}
 …
 void __cfaehm_throw_terminate( exception_t * val ) {
         __cfaabi_dbg_print_safe("Throwing termination exception\n");
+        __cfadbg_print_safe(exception, "Throwing termination exception\n");
         __cfaehm_allocate_exception( val );
 …
 void __cfaehm_rethrow_terminate(void) {
         __cfaabi_dbg_print_safe("Rethrowing termination exception\n");
+        __cfadbg_print_safe(exception, "Rethrowing termination exception\n");
         __cfaehm_begin_unwind();
 …
+{
         //__cfaabi_dbg_print_safe("CFA: 0x%lx\n", _Unwind_GetCFA(context));
         __cfaabi_dbg_print_safe("Personality function (%d, %x, %llu, %p, %p):",
+        //__cfadbg_print_safe(exception, "CFA: 0x%lx\n", _Unwind_GetCFA(context));
+        __cfadbg_print_safe(exception, "Personality function (%d, %x, %llu, %p, %p):",
                         version, actions, exception_class, unwind_exception, unwind_context);
+        // If we've reached the end of the stack then there is nothing much we can do...
+        if (actions & _UA_END_OF_STACK) return _URC_END_OF_STACK;
+        // Verify that actions follow the rules we expect.
+        // This function should never be called at the end of the stack.
+        verify(!(actions & _UA_END_OF_STACK));
+        // Either only the search phase flag is set or...
         if (actions & _UA_SEARCH_PHASE) {
+                __cfaabi_dbg_print_safe(" lookup phase");
+        }
+        else if (actions & _UA_CLEANUP_PHASE) {
+                __cfaabi_dbg_print_safe(" cleanup phase");
+        }
+        // Just in case, probably can't actually happen
+        else {
+                printf(" error\n");
+                return _URC_FATAL_PHASE1_ERROR;
+                verify(actions == _UA_SEARCH_PHASE);
+                __cfadbg_print_safe(exception, " lookup phase");
+        // ... we are in clean-up phase.
+        } else {
+                verify(actions & _UA_CLEANUP_PHASE);
+                __cfadbg_print_safe(exception, " cleanup phase");
+                // We shouldn't be the handler frame during forced unwind.
+                if (actions & _UA_HANDLER_FRAME) {
+                        verify(!(actions & _UA_FORCE_UNWIND));
+                        __cfadbg_print_safe(exception, " (handler frame)");
+                } else if (actions & _UA_FORCE_UNWIND) {
+                        __cfadbg_print_safe(exception, " (force unwind)");
+                }
+        }
 …
                         void * ep = (void*)lsd_info.Start + callsite_start + callsite_len;
                         void * ip = (void*)instruction_ptr;
                         __cfaabi_dbg_print_safe("\nfound %p - %p (%p, %p, %p), looking for %p\n",
+                        __cfadbg_print_safe(exception, "\nfound %p - %p (%p, %p, %p), looking for %p\n",
                                         bp, ep, ls, cs, cl, ip);
 #endif // __CFA_DEBUG_PRINT__
 …
                 if ( 0 == callsite_landing_pad ) {
                         // Nothing to do, move along
                         __cfaabi_dbg_print_safe(" no landing pad");
+                        __cfadbg_print_safe(exception, " no landing pad");
                 } else if (actions & _UA_SEARCH_PHASE) {
                         // In search phase, these means we found a potential handler we must check.
 …
                                 // Based on the return value, check if we matched the exception
                                 if (ret == _URC_HANDLER_FOUND) {
                                         __cfaabi_dbg_print_safe(" handler found\n");
+                                        __cfadbg_print_safe(exception, " handler found\n");
                                 } else {
                                         __cfaabi_dbg_print_safe(" no handler\n");
+                                        __cfadbg_print_safe(exception, " no handler\n");
+                                }
                                 return ret;
 …
                         // This is only a cleanup handler, ignore it
                         __cfaabi_dbg_print_safe(" no action");
                 } else if (actions & _UA_CLEANUP_PHASE) {
+                        __cfadbg_print_safe(exception, " no action");
+                } else {
                         // In clean-up phase, no destructors here but this could be the handler.
 …
                         _Unwind_SetIP( unwind_context, ((lsd_info.LPStart) + (callsite_landing_pad)) );
                         __cfaabi_dbg_print_safe(" action\n");
+                        __cfadbg_print_safe(exception, " action\n");
                         // Return have some action to run
 …
+        }
         // No handling found
         __cfaabi_dbg_print_safe(" table end reached\n");
+        __cfadbg_print_safe(exception, " table end reached");
         UNWIND:
         __cfaabi_dbg_print_safe(" unwind\n");
+        __cfadbg_print_safe(exception, " unwind\n");
         // Keep unwinding the stack
 …
 #pragma GCC push_options
 #pragma GCC optimize("O0")
+#pragma GCC optimize(0)
 // Try statements are hoisted out see comments for details. While this could probably be unique
 …
         "       .quad __gcfa_personality_v0\n"
 #else // then __i386
         "   .long __gcfa_personality_v0\n"
+        "       .long __gcfa_personality_v0\n"
 #endif
 );

libcfa/src/heap.cfa

-              re3bc51c
+              rbcd74f3
 // Created On       : Tue Dec 19 21:58:35 2017
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Wed Apr  1 15:59:53 2020
 // Update Count     : 692
+// Last Modified On : Wed May  6 17:29:26 2020
+// Update Count     : 727
 //
 …
 #include <errno.h>                                                                              // errno
 #include <string.h>                                                                             // memset, memcpy
+#include <limits.h>                                                                             // ULONG_MAX
 extern "C" {
 #include <sys/mman.h>                                                                   // mmap, munmap
 } // extern "C"
-// #comment TD : Many of these should be merged into math I believe
 #include "bits/align.hfa"                                                               // libPow2
 #include "bits/defs.hfa"                                                                // likely, unlikely
 …
 //#include "stdlib.hfa"                                                                 // bsearchl
 #include "malloc.h"
+#include "bitmanip.hfa"                                                                 // ceiling
 #define MIN(x, y) (y > x ? x : y)
 …
 };
+size_t default_heap_expansion() __attribute__(( weak )) {
+        return __CFA_DEFAULT_HEAP_EXPANSION__;
+} // default_heap_expansion
 size_t default_mmap_start() __attribute__(( weak )) {
         return __CFA_DEFAULT_MMAP_START__;
 } // default_mmap_start
-size_t default_heap_expansion() __attribute__(( weak )) {
-        return __CFA_DEFAULT_HEAP_EXPANSION__;
-} // default_heap_expansion
 …
                                 } fake; // FakeHeader
                         } kind; // Kind
                         uint32_t dimension;                                                     // used by calloc-like to remember number of array elements
+                        size_t size;                                                            // allocation size in bytes
                 } header; // Header
                 char pad[libAlign() - sizeof( Header )];
 …
 static unsigned long long int free_storage;
 static unsigned int free_calls;
+static unsigned long long int aalloc_storage;
+static unsigned int aalloc_calls;
 static unsigned long long int calloc_storage;
 static unsigned int calloc_calls;
 static unsigned long long int memalign_storage;
 static unsigned int memalign_calls;
+static unsigned long long int amemalign_storage;
+static unsigned int amemalign_calls;
 static unsigned long long int cmemalign_storage;
 static unsigned int cmemalign_calls;
 …
 // Use "write" because streams may be shutdown when calls are made.
 static void printStats() {
         char helpText[512];
+        char helpText[1024];
         __cfaabi_bits_print_buffer( STDERR_FILENO, helpText, sizeof(helpText),
                                                                         "\nHeap statistics:\n"
                                                                         "  malloc: calls %u / storage %llu\n"
+                                                                        "  aalloc: calls %u / storage %llu\n"
                                                                         "  calloc: calls %u / storage %llu\n"
                                                                         "  memalign: calls %u / storage %llu\n"
+                                                                        "  amemalign: calls %u / storage %llu\n"
                                                                         "  cmemalign: calls %u / storage %llu\n"
                                                                         "  resize: calls %u / storage %llu\n"
 …
                                                                         "  sbrk: calls %u / storage %llu\n",
                                                                         malloc_calls, malloc_storage,
+                                                                        aalloc_calls, calloc_storage,
                                                                         calloc_calls, calloc_storage,
                                                                         memalign_calls, memalign_storage,
+                                                                        amemalign_calls, amemalign_storage,
                                                                         cmemalign_calls, cmemalign_storage,
                                                                         resize_calls, resize_storage,
 …
 static int printStatsXML( FILE * stream ) {                             // see malloc_info
         char helpText[512];
+        char helpText[1024];
         int len = snprintf( helpText, sizeof(helpText),
                                                 "<malloc version=\"1\">\n"
 …
                                                 "</sizes>\n"
                                                 "<total type=\"malloc\" count=\"%u\" size=\"%llu\"/>\n"
+                                                "<total type=\"aalloc\" count=\"%u\" size=\"%llu\"/>\n"
                                                 "<total type=\"calloc\" count=\"%u\" size=\"%llu\"/>\n"
                                                 "<total type=\"memalign\" count=\"%u\" size=\"%llu\"/>\n"
+                                                "<total type=\"amemalign\" count=\"%u\" size=\"%llu\"/>\n"
                                                 "<total type=\"cmemalign\" count=\"%u\" size=\"%llu\"/>\n"
                                                 "<total type=\"resize\" count=\"%u\" size=\"%llu\"/>\n"
 …
                                                 "</malloc>",
                                                 malloc_calls, malloc_storage,
+                                                aalloc_calls, aalloc_storage,
                                                 calloc_calls, calloc_storage,
                                                 memalign_calls, memalign_storage,
+                                                amemalign_calls, amemalign_storage,
                                                 cmemalign_calls, cmemalign_storage,
                                                 resize_calls, resize_storage,
 …
-static inline bool setHeapExpand( size_t value ) {
-  if ( heapExpand < pageSize ) return true;
-        heapExpand = value;
-        return false;
-} // setHeapExpand
 // thunk problem
 size_t Bsearchl( unsigned int key, const unsigned int * vals, size_t dim ) {
 …
 static inline bool setMmapStart( size_t value ) {               // true => mmapped, false => sbrk
   if ( value < pageSize || bucketSizes[NoBucketSizes - 1] < value ) return true;
+  if ( value < pageSize || bucketSizes[NoBucketSizes - 1] < value ) return false;
         mmapStart = value;                                                                      // set global
 …
         assert( maxBucketsUsed < NoBucketSizes );                       // subscript failure ?
         assert( mmapStart <= bucketSizes[maxBucketsUsed] ); // search failure ?
         return false;
+        return true;
 } // setMmapStart
 …
         #ifdef __CFA_DEBUG__
         checkHeader( addr < heapBegin || header < (HeapManager.Storage.Header *)heapBegin, name, addr ); // bad low address ?
+        checkHeader( addr < heapBegin, name, addr );            // bad low address ?
         #endif // __CFA_DEBUG__
 …
         // along with the block and is a multiple of the alignment size.
   if ( unlikely( size > ~0ul - sizeof(HeapManager.Storage) ) ) return 0p;
+  if ( unlikely( size > ULONG_MAX - sizeof(HeapManager.Storage) ) ) return 0p;
         size_t tsize = size + sizeof(HeapManager.Storage);
         if ( likely( tsize < mmapStart ) ) {                            // small size => sbrk
 …
                 block->header.kind.real.home = freeElem;                // pointer back to free list of apropriate size
         } else {                                                                                        // large size => mmap
   if ( unlikely( size > ~0ul - pageSize ) ) return 0p;
+  if ( unlikely( size > ULONG_MAX - pageSize ) ) return 0p;
                 tsize = libCeiling( tsize, pageSize );                  // must be multiple of page size
                 #ifdef __STATISTICS__
 …
         } // if
+        block->header.size = size;                                                      // store allocation size
         void * addr = &(block->data);                                           // adjust off header to user bytes
 …
         #endif // FASTLOOKUP
         if ( setMmapStart( default_mmap_start() ) ) {
+        if ( ! setMmapStart( default_mmap_start() ) ) {
                 abort( "HeapManager : internal error, mmap start initialization failure." );
         } // if
 …
         char * end = (char *)sbrk( 0 );
+        sbrk( (char *)libCeiling( (long unsigned int)end, libAlign() ) - end ); // move start of heap to multiple of alignment
+        heapBegin = heapEnd = sbrk( 0 );                                        // get new start point
+        heapBegin = heapEnd = sbrk( (char *)libCeiling( (long unsigned int)end, libAlign() ) - end ); // move start of heap to multiple of alignment
 } // HeapManager
 …
         //assert( heapManager.heapBegin != 0 );
         //heapManager{};
         if ( heapManager.heapBegin == 0p ) heapManager{};
+        if ( heapManager.heapBegin == 0p ) heapManager{};       // sanity check
 } // memory_startup
 …
         //assert( heapManager.heapBegin != 0 );
         if ( unlikely( heapManager.heapBegin == 0p ) ) heapManager{}; // called before memory_startup ?
+#if __SIZEOF_POINTER__ == 8
+        verify( size < ((typeof(size_t))1 << 48) );
+#endif // __SIZEOF_POINTER__ == 8
         void * addr = doMalloc( size );
         if ( unlikely( addr == 0p ) ) errno = ENOMEM;           // POSIX
 …
 static inline void * callocNoStats( size_t noOfElems, size_t elemSize ) {
         size_t size = noOfElems * elemSize;
+static inline void * callocNoStats( size_t dim, size_t elemSize ) {
+        size_t size = dim * elemSize;
         char * addr = (char *)mallocNoStats( size );
   if ( unlikely( addr == 0p ) ) return 0p;
 …
                 memset( addr, '\0', bsize - sizeof(HeapManager.Storage) ); // set to zeros
-        assert( noOfElems <= UINT32_MAX );
-        header->dimension = noOfElems;                                          // store number of array elements
         header->kind.real.blockSize |= 2;                                       // mark as zero filled
         return addr;
 …
 static inline void * cmemalignNoStats( size_t alignment, size_t noOfElems, size_t elemSize ) {
         size_t size = noOfElems * elemSize;
+static inline void * cmemalignNoStats( size_t alignment, size_t dim, size_t elemSize ) {
+        size_t size = dim * elemSize;
         char * addr = (char *)memalignNoStats( alignment, size );
   if ( unlikely( addr == 0p ) ) return 0p;
 …
                 memset( addr, '\0', dataStorage( bsize, addr, header ) ); // set to zeros
-        assert( noOfElems <= UINT32_MAX );
-        header->dimension = noOfElems;                                          // store initial array size
         header->kind.real.blockSize |= 2;                                       // mark as zero filled
         return addr;
 …
 extern "C" {
         // Allocates size bytes and returns a pointer to the allocated memory. The memory is not initialized. If size is 0,
         // then malloc() returns either 0p, or a unique pointer value that can later be successfully passed to free().
+        // Allocates size bytes and returns a pointer to the allocated memory.  The contents are undefined. If size is 0,
+        // then malloc() returns a unique pointer value that can later be successfully passed to free().
         void * malloc( size_t size ) {
                 #ifdef __STATISTICS__
 …
         } // malloc
+        // Allocate memory for an array of nmemb elements of size bytes each and returns a pointer to the allocated
+        // memory. The memory is set to zero. If nmemb or size is 0, then calloc() returns either 0p, or a unique pointer
+        // value that can later be successfully passed to free().
+        void * calloc( size_t noOfElems, size_t elemSize ) {
+        // Same as malloc() except size bytes is an array of dim elements each of elemSize bytes.
+        void * aalloc( size_t dim, size_t elemSize ) {
+                #ifdef __STATISTICS__
+                __atomic_add_fetch( &aalloc_calls, 1, __ATOMIC_SEQ_CST );
+                __atomic_add_fetch( &aalloc_storage, dim * elemSize, __ATOMIC_SEQ_CST );
+                #endif // __STATISTICS__
+                return mallocNoStats( dim * elemSize );
+        } // aalloc
+        // Same as aalloc() with memory set to zero.
+        void * calloc( size_t dim, size_t elemSize ) {
                 #ifdef __STATISTICS__
                 __atomic_add_fetch( &calloc_calls, 1, __ATOMIC_SEQ_CST );
                 __atomic_add_fetch( &calloc_storage, noOfElems * elemSize, __ATOMIC_SEQ_CST );
                 #endif // __STATISTICS__
                 return callocNoStats( noOfElems, elemSize );
+                __atomic_add_fetch( &calloc_storage, dim * elemSize, __ATOMIC_SEQ_CST );
+                #endif // __STATISTICS__
+                return callocNoStats( dim, elemSize );
         } // calloc
+        // Change the size of the memory block pointed to by ptr to size bytes. The contents are undefined.  If ptr is 0p,
+        // then the call is equivalent to malloc(size), for all values of size; if size is equal to zero, and ptr is not 0p,
+        // then the call is equivalent to free(ptr). Unless ptr is 0p, it must have been returned by an earlier call to
+        // malloc(), calloc() or realloc(). If the area pointed to was moved, a free(ptr) is done.
+        // Change the size of the memory block pointed to by oaddr to size bytes. The contents are undefined.  If oaddr is
+        // 0p, then the call is equivalent to malloc(size), for all values of size; if size is equal to zero, and oaddr is
+        // not 0p, then the call is equivalent to free(oaddr). Unless oaddr is 0p, it must have been returned by an earlier
+        // call to malloc(), alloc(), calloc() or realloc(). If the area pointed to was moved, a free(oaddr) is done.
         void * resize( void * oaddr, size_t size ) {
                 #ifdef __STATISTICS__
 …
                 size_t bsize, oalign = 0;
                 headers( "resize", oaddr, header, freeElem, bsize, oalign );
                 size_t odsize = dataStorage( bsize, oaddr, header ); // data storage available in bucket
                 // same size, DO NOT preserve STICKY PROPERTIES.
                 if ( oalign == 0 && size <= odsize && odsize <= size * 2 ) { // allow 50% wasted storage for smaller size
+          if ( oalign == 0 && size <= odsize && odsize <= size * 2 ) { // allow 50% wasted storage for smaller size
                         header->kind.real.blockSize &= -2;                      // no alignment and turn off 0 fill
                         return oaddr;
 …
                 // change size, DO NOT preserve STICKY PROPERTIES.
+                free( oaddr );
                 void * naddr = mallocNoStats( size );                   // create new area
-                free( oaddr );
                 return naddr;
         } // resize
         // Same as resize but the contents shall be unchanged in the range from the start of the region up to the minimum of
+        // Same as resize() but the contents are unchanged in the range from the start of the region up to the minimum of
         // the old and new sizes.
         void * realloc( void * oaddr, size_t size ) {
 …
         } // realloc
+        // Allocates size bytes and returns a pointer to the allocated memory. The memory address shall be a multiple of
+        // alignment, which must be a power of two. (obsolete)
+        // Same as malloc() except the memory address is a multiple of alignment, which must be a power of two. (obsolete)
         void * memalign( size_t alignment, size_t size ) {
                 #ifdef __STATISTICS__
 …
+        // Same as aalloc() with memory alignment.
+        void * amemalign( size_t alignment, size_t dim, size_t elemSize ) {
+                #ifdef __STATISTICS__
+                __atomic_add_fetch( &cmemalign_calls, 1, __ATOMIC_SEQ_CST );
+                __atomic_add_fetch( &cmemalign_storage, dim * elemSize, __ATOMIC_SEQ_CST );
+                #endif // __STATISTICS__
+                return memalignNoStats( alignment, dim * elemSize );
+        } // amemalign
         // Same as calloc() with memory alignment.
         void * cmemalign( size_t alignment, size_t noOfElems, size_t elemSize ) {
+        void * cmemalign( size_t alignment, size_t dim, size_t elemSize ) {
                 #ifdef __STATISTICS__
                 __atomic_add_fetch( &cmemalign_calls, 1, __ATOMIC_SEQ_CST );
                 __atomic_add_fetch( &cmemalign_storage, noOfElems * elemSize, __ATOMIC_SEQ_CST );
                 #endif // __STATISTICS__
                 return cmemalignNoStats( alignment, noOfElems, elemSize );
+                __atomic_add_fetch( &cmemalign_storage, dim * elemSize, __ATOMIC_SEQ_CST );
+                #endif // __STATISTICS__
+                return cmemalignNoStats( alignment, dim, elemSize );
         } // cmemalign
 …
         // Frees the memory space pointed to by ptr, which must have been returned by a previous call to malloc(), calloc()
         // or realloc().  Otherwise, or if free(ptr) has already been called before, undefined behavior occurs. If ptr is
+        // or realloc().  Otherwise, or if free(ptr) has already been called before, undefined behaviour occurs. If ptr is
         // 0p, no operation is performed.
         void free( void * addr ) {
 …
         // Returns the alignment of the allocation.
+        // Returns the alignment of an allocation.
         size_t malloc_alignment( void * addr ) {
           if ( unlikely( addr == 0p ) ) return libAlign();      // minimum alignment
 …
         } // malloc_alignment
+        // Returns true if the allocation is zero filled, i.e., initially allocated by calloc().
+        // Set the alignment for an the allocation and return previous alignment or 0 if no alignment.
+        size_t $malloc_alignment_set( void * addr, size_t alignment ) {
+          if ( unlikely( addr == 0p ) ) return libAlign();      // minimum alignment
+                size_t ret;
+                HeapManager.Storage.Header * header = headerAddr( addr );
+                if ( (header->kind.fake.alignment & 1) == 1 ) { // fake header ?
+                        ret = header->kind.fake.alignment & -2;         // remove flag from old value
+                        header->kind.fake.alignment = alignment | 1; // add flag to new value
+                } else {
+                        ret = 0;                                                                        // => no alignment to change
+                } // if
+                return ret;
+        } // $malloc_alignment_set
+        // Returns true if the allocation is zero filled, e.g., allocated by calloc().
         bool malloc_zero_fill( void * addr ) {
           if ( unlikely( addr == 0p ) ) return false;           // null allocation is not zero fill
 …
                         header = realHeader( header );                          // backup from fake to real header
                 } // if
                 return (header->kind.real.blockSize & 2) != 0;  // zero filled (calloc/cmemalign) ?
+                return (header->kind.real.blockSize & 2) != 0;  // zero filled ?
         } // malloc_zero_fill
+        // Returns number of elements if the allocation is for an array, i.e., by calloc().
+        size_t malloc_dimension( void * addr ) {
+        // Set allocation is zero filled and return previous zero filled.
+        bool $malloc_zero_fill_set( void * addr ) {
           if ( unlikely( addr == 0p ) ) return false;           // null allocation is not zero fill
                 HeapManager.Storage.Header * header = headerAddr( addr );
 …
                         header = realHeader( header );                          // backup from fake to real header
                 } // if
+                return header->dimension;                                               // array (calloc/cmemalign)
+        } // malloc_zero_fill
+                bool ret = (header->kind.real.blockSize & 2) != 0; // zero filled ?
+                header->kind.real.blockSize |= 2;                               // mark as zero filled
+                return ret;
+        } // $malloc_zero_fill_set
+        // Returns original total allocation size (not bucket size) => array size is dimension * sizeif(T).
+        size_t malloc_size( void * addr ) {
+          if ( unlikely( addr == 0p ) ) return false;           // null allocation is not zero fill
+                HeapManager.Storage.Header * header = headerAddr( addr );
+                if ( (header->kind.fake.alignment & 1) == 1 ) { // fake header ?
+                        header = realHeader( header );                          // backup from fake to real header
+                } // if
+                return header->size;
+        } // malloc_size
+        // Set allocation size and return previous size.
+        size_t $malloc_size_set( void * addr, size_t size ) {
+          if ( unlikely( addr == 0p ) ) return false;           // null allocation is not zero fill
+                HeapManager.Storage.Header * header = headerAddr( addr );
+                if ( (header->kind.fake.alignment & 1) == 1 ) { // fake header ?
+                        header = realHeader( header );                          // backup from fake to real header
+                } // if
+                size_t ret = header->size;
+                header->size = size;
+                return ret;
+        } // $malloc_size_set
 …
         // Adjusts parameters that control the behavior of the memory-allocation functions (see malloc). The param argument
+        // Adjusts parameters that control the behaviour of the memory-allocation functions (see malloc). The param argument
         // specifies the parameter to be modified, and value specifies the new value for that parameter.
         int mallopt( int option, int value ) {
                 choose( option ) {
                   case M_TOP_PAD:
                         if ( setHeapExpand( value ) ) return 1;
+                        heapExpand = ceiling( value, pageSize ); return 1;
                   case M_MMAP_THRESHOLD:
                         if ( setMmapStart( value ) ) return 1;
+                        break;
                 } // switch
                 return 0;                                                                               // error, unsupported

libcfa/src/interpose.cfa

re3bc51c	rbcd74f3
15	15
16	16	#include <stdarg.h> // va_start, va_end
	17	#include <stdio.h>
17	18	#include <string.h> // strlen
18	19	#include <unistd.h> // _exit, getpid

libcfa/src/iostream.cfa

-              re3bc51c
+              rbcd74f3
 // Created On       : Wed May 27 17:56:53 2015
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Wed Mar 11 14:35:35 2020
 // Update Count     : 860
+// Last Modified On : Sat May  2 18:30:25 2020
+// Update Count     : 1017
 //
 …
 #include <complex.h>                                                                    // creal, cimag
 } // extern "C"
+#include <bitmanip.hfa>                                                                 // fms
 …
+\
                 if ( f.base == 'b' || f.base == 'B' ) {                 /* bespoke binary format */ \
+                        int bits;                                                                                                       \
+                        if ( f.val == (T){0} ) bits = 1;                        /* force at least one bit to print */ \
+                        else bits = sizeof(long long int) * 8 - __builtin_clzll( f.val ); /* position of most significant bit */ \
+                        bits = bits > sizeof(f.val) * 8 ? sizeof(f.val) * 8 : bits; \
+                        int spaces = f.wd - bits;                                       /* can be negative */ \
+                        if ( ! f.flags.nobsdp ) { spaces -= 2; }        /* base prefix takes space */ \
+                        /* printf( "%d %d\n", bits, spaces ); */ \
+                        int bits = high1( f.val );                                      /* position of most significant bit */ \
+                        if ( bits == 0 ) bits = 1;                                      /* 0 value => force one bit to print */ \
+                        int spaces; \
                         if ( ! f.flags.left ) {                                         /* right justified ? */ \
                                 /* Note, base prefix then zero padding or spacing then prefix. */ \
+                                if ( f.flags.pad0 || f.flags.pc ) { \
+                                if ( f.flags.pc ) { \
+                                        spaces = f.wd - f.pc; \
+                                        if ( ! f.flags.nobsdp ) { spaces -= 2; } /* base prefix takes space */ \
+                                        if ( spaces > 0 ) fmt( os, "%*s", spaces, " " ); /* space pad */ \
                                         if ( ! f.flags.nobsdp ) { fmt( os, "0%c", f.base ); } \
                                         if ( f.flags.pc ) spaces = f.pc - bits; \
+                                        spaces = f.pc - bits; \
                                         if ( spaces > 0 ) fmt( os, "%0*d", spaces, 0 ); /* zero pad */ \
                                 } else { \
+                                        if ( spaces > 0 ) fmt( os, "%*s", spaces, " " ); /* space pad */ \
+                                        if ( ! f.flags.nobsdp ) { fmt( os, "0%c", f.base ); } \
+                                        spaces = f.wd - bits; \
+                                        if ( ! f.flags.nobsdp ) { spaces -= 2; } /* base prefix takes space */ \
+                                        if ( f.flags.pad0 ) { \
+                                                if ( ! f.flags.nobsdp ) { fmt( os, "0%c", f.base ); } \
+                                                if ( spaces > 0 ) fmt( os, "%0*d", spaces, 0 ); /* zero pad */ \
+                                        } else { \
+                                                if ( spaces > 0 ) fmt( os, "%*s", spaces, " " ); /* space pad */ \
+                                                if ( ! f.flags.nobsdp ) { fmt( os, "0%c", f.base ); } \
+                                        } /* if */ \
                                 } /* if */ \
+                        } else if ( ! f.flags.nobsdp ) { \
+                                fmt( os, "0%c", f.base ); \
+                        } else { \
+                                if ( ! f.flags.nobsdp ) fmt( os, "0%c", f.base ); \
+                                if ( f.flags.pc ) { \
+                                        spaces = f.pc - bits; \
+                                        if ( spaces > 0 ) fmt( os, "%0*d", spaces, 0 ); /* zero pad */ \
+                                        spaces = f.wd - f.pc; \
+                                } else { /* pad0 flag ignored with left flag */ \
+                                        spaces = f.wd - bits; \
+                                } /* if */ \
+                                if ( ! f.flags.nobsdp ) { spaces -= 2; } /* base prefix takes space */ \
                         } /* if */ \
                         int shift = (bits - 1) / 4 * 4; /* floor( bits - 1, 4 ) */ \
+                        int shift = floor( bits - 1, 4 ); \
                         typeof( f.val ) temp = f.val; \
                         fmt( os, "%s", shortbin[(temp >> shift) & 0xf] ); \
 …
                                 fmt2.flags.pad0 = fmt2.flags.nobsdp = true;     \
                                 if ( f.base == 'b' | f.base == 'B' ) { \
+                                        if ( f.wd > 64 ) fmt.wd = f.wd - 64; \
+                                        if ( f.flags.pc && f.pc > 64 ) fmt.pc = f.pc - 64; \
+                                        fmt2.wd = 64; \
+                                        if ( fmt.flags.pc && fmt.pc > 64 ) fmt.pc -= 64; else { fmt.flags.pc = false; fmt.pc = 0; } \
+                                        if ( fmt.flags.left ) { \
+                                                fmt.flags.left = false; \
+                                                fmt.wd = 0; \
+                                                /* printf( "L %llo %llo %llo %d %d '%c' %x\n", msig, lsig, fmt.val, fmt.wd, fmt.pc, fmt.base, fmt.all ); */ \
+                                                fmt2.flags.left = true; \
+                                                int msigd = high1( msig ); \
+                                                fmt2.wd = f.wd - (fmt.pc > msigd ? fmt.pc : msigd); \
+                                                if ( ! fmt.flags.nobsdp ) fmt2.wd -= 2; /* compensate for 0b base specifier */ \
+                                                if ( (int)fmt2.wd < 64 ) fmt2.wd = 64; /* cast deals with negative value */ \
+                                                fmt2.flags.pc = true; fmt2.pc = 64; \
+                                        } else { \
+                                                if ( fmt.wd > 64 ) fmt.wd -= 64; \
+                                                else fmt.wd = 1; \
+                                                /* printf( "R %llo %llo %llo %d %d '%c' %x\n", msig, lsig, fmt.val, fmt.wd, fmt.pc, fmt.base, fmt.all ); */ \
+                                                fmt2.wd = 64; \
+                                        } /* if */ \
+                                        /* printf( "C %llo %d %d '%c' %x\n", fmt2.val, fmt2.wd, fmt2.pc, fmt2.base, fmt2.all ); */ \
                                         (ostype &)(os | fmt | "" | fmt2); \
                                 } else if ( f.base == 'o' ) { \
+                                        if ( fmt.flags.pc && fmt.pc > 22 ) fmt.pc -= 22; else { fmt.flags.pc = false; fmt.pc = 0; } \
                                         fmt.val = (unsigned long long int)fmt.val >> 2; \
+                                        if ( f.wd > 21 ) fmt.wd = f.wd - 21; \
+                                        if ( f.flags.pc && f.pc > 21 ) fmt.pc = f.pc - 21; \
+                                        fmt2.wd = 1; \
+                                        fmt2.val = ((msig & 0x3) << 1) + 1; \
+                                        (ostype &)(os | fmt | "" | fmt2); \
+                                        sepOff( os ); \
+                                        fmt2.wd = 21; \
+                                        fmt2.val = lsig & 0x7fffffffffffffff; \
+                                        fmt2.val = ((msig & 0x3) << 1) + ((lsig & 0x8000000000000000U) != 0); \
+                                        if ( fmt.flags.left ) { \
+                                                fmt.flags.left = false; \
+                                                fmt.wd = 0; \
+                                                /* printf( "L %llo %llo %llo %d %d '%c' %x %llo %d %d '%c' %x\n", msig, lsig, fmt.val, fmt.wd, fmt.pc, fmt.base, fmt.all, fmt2.val, fmt2.wd, fmt2.pc, fmt2.base, fmt2.all ); */ \
+                                                (ostype &)(os | fmt | "" | fmt2); \
+                                                sepOff( os ); \
+                                                fmt2.flags.left = true; \
+                                                int msigd = ceiling( high1( fmt.val ), 3 ); \
+                                                fmt2.wd = f.wd - (fmt.pc > msigd ? fmt.pc : msigd); \
+                                                if ( ! fmt.flags.nobsdp ) fmt2.wd -= 1; /* compensate for 0 base specifier */ \
+                                                if ( (int)fmt2.wd < 21 ) fmt2.wd = 21; /* cast deals with negative value */ \
+                                                fmt2.flags.pc = true; fmt2.pc = 21; \
+                                        } else { \
+                                                if ( fmt.wd > 22 ) fmt.wd -= 22; \
+                                                else fmt.wd = 1; \
+                                                /* printf( "R %llo %llo %llo %d %d '%c' %x %llo %d %d '%c' %x\n", msig, lsig, fmt.val, fmt.wd, fmt.pc, fmt.base, fmt.all, fmt2.val, fmt2.wd, fmt2.pc, fmt2.base, fmt2.all ); */ \
+                                                (ostype &)(os | fmt | "" | fmt2); \
+                                                sepOff( os ); \
+                                                fmt2.wd = 21; \
+                                        } /* if */ \
+                                        fmt2.val = lsig & 0x7fffffffffffffffU; \
+                                        /* printf( "\nC %llo %d %d '%c' %x\n", fmt2.val, fmt2.wd, fmt2.pc, fmt2.base, fmt2.all ); */ \
                                         (ostype &)(os | fmt2); \
+                                } else { \
+                                        if ( f.flags.left ) { \
+                                                if ( f.wd > 16 ) fmt2.wd = f.wd - 16; \
+                                                fmt.wd = 16; \
+                                } else { /* f.base == 'x'  | f.base == 'X' */ \
+                                        if ( fmt.flags.pc && fmt.pc > 16 ) fmt.pc -= 16; else { fmt.flags.pc = false; fmt.pc = 0; } \
+                                        if ( fmt.flags.left ) { \
+                                                fmt.flags.left = false; \
+                                                fmt.wd = 0; \
+                                                /* printf( "L %llo %llo %llo %d %d '%c' %x\n", msig, lsig, fmt.val, fmt.wd, fmt.pc, fmt.base, fmt.all ); */ \
+                                                fmt2.flags.left = true; \
+                                                int msigd = high1( msig ); \
+                                                fmt2.wd = f.wd - (fmt.pc > msigd ? fmt.pc : msigd); \
+                                                if ( ! fmt.flags.nobsdp ) fmt2.wd -= 2; /* compensate for 0x base specifier */ \
+                                                if ( (int)fmt2.wd < 16 ) fmt2.wd = 16; /* cast deals with negative value */ \
+                                                fmt2.flags.pc = true; fmt2.pc = 16; \
                                         } else { \
+                                                if ( f.wd > 16 ) fmt.wd = f.wd - 16; \
+                                                if ( f.flags.pc && f.pc > 16 ) fmt.pc = f.pc - 16; \
+                                                if ( fmt.wd > 16 ) fmt.wd -= 16; \
+                                                else fmt.wd = 1; \
+                                                /* printf( "R %llo %llo %llo %d %d '%c' %x\n", msig, lsig, fmt.val, fmt.wd, fmt.pc, fmt.base, fmt.all ); */ \
                                                 fmt2.wd = 16; \
                                         } /* if */ \
+                                        /* printf( "C %llo %d %d '%c' %x\n", fmt2.val, fmt2.wd, fmt2.pc, fmt2.base, fmt2.all ); */ \
                                         (ostype &)(os | fmt | "" | fmt2); \
                                 } /* if */ \

libcfa/src/startup.cfa

-              re3bc51c
+              rbcd74f3
 //
+#include <time.h>                                                                               // tzset
+#include <time.h>                // tzset
+#include <locale.h>        // setlocale
 #include "startup.hfa"
 …
     void __cfaabi_appready_startup( void ) {
                 tzset();                                                                                // initialize time global variables
+                setlocale(LC_NUMERIC, "");
                 #ifdef __CFA_DEBUG__
                 extern void heapAppStart();

libcfa/src/stdhdr/malloc.h

-              re3bc51c
+              rbcd74f3
 // Created On       : Thu Jul 20 15:58:16 2017
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Sun Mar  8 10:01:20 2020
 // Update Count     : 11
+// Last Modified On : Thu Apr 16 22:44:06 2020
+// Update Count     : 13
 //
 …
 extern "C" {
+void * aalloc( size_t noOfElems, size_t elemSize );
+void * amemalign( size_t alignment, size_t noOfElems, size_t elemSize );
 void * cmemalign( size_t alignment, size_t noOfElems, size_t elemSize );
 size_t malloc_alignment( void * );
 bool malloc_zero_fill( void * );
 size_t malloc_dimension( void * );
+size_t malloc_size( void * );
 int malloc_stats_fd( int fd );
 } // extern "C"

libcfa/src/stdlib.cfa

-              re3bc51c
+              rbcd74f3
 // Created On       : Thu Jan 28 17:10:29 2016
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Tue Mar 31 13:26:46 2020
 // Update Count     : 495
+// Last Modified On : Thu Apr 16 22:43:33 2020
+// Update Count     : 498
 //
 …
         } // alloc_set
         T * alloc_set( T ptr[], size_t dim, T fill ) { // realloc array with fill
+        T * alloc_set( T ptr[], size_t dim, T fill ) {          // realloc array with fill
                 size_t olen = malloc_usable_size( ptr );                // current allocation
                 void * nptr = (void *)realloc( (void *)ptr, dim * sizeof(T) ); // C realloc
                 size_t nlen = malloc_usable_size( nptr );               // new allocation
                 if ( nlen > olen ) {                                                    // larger ?
+                        for ( i; dim ) { memcpy( &ptr[i], &fill, sizeof(T) ); } // initialize with fill value
+                        for ( i; malloc_size( ptr ) / sizeof(T) ~ dim ) {
+                                memcpy( &ptr[i], &fill, sizeof(T) );    // initialize with fill value
+                        } // for
                 } // if
                 return (T *)nptr;

libcfa/src/stdlib.hfa

-              re3bc51c
+              rbcd74f3
 // Created On       : Thu Jan 28 17:12:35 2016
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Wed Apr  1 18:38:41 2020
 // Update Count     : 429
+// Last Modified On : Thu Apr 16 22:44:05 2020
+// Update Count     : 432
 //
 …
         void * memalign( size_t align, size_t size );           // malloc.h
         size_t malloc_usable_size( void * ptr );                        // malloc.h
+        size_t malloc_size( void * addr );                                      // CFA heap
         void * cmemalign( size_t alignment, size_t noOfElems, size_t elemSize ); // CFA heap
         void * memset( void * dest, int fill, size_t size ); // string.h

src/CompilationState.cc

re3bc51c	rbcd74f3
27	27	nopreludep = false,
28	28	genproto = false,
	29	deterministic_output = false,
29	30	nomainp = false,
30	31	parsep = false,

src/CompilationState.h

re3bc51c	rbcd74f3
28	28	nopreludep,
29	29	genproto,
	30	deterministic_output,
30	31	nomainp,
31	32	parsep,

src/Parser/parser.yy

-              re3bc51c
+              rbcd74f3
 // Created On       : Sat Sep  1 20:22:55 2001
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Fri Mar  6 17:26:45 2020
 // Update Count     : 4474
+// Last Modified On : Mon Apr 27 12:25:42 2020
+// Update Count     : 4483
 //
 …
 tuple_expression_list:
+        assignment_expression_opt
+        | tuple_expression_list ',' assignment_expression_opt
+        assignment_expression
+        | '@'                                                                                           // CFA
+                { SemanticError( yylloc, "Eliding tuple element with '@' is currently unimplemented." ); $$ = nullptr; }
+        | tuple_expression_list ',' assignment_expression
                 { $$ = (ExpressionNode *)($1->set_last( $3 )); }
+        | tuple_expression_list ',' '@'
+                { SemanticError( yylloc, "Eliding tuple element with '@' is currently unimplemented." ); $$ = nullptr; }
+        ;

src/ResolvExpr/TypeEnvironment.cc

-              re3bc51c
+              rbcd74f3
 #include <utility>                     // for pair, move
+#include "CompilationState.h"          // for deterministic_output
 #include "Common/utility.h"            // for maybeClone
 #include "SynTree/Type.h"              // for Type, FunctionType, Type::Fora...
 …
         void EqvClass::print( std::ostream &os, Indenter indent ) const {
+                os << "( ";
+                std::copy( vars.begin(), vars.end(), std::ostream_iterator< std::string >( os, " " ) );
+                os << ")";
+                if( !deterministic_output ) {
+                        os << "( ";
+                        std::copy( vars.begin(), vars.end(), std::ostream_iterator< std::string >( os, " " ) );
+                        os << ")";
+                }
                 if ( type ) {
                         os << " -> ";
 …
                 // check safely bindable
                 if ( r.type && occursIn( r.type, s.vars.begin(), s.vars.end(), *this ) ) return false;
                 // merge classes in
                 r.vars.insert( s.vars.begin(), s.vars.end() );

src/main.cc

-              re3bc51c
+              rbcd74f3
 static const char optstring[] = ":c:ghlLmNnpP:S:twW:D:";
+static const char optstring[] = ":c:ghlLmNnpdP:S:twW:D:";
 enum { PreludeDir = 128 };
 …
         { "no-prelude", no_argument, nullptr, 'n' },
         { "prototypes", no_argument, nullptr, 'p' },
+        { "deterministic-out", no_argument, nullptr, 'd' },
         { "print", required_argument, nullptr, 'P' },
         { "prelude-dir", required_argument, nullptr, PreludeDir },
 …
         "do not read prelude",                                // -n
         "generate prototypes for prelude functions",            // -p
+        "don't print output that isn't deterministic",        // -d
         "print",                                              // -P
         "<directory> prelude directory for debug/nodebug",      // no flag
 …
                         genproto = true;
                         break;
+                  case 'd':                                     // don't print non-deterministic output
+                    deterministic_output = true;
+                        break;
                   case 'P':                                                                             // print options
                         for ( int i = 0;; i += 1 ) {

tests/.expect/alloc.txt

-              re3bc51c
+              rbcd74f3
 CFA realloc array alloc, fill
 xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede
-CFA realloc array alloc, 5
-xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede 0xdededede
-CFA realloc array alloc, 5
-xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef
-CFA realloc array alloc, 5
-xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0xdeadbeef 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0x1010101 0xffffffff 0xffffffff 0xffffffff 0xffffffff 0xffffffff 0xffffffff 0xffffffff 0xffffffff 0xffffffff 0xffffffff 0xffffffff 0xffffffff 0xffffffff 0xffffffff
 C   memalign 42 42.5

tests/Makefile.am

re3bc51c	rbcd74f3
41	41	-quiet @CFA_FLAGS@ \
42	42	-DIN_DIR="${abs_srcdir}/.in/"
	43
	44	AM_CFAFLAGS = -XCFA --deterministic-out
43	45
44	46	# get the desired cfa to test

tests/Makefile.in

re3bc51c	rbcd74f3
408	408	-DIN_DIR="${abs_srcdir}/.in/"
409	409
	410	AM_CFAFLAGS = -XCFA --deterministic-out
410	411
411	412	# get the desired cfa to test

tests/alloc.cfa

-              re3bc51c
+              rbcd74f3
 // Created On       : Wed Feb  3 07:56:22 2016
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Wed Apr  1 10:58:35 2020
 // Update Count     : 424
+// Last Modified On : Mon Apr  6 21:08:23 2020
+// Update Count     : 428
 //
 …
         ip = alloc_set( ip, 3 * dim, fill );                            // CFA realloc array alloc, fill
         printf( "CFA realloc array alloc, fill\n" );
         for ( i; 3 * dim ) { printf( "%#x ", ip[i] );; }
         printf( "\n" );
         // do not free
         ip = alloc_set( ip, 3 * dim, 5 );                                       // CFA realloc array alloc, 5
+        for ( i; 3 * dim ) { printf( "%#x ", ip[i] ); }
+        printf( "\n" );
+        // do not free
+#if 0 // FIX ME
+        ip = alloc_set( ip, 5 * dim, 5 );                                       // CFA realloc array alloc, 5
         printf( "CFA realloc array alloc, 5\n" );
         for ( i; 3 * dim ) { printf( "%#x ", ip[i] ); }
+        for ( i; 5 * dim ) { printf( "%#x ", ip[i] ); }
         printf( "\n" );
         // do not free
 …
         // do not free
         ip = alloc_set( ip, 3 * dim, 5 );                                       // CFA realloc array alloc, 5
+        ip = alloc_set( ip, 5 * dim, 5 );                                       // CFA realloc array alloc, 5
         printf( "CFA realloc array alloc, 5\n" );
         for ( i; 3 * dim ) { printf( "%#x ", ip[i] );; }
         printf( "\n" );
+        free( ip );
+        for ( i; 5 * dim ) { printf( "%#x ", ip[i] ); }
+        printf( "\n" );
+#endif // 0
+        free( ip );
         // resize, non-array types

tests/concurrent/.expect/monitor.txt

re3bc51c	rbcd74f3
1		4000000
	1	3000000

tests/concurrent/monitor.cfa

re3bc51c	rbcd74f3
29	29
30	30	void main( MyThread & this ) {
31		for(int i = 0; i < ~~1_00~~0_000; i++) {
	31	for(int i = 0; i < 750_000; i++) {
32	32	increment( global );
33	33	}

tests/errors/.expect/completeType.txt

-              re3bc51c
+              rbcd74f3
     void
+  )
   Environment:( _85_4_DT ) -> instance of struct A with body 0 (no widening)
+  Environment: -> instance of struct A with body 0 (no widening)
 …
     void
+  )
   Environment:( _85_4_DT ) -> instance of struct B with body 1 (no widening)
+  Environment: -> instance of struct B with body 1 (no widening)
 …
           void
+        )
         Environment:( _104_0_T ) -> instance of type T (not function type) (no widening)
+        Environment: -> instance of type T (not function type) (no widening)
       Could not satisfy assertion:

tests/exceptions/.expect/interact.txt

-              re3bc51c
+              rbcd74f3
 resumption catch, will terminate
 inner termination catch
+throwing resume moon
+resumption moon catch, will terminate
+termination catch
+throwing resume star
+resumption star catch

tests/exceptions/.expect/resume.txt

re3bc51c	rbcd74f3
25	25	caught second exception
26	26	recaught first exception
	27
	28	inner catch
	29	inner catch
	30	outer catch

tests/exceptions/.expect/terminate.txt

re3bc51c	rbcd74f3
24	24	caught second exception
25	25	recaught first exception
	26
	27	inner catch
	28	outer catch

tests/exceptions/conditional.cfa

-              re3bc51c
+              rbcd74f3
 // up the non-trivial exception is reasonable to do.
 #include "except-mac.hfa"
+#include <exception.hfa>
 #include <stdio.h>
+DECLARE_EXCEPT(num_error, BASE_EXCEPT,
+VTABLE_DECLARATION(num_error)(
         int (*code)(num_error *this);
 );
 …
     this.num = other.num;
+}
-void copy(num_error * this, num_error * other) {
-        *this = *other;
+}
 void ^?{}(num_error & this) {
     if( this.msg ) free( this.msg );
 …
+}
+VTABLE_INSTANCE(num_error, BASE_EXCEPT, copy, ^?{},
+        num_error_msg, num_error_code
+VTABLE_INSTANCE(num_error)(
+        num_error_msg,
+        num_error_code,
 );
 …
         try {
                 THROW(&exc);
+                throw &exc;
         } catch (num_error * error ; 3 == error->virtual_table->code( error )) {
                 caught_num_error(3, error);
 …
         try {
                 THROW_RESUME(&exc);
+                throwResume &exc;
         } catchResume (num_error * error ; 3 == error->virtual_table->code( error )) {
                 caught_num_error(3, error);

tests/exceptions/finally.cfa

-              re3bc51c
+              rbcd74f3
 // Finally Clause Tests
 #include "except-mac.hfa"
+#include <exception.hfa>
 #include "except-io.hfa"
 …
                 try {
                         printf("termination throw\n");
                         THROW(&exc);
+                        throw &exc;
                 } finally {
                         loud_exit a = "termination inner finally";
 …
                 try {
                         printf("resumption throw\n");
                         THROW_RESUME(&exc);
+                        throwResume &exc;
                 } finally {
                         loud_exit a = "resumption inner finally";

tests/exceptions/interact.cfa

-              re3bc51c
+              rbcd74f3
 // Testing Interactions Between Termination and Resumption
 #include "except-mac.hfa"
+#include <exception.hfa>
 #include "except-io.hfa"
 …
         // Resume falls back to terminate.
         try {
                 THROW_RESUME(&(star){});
+                throwResume &(star){};
         } catch (star *) {
                 printf("caught as termination\n");
 …
         try {
                 loud_region a = "try block with resume throw";
                 THROW_RESUME(&(star){});
+                throwResume &(star){};
         } catch (star *) {
                 printf("caught as termination\n");
 …
         try {
                 try {
                         THROW(&(star){});
+                        throw &(star){};
                 } catchResume (star *) {
                         printf("resume catch on terminate\n");
 …
         try {
                 try {
                         THROW_RESUME(&(star){});
+                        throwResume &(star){};
                 } catch (star *) {
                         printf("terminate catch on resume\n");
 …
                 try {
                         try {
                                 THROW(&(star){});
+                                throw &(star){};
                         } catchResume (star *) {
                                 printf("inner resume catch (error)\n");
 …
                 } catch (star * error) {
                         printf("termination catch, will resume\n");
                         THROW_RESUME(error);
+                        throwResume error;
+                }
         } catchResume (star *) {
 …
                 try {
                         try {
                                 THROW_RESUME(&(star){});
+                                throwResume &(star){};
                         } catch (star *) {
                                 printf("inner termination catch\n");
 …
                 } catchResume (star * error) {
                         printf("resumption catch, will terminate\n");
                         THROW(error);
+                        throw error;
+                }
         } catch (star *) {
                 printf("outer terminate catch (error)\n");
+        }
-#if 0
         printf("\n");
 …
                                 try {
                                         printf("throwing resume moon\n");
                                         THROW_RESUME(&(moon){});
+                                        throwResume &(moon){};
                                 } catch (star *) {
                                         printf("termination catch\n");
+                                }
                                 printf("throwing resume star\n");
                                 THROW_RESUME(&(star){});
+                                throwResume &(star){};
                         } catchResume (star *) {
                                 printf("resumption star catch\n");
 …
                 } catchResume (moon *) {
                         printf("resumption moon catch, will terminate\n");
                         THROW(&(star){});
+                        throw &(star){};
+                }
         } catchResume (star *) {
                 printf("outermost catch (error)\n");
+        }
-#endif
+}

tests/exceptions/resume.cfa

-              re3bc51c
+              rbcd74f3
 // Resumption Exception Tests
 #include "except-mac.hfa"
+#include <exception.hfa>
 #include "except-io.hfa"
 …
                 loud_exit a = "simple try clause";
                 printf("simple throw\n");
                 THROW_RESUME(&(zen){});
+                throwResume &(zen){};
                 printf("end of try clause\n");
         } catchResume (zen * error) {
 …
         try {
                 printf("throwing child exception\n");
                 THROW_RESUME(&(moment_of){});
+                throwResume &(moment_of){};
         } catchResume (zen *) {
                 printf("inner parent match\n");
 …
         try {
                 try {
                         THROW_RESUME(&(yin){});
+                        throwResume &(yin){};
                 } catchResume (zen *) {
                         printf("caught yin as zen\n");
 …
                         loud_exit a = "rethrow inner try";
                         printf("rethrow inner try\n");
                         THROW_RESUME(&(zen){});
+                        throwResume &(zen){};
                 } catchResume (zen *) {
                         loud_exit a = "rethrowing catch clause";
 …
         try {
                 try {
                         THROW_RESUME(&(yin){});
+                        throwResume &(yin){};
                 } catchResume (yin *) {
                         printf("caught yin, will throw yang\n");
                         THROW_RESUME(&(yang){});
+                        throwResume &(yang){};
                 } catchResume (yang *) {
                         printf("caught exception from same try\n");
 …
                 try {
                         printf("throwing first exception\n");
                         THROW_RESUME(&(yin){});
+                        throwResume &(yin){};
                 } catchResume (yin *) {
                         printf("caught first exception\n");
                         try {
                                 printf("throwing second exception\n");
                                 THROW_RESUME(&(yang){});
+                                throwResume &(yang){};
                         } catchResume (yang *) {
                                 printf("caught second exception\n");
 …
                 printf("caught second exception (bad location)\n");
+        }
+        printf("\n");
+        // Check successive operations.
+        try {
+                try {
+                        throwResume &(zen){};
+                        throwResume &(zen){};
+                } catchResume (zen *) {
+                        printf("inner catch\n");
+                }
+                throwResume &(zen){};
+        } catchResume (zen *) {
+                printf("outer catch\n");
+        }
+}

tests/exceptions/terminate.cfa

-              re3bc51c
+              rbcd74f3
 // Termination Exception Tests
 #include "except-mac.hfa"
+#include <exception.hfa>
 #include "except-io.hfa"
 …
                 loud_exit a = "simple try clause";
                 printf("simple throw\n");
                 THROW(&(zen){});
+                throw &(zen){};
                 printf("end of try clause\n");
         } catch (zen * error) {
 …
         try {
                 printf("throwing child exception\n");
                 THROW(&(moment_of){});
+                throw &(moment_of){};
         } catch (zen *) {
                 printf("inner parent match\n");
 …
         try {
                 try {
                         THROW(&(yin){});
+                        throw &(yin){};
                 } catch (zen *) {
                         printf("caught yin as zen\n");
 …
                         loud_exit a = "rethrow inner try";
                         printf("rethrow inner try\n");
                         THROW(&(zen){});
+                        throw &(zen){};
                 } catch (zen *) {
                         loud_exit a = "rethrowing catch clause";
 …
         try {
                 try {
                         THROW(&(yin){});
+                        throw &(yin){};
                 } catch (yin *) {
                         printf("caught yin, will throw yang\n");
                         THROW(&(yang){});
+                        throw &(yang){};
                 } catch (yang *) {
                         printf("caught exception from same try\n");
 …
                 try {
                         printf("throwing first exception\n");
                         THROW(&(yin){});
+                        throw &(yin){};
                 } catch (yin *) {
                         printf("caught first exception\n");
                         try {
                                 printf("throwing second exception\n");
                                 THROW(&(yang){});
+                                throw &(yang){};
                         } catch (yang *) {
                                 printf("caught second exception\n");
 …
                 printf("caught second exception (bad location)\n");
+        }
+        printf("\n");
+        // Check successive operations.
+        try {
+                try {
+                        throw &(zen){};
+                        throw &(zen){};
+                } catch (zen *) {
+                        printf("inner catch\n");
+                }
+                throw &(zen){};
+        } catch (zen *) {
+                printf("outer catch\n");
+        }
+}

tests/manipulatorsOutput1.cfa

-              re3bc51c
+              rbcd74f3
 // Created On       : Sat Jun  8 18:04:11 2019
 // Last Modified By : Peter A. Buhr
 // Last Modified On : Mon Jun 10 12:37:28 2019
 // Update Count     : 8
+// Last Modified On : Fri May  1 11:51:44 2020
+// Update Count     : 9
 //
 …
         signed char sc = -12;
         printf( "%hhd %2hhd %5.2hhd %-5.2hhd %hho %#hho %hhx %#hhx %#8hhx %#8.10hhx %#8.3hhX %+-8.3hhd %08hhd\n", sc, sc, sc, sc, sc, sc, sc, sc, sc, sc, sc, sc, sc );
+        sout | sc | wd(2,sc) | wd(5,2,sc) | left(wd(5,2,sc)) | nobase(oct(sc)) | oct(sc) | nobase(hex(sc)) | hex(sc) | wd(8,hex(sc)) | wd(8,10,hex(sc)) | upcase(wd(8,3,hex(sc))) | left(sign(upcase(wd(8,3,sc)))) | pad0(wd(8,sc));
+        sout | sc | wd(2,sc) | wd(5,2,sc) | left(wd(5,2,sc)) | nobase(oct(sc)) | oct(sc) | nonl;
+        sout | nobase(hex(sc)) | hex(sc) | wd(8,hex(sc)) | wd(8,10,hex(sc)) | upcase(wd(8,3,hex(sc))) | nonl;
+        sout | left(sign(upcase(wd(8,3,sc)))) | pad0(wd(8,sc));
         sout | "unsigned char";
         unsigned char usc = 12;
         printf( "%hhu %2hhu %5.2hhu %-5.2hhu %hho %#hho %hhx %#hhx %#8hhx %#8.10hhx %#8.3hhX %-8.3hhu %08hhu\n", usc, usc, usc, usc, usc, usc, usc, usc, usc, usc, usc, usc, usc );
+        sout | usc | wd(2,usc) | wd(5,2,usc) | left(wd(5,2,usc)) | nobase(oct(usc)) | oct(usc) | nobase(hex(usc)) | hex(usc) | wd(8,hex(usc)) | wd(8,10,hex(usc)) | upcase(wd(8,3,hex(usc))) | left(upcase(wd(8,3,usc))) | pad0(wd(8,usc));
+        sout | usc | wd(2,usc) | wd(5,2,usc) | left(wd(5,2,usc)) | nobase(oct(usc)) | oct(usc) | nonl;
+        sout | nobase(hex(usc)) | hex(usc) | wd(8,hex(usc)) | wd(8,10,hex(usc)) | upcase(wd(8,3,hex(usc))) | nonl;
+        sout | left(upcase(wd(8,3,usc))) | pad0(wd(8,usc));
         sout | "signed short int";
         signed short int si = -12;
         printf( "%hd %2hd %5.2hd %-5.2hd %ho %#ho %hx %#hx %#8hx %#8.10hx %#8.3hX %+-8.3hd %08hd\n", si, si, si, si, si, si, si, si, si, si, si, si, si );
+        sout | si | wd(2,si) | wd(5,2,si) | left(wd(5,2,si)) | nobase(oct(si)) | oct(si) | nobase(hex(si)) | hex(si) | wd(8,hex(si)) | wd(8,10,hex(si)) | upcase(wd(8,3,hex(si))) | left(sign(upcase(wd(8,3,si)))) | pad0(wd(8,si));
+        sout | si | wd(2,si) | wd(5,2,si) | left(wd(5,2,si)) | nobase(oct(si)) | oct(si) | nonl;
+        sout | nobase(hex(si)) | hex(si) | wd(8,hex(si)) | wd(8,10,hex(si)) | upcase(wd(8,3,hex(si))) | nonl;
+        sout | left(sign(upcase(wd(8,3,si)))) | pad0(wd(8,si));
         sout | "unsigned short int";
         unsigned short int usi = 12;
         printf( "%hu %2hu %5.2hu %-5.2hu %ho %#ho %hx %#hx %#8hx %#8.10hx %#8.3hX %-8.3hu %08hu\n", usi, usi, usi, usi, usi, usi, usi, usi, usi, usi, usi, usi, usi );
+        sout | usi | wd(2,usi) | wd(5,2,usi) | left(wd(5,2,usi)) | nobase(oct(usi)) | oct(usi) | nobase(hex(usi)) | hex(usi) | wd(8,hex(usi)) | wd(8,10,hex(usi)) | upcase(wd(8,3,hex(usi))) | left(upcase(wd(8,3,usi))) | pad0(wd(8,usi));
+        sout | usi | wd(2,usi) | wd(5,2,usi) | left(wd(5,2,usi)) | nobase(oct(usi)) | oct(usi) | nonl;
+        sout | nobase(hex(usi)) | hex(usi) | wd(8,hex(usi)) | wd(8,10,hex(usi)) | upcase(wd(8,3,hex(usi))) | nonl;
+        sout | left(upcase(wd(8,3,usi))) | pad0(wd(8,usi));
         sout | "signed int";
         signed int i = -12;
         printf( "%d %2d %5.2d %-5.2d %o %#o %x %#x %#8x %#8.10x %#8.3X %+-8.3d %08d\n", i, i, i, i, i, i, i, i, i, i, i, i, i );
+        sout | i | wd(2,i) | wd(5,2,i) | left(wd(5,2,i)) | nobase(oct(i)) | oct(i) | nobase(hex(i)) | hex(i) | wd(8,hex(i)) | wd(8,10,hex(i)) | upcase(wd(8,3,hex(i))) | left(sign(upcase(wd(8,3,i)))) | pad0(wd(8,i));
+        sout | i | wd(2,i) | wd(5,2,i) | left(wd(5,2,i)) | nobase(oct(i)) | oct(i) | nonl;
+        sout | nobase(hex(i)) | hex(i) | wd(8,hex(i)) | wd(8,10,hex(i)) | upcase(wd(8,3,hex(i))) | nonl;
+        sout | left(sign(upcase(wd(8,3,i)))) | pad0(wd(8,i));
         sout | "unsigned int";
         unsigned int ui = 12;
         printf( "%u %2u %5.2u %-5.2u %o %#o %x %#x %#8x %#8.10x %#8.3X %-8.3u %08u\n", ui, ui, ui, ui, ui, ui, ui, ui, ui, ui, ui, ui, ui );
+        sout | ui | wd(2,ui) | wd(5,2,ui) | left(wd(5,2,ui)) | nobase(oct(ui)) | oct(ui) | nobase(hex(ui)) | hex(ui) | wd(8,hex(ui)) | wd(8,10,hex(ui)) | upcase(wd(8,3,hex(ui))) | left(upcase(wd(8,3,ui))) | pad0(wd(8,ui));
+        sout | ui | wd(2,ui) | wd(5,2,ui) | left(wd(5,2,ui)) | nobase(oct(ui)) | oct(ui) | nonl;
+        sout | nobase(hex(ui)) | hex(ui) | wd(8,hex(ui)) | wd(8,10,hex(ui)) | upcase(wd(8,3,hex(ui))) | nonl;
+        sout | left(upcase(wd(8,3,ui))) | pad0(wd(8,ui));
         sout | "signed long long int";
         signed long long int lli = -12;
         printf( "%lld %2lld %5.2lld %-5.2lld %llo %#llo %llx %#llx %#8llx %#8.10llx %#8.3llX %+-8.3lld %08lld\n", lli, lli, lli, lli, lli, lli, lli, lli, lli, lli, lli, lli, lli );
+        sout | lli | wd(2,lli) | wd(5,2,lli) | left(wd(5,2,lli)) | nobase(oct(lli)) | oct(lli) | nobase(hex(lli)) | hex(lli) | wd(8,hex(lli)) | wd(8,10,hex(lli)) | upcase(wd(8,3,hex(lli))) | left(sign(upcase(wd(8,3,lli)))) | pad0(wd(8,lli));
+        sout | lli | wd(2,lli) | wd(5,2,lli) | left(wd(5,2,lli)) | nobase(oct(lli)) | oct(lli) | nonl;
+        sout | nobase(hex(lli)) | hex(lli) | wd(8,hex(lli)) | wd(8,10,hex(lli)) | upcase(wd(8,3,hex(lli))) | nonl;
+        sout | left(sign(upcase(wd(8,3,lli)))) | pad0(wd(8,lli));
         sout | "unsigned long long int";
         unsigned long long int ulli = 12;
         printf( "%llu %2llu %5.2llu %-5.2llu %llo %#llo %llx %#llx %#8llx %#8.10llx %#8.3llX %-8.3llu %08llu\n", ulli, ulli, ulli, ulli, ulli, ulli, ulli, ulli, ulli, ulli, ulli, ulli, ulli );
+        sout | ulli | wd(2,ulli) | wd(5,2,ulli) | left(wd(5,2,ulli)) | nobase(oct(ulli)) | oct(ulli) | nobase(hex(ulli)) | hex(ulli) | wd(8,hex(ulli)) | wd(8,10,hex(ulli)) | upcase(wd(8,3,hex(ulli))) | left(upcase(wd(8,3,ulli))) | pad0(wd(8,ulli));
+        sout | ulli | wd(2,ulli) | wd(5,2,ulli) | left(wd(5,2,ulli)) | nobase(oct(ulli)) | oct(ulli) | nonl;
+        sout | nobase(hex(ulli)) | hex(ulli) | wd(8,hex(ulli)) | wd(8,10,hex(ulli)) | upcase(wd(8,3,hex(ulli))) | nonl;
+        sout | left(upcase(wd(8,3,ulli))) | pad0(wd(8,ulli));
         sout | nl | "binary integral";
+        sout | bin(0) | bin(13) | upcase(bin(13)) | nobase(bin(13)) | left(wd(8,bin(13))) | wd(8,bin(13)) | pad0(left(wd(8,bin(13)))) | pad0(wd(8,bin(13))) | pad0(wd(8,10,bin(13))) | pad0(wd(8,6,bin(13)));
+        sout | bin(0) | bin(13) | upcase(bin(13)) | nobase(bin(13)) | left(wd(8,bin(13))) | wd(8,bin(13)) | nonl;
+        sout | pad0(left(wd(8,bin(13)))) | pad0(wd(8,bin(13))) | pad0(wd(8,10,bin(13))) | pad0(wd(8,6,bin(13)));
 …
         printf( "%g  %8g %#8g %g %8g %8.0g %#8.0g %8.2g %#8.2g %-8.2g %-8.2g %-#8.2g %-+8.2g %-+#8.2g %08.2g %8.2E %8.2a %#8.2A %#8.2e\n",
 .0,3.0F,3.0F, f,  f,    f,     f,    f,     f,  3.0F,      f,      f,      f,       f,     f,    f,    f,     f,     f );
+        sout | 0.0 | wd(8, 3.0F) | nodp(wd(8, 3.0F)) | f | wd(8, f) | ws(8,0, f) | nodp(ws(8,0, f)) | ws(8,2, f) | nodp(ws(8,2, f)) | left(ws(8,2, 3.0F)) | left(ws(8,2, f)) | left(nodp(ws(8,2, f))) | left(sign(ws(8,2, f))) | left(sign(nodp(ws(8,2, f)))) | pad0(ws(8,2, f)) | upcase(wd(8,2, sci(f))) | wd(8,2, hex(f)) | upcase(wd(8,2, hex(f))) | nodp(wd(8,2, sci(f)));
+        sout | 0.0 | wd(8, 3.0F) | nodp(wd(8, 3.0F)) | f | wd(8, f) | ws(8,0, f) | nodp(ws(8,0, f)) | ws(8,2, f) | nodp(ws(8,2, f)) | nonl;
+        sout | left(ws(8,2, 3.0F)) | left(ws(8,2, f)) | left(nodp(ws(8,2, f))) | left(sign(ws(8,2, f))) | left(sign(nodp(ws(8,2, f)))) | nonl;
+        sout | pad0(ws(8,2, f)) | upcase(wd(8,2, sci(f))) | wd(8,2, hex(f)) | upcase(wd(8,2, hex(f))) | nodp(wd(8,2, sci(f)));
         sout | "double";
 …
         printf( "%g  %#8f %g %8f %#8.0f %8.0f %8.2f %-8.2f %-+#8.2f %08.2F %8.2E %8.2a %8.2A %8.2e\n",
 .0,  3.0, d,  d,     d,    d,    d,     d,       d,     d,    d,    d,    d,    d );
+        sout | 0.0 | wd(8, 3.0) | d | wd(8, d) | nodp(wd(8,0, d)) | wd(8,0, d) | wd(8,2, d) | left(wd(8,2, d)) | left(sign(wd(8,2, d))) | pad0(upcase(wd(8,2, d))) | upcase(wd(8,2, sci(d))) | wd(8,2, hex(d)) | upcase(wd(8,2, hex(d))) | wd(8,2, sci(d));
+        sout | 0.0 | wd(8, 3.0) | d | wd(8, d) | nodp(wd(8,0, d)) | wd(8,0, d) | wd(8,2, d) | nonl;
+        sout | left(wd(8,2, d)) | left(sign(wd(8,2, d))) | pad0(upcase(wd(8,2, d))) | upcase(wd(8,2, sci(d))) | wd(8,2, hex(d)) | upcase(wd(8,2, hex(d))) | wd(8,2, sci(d));
         sout | "long double";
 …
         printf( "%Lg  %#8Lf %Lg %8Lf %#8.0Lf %8.0Lf %8.2Lf %-8.2Lf %-+#8.2Lf %08.2LF %8.2LE %8.2La %8.2LA %8.2Le\n",
 .0L,  3.0L, ld,  ld,     ld,    ld,    ld,     ld,       ld,     ld,    ld,    ld,    ld,    ld );
+        sout | 0.0L | wd(8, 3.0L) | ld | wd(8, ld) | nodp(wd(8,0, ld)) | wd(8,0, ld) | wd(8,2, ld) | left(wd(8,2, ld)) | left(sign(wd(8,2, ld))) | pad0(upcase(wd(8,2, ld))) | upcase(wd(8,2, sci(ld))) | wd(8,2, hex(ld)) | upcase(wd(8,2, hex(ld))) | wd(8,2, sci(ld));
+        sout | 0.0L | wd(8, 3.0L) | ld | wd(8, ld) | nodp(wd(8,0, ld)) | wd(8,0, ld) | wd(8,2, ld) | nonl;
+        sout | left(wd(8,2, ld)) | left(sign(wd(8,2, ld))) | pad0(upcase(wd(8,2, ld))) | upcase(wd(8,2, sci(ld))) | wd(8,2, hex(ld)) | upcase(wd(8,2, hex(ld))) | wd(8,2, sci(ld));
 …
         char c = 'a';
         printf( "%c %2c %5c %-5c %hho %#hho %hhx %#hhx %#8hhx %#8hhX %-8c %8c\n", c, c, c, c, c, c, c, c, c, c, c, c );
+        sout | c | ' ' | wd(2,c) | wd(5,c) | left(wd(5,c)) | nobase(oct(c)) | oct(c) | nobase(hex(c)) | hex(c) | wd(8,hex(c)) | upcase(wd(8,hex(c))) | left(wd(8,c)) | wd(8,c);
+        sout | c | ' ' | wd(2,c) | wd(5,c) | left(wd(5,c)) | nobase(oct(c)) | oct(c) | nonl;
+        sout | nobase(hex(c)) | hex(c) | wd(8,hex(c)) | upcase(wd(8,hex(c))) | left(wd(8,c)) | wd(8,c);
         sout | nl | "string";

tests/pybin/settings.py

-              re3bc51c
+              rbcd74f3
 class Architecture:
         KnownArchitectures = {
                 'x64'           : 'x64',
                 'x86-64'        : 'x64',
                 'x86_64'        : 'x64',
                 'x86'           : 'x86',
                 'aarch64'       : 'arm',
                 'i386'          : 'x86',
                 'i486'          : 'x86',
                 'i686'          : 'x86',
                 'Intel 80386'   : 'x86',
                 'arm'           : 'arm',
                 'ARM'           : 'arm',
+                'x64'         : 'x64',
+                'x86-64'      : 'x64',
+                'x86_64'      : 'x64',
+                'x86'         : 'x86',
+                'aarch64'     : 'arm',
+                'i386'        : 'x86',
+                'i486'        : 'x86',
+                'i686'        : 'x86',
+                'Intel 80386' : 'x86',
+                'arm'         : 'arm',
+                'ARM'         : 'arm',
+        }
 …
                 return True if not arch else self.target == arch
         @classmethod
         def make_canonical(_, arch):
+        @staticmethod
+        def make_canonical(arch):
                 return Architecture.KnownArchitectures[arch]
 …
                 self.total  = Timeouts.check(tg)
         @classmethod
         def check(_, value):
+        @staticmethod
+        def check(value):
                 if value < 1:
                         print("Timeouts must be at least 1 second", file=sys.stderr)
 …
         global timeout2gdb
         all_arch     = [Architecture(o) for o in list(dict.fromkeys(options.arch   ))]
+        all_arch     = [Architecture(o) for o in list(dict.fromkeys(options.arch   ))] if options.arch else [Architecture(None)]
         all_debug    = [Debug(o)        for o in list(dict.fromkeys(options.debug  ))]
         all_install  = [Install(o)      for o in list(dict.fromkeys(options.install))]

tests/pybin/test_run.py

-              re3bc51c
+              rbcd74f3
                 return os.path.normpath( os.path.join(settings.BUILDDIR, self.path, self.name) )
         @classmethod
         def valid_name(_, name):
+        @staticmethod
+        def valid_name(name):
                 return not name.endswith( ('.c', '.cc', '.cpp', '.cfa') )
         @classmethod
         def from_target(_, target):
+        @staticmethod
+        def new_target(target, arch):
                 test = Test()
                 test.name = os.path.basename(target)
                 test.path = os.path.relpath (os.path.dirname(target), settings.SRCDIR)
                 test.arch = settings.arch.target if settings.arch.cross_compile else ''
+                test.arch = arch.target if arch else ''
                 return test
 …
                 return text
         @classmethod
         def fmtDur( cls, duration ):
+        @staticmethod
+        def fmtDur( duration ):
                 if duration :
                         hours, rem = divmod(duration, 3600)

tests/test.py

-              re3bc51c
+              rbcd74f3
                 for testname in options.tests :
                         testname = canonical_path( testname )
+                        # first check if this is a valid name to regenerate
                         if Test.valid_name(testname):
+                                # this is a valid name, let's check if it already exists
                                 found = [test for test in all_tests if canonical_path( test.target() ) == testname]
+                                tests.append( found[0] if len(found) == 1 else Test.from_target(testname) )
+                                if not found:
+                                        # it's a new name, create it according to the name and specified architecture
+                                        if options.arch:
+                                                # user specified one or multiple architectures, assume the tests will have architecture specific results
+                                                tests.extend( [Test.new_target(testname, arch) for arch in settings.all_arch] )
+                                        else:
+                                                # user didn't specify an architecture, just create a cross platform test
+                                                tests.append( Test.new_target( testname, None ) )
+                                elif len(found) == 1 and not found[0].arch:
+                                        # we found a single test, the user better be wanting to create a cross platform test
+                                        if options.arch:
+                                                print('ERROR: "%s", test has no specified architecture but --arch was specified, ignoring it' % testname, file=sys.stderr)
+                                        else:
+                                                tests.append( found[0] )
+                                else:
+                                        # this test is already cross platform, just add a test for each platform the user asked
+                                        tests.extend( [Test.new_target(testname, arch) for arch in settings.all_arch] )
+                                        # print a warning if it users didn't ask for a specific architecture
+                                        if not options.arch:
+                                                print('WARNING: "%s", test has architecture specific expected files but --arch was not specified, regenerating only for current host' % testname, file=sys.stderr)
                         else :
                                 print('ERROR: "%s", tests are not allowed to end with a C/C++/CFA extension, ignoring it' % testname, file=sys.stderr)
 …
         parser.add_argument('--debug', help='Run all tests in debug or release', type=comma_separated(yes_no), default='yes')
         parser.add_argument('--install', help='Run all tests based on installed binaries or tree binaries', type=comma_separated(yes_no), default='no')
         parser.add_argument('--arch', help='Test for specific architecture', type=comma_separated(str), default='')
+        parser.add_argument('--arch', help='Test for specific architecture', type=comma_separated(str), default=None)
         parser.add_argument('--continue', help='When multiple specifications are passed (debug/install/arch), sets whether or not to continue if the last specification failed', type=yes_no, default='yes', dest='continue_')
         parser.add_argument('--timeout', help='Maximum duration in seconds after a single test is considered to have timed out', type=int, default=60)
 …
                 else:
+                        with open (out_file, "r") as myfile:
+                                error = myfile.read()
+                        if os.stat(out_file).st_size < 1048576:
+                                with open (out_file, "r") as myfile:
+                                        error = myfile.read()
+                        else:
+                                error = "Output log can't be read, file is bigger than 1MB, see {} for actual error\n".format(out_file)
                         ret, info = core_info(exe_file)
 …
                 return False, ""
         except Exception as ex:
                 print("Unexpected error in worker thread: %s" % ex, file=sys.stderr)
+                print("Unexpected error in worker thread running {}: {}".format(t.target(), ex), file=sys.stderr)
                 sys.stderr.flush()
                 return False, ""

tests/vector.cfa

-              re3bc51c
+              rbcd74f3
 //
+#include <vector.hfa>
 #include <fstream.hfa>
-#include <vector.hfa>
 #undef assert
 …
 int main() {
         vector( int ) iv;
+        assert( ((uintptr_t)&iv.storage.storage ) == (((uintptr_t)&iv)) );
+        assert( ((uintptr_t)&iv.storage.capacity) == (((uintptr_t)&iv) + sizeof(void *)) );
+        assert( ((uintptr_t)&iv.size            ) == (((uintptr_t)&iv) + sizeof(void *) + sizeof(size_t)) );
         assert( empty( &iv ) );

tools/build/push2dist.sh

re3bc51c	rbcd74f3
19	19	# echo "Copying to machines : ${hosts} (hash=${hash})"
20	20
21		files="../../../driver/cfa ../../../driver/cfa-cpp ../../../driver/cc1 ../../../driver/as $(find . -name '.c' \| tr '\n' ' ')"
	21	files="../../../driver/cfa ../../../driver/cfa-cpp ../../../driver/cc1 ../../../driver/as defines.hfa $(find . -name '.c' \| tr '\n' ' ')"
22	22	# echo "Files ${files}"
23	23

tools/gdb/utils-gdb.py

-              re3bc51c
+              rbcd74f3
 class ThreadInfo:
     tid = 0
     cluster = None
     value = None
     def __init__(self, cluster, value):
         self.cluster = cluster
         self.value = value
     def is_system(self):
         return False
+        tid = 0
+        cluster = None
+        value = None
+        def __init__(self, cluster, value):
+                self.cluster = cluster
+                self.value = value
+        def is_system(self):
+                return False
 # A named tuple representing information about a stack
 …
 def is_cforall():
     return True
+        return True
 def get_cfa_types():
     # GDB types for various structures/types in CFA
     return CfaTypes(cluster_ptr = gdb.lookup_type('struct cluster').pointer(),
                   processor_ptr = gdb.lookup_type('struct processor').pointer(),
                      thread_ptr = gdb.lookup_type('struct $thread').pointer(),
                         int_ptr = gdb.lookup_type('int').pointer(),
                    thread_state = gdb.lookup_type('enum coroutine_state'))
+        # GDB types for various structures/types in CFA
+        return CfaTypes(cluster_ptr = gdb.lookup_type('struct cluster').pointer(),
+                                  processor_ptr = gdb.lookup_type('struct processor').pointer(),
+                                         thread_ptr = gdb.lookup_type('struct $thread').pointer(),
+                                                int_ptr = gdb.lookup_type('int').pointer(),
+                                   thread_state = gdb.lookup_type('enum coroutine_state'))
 def get_addr(addr):
     """
     NOTE: sketchy solution to retrieve address. There is a better solution...
     @addr: str of an address that can be in a format 0xfffff <type of the object
     at this address>
     Return: str of just the address
     """
     str_addr = str(addr)
     ending_addr_index = str_addr.find('<')
     if ending_addr_index == -1:
         return str(addr)
     return str_addr[:ending_addr_index].strip()
+        """
+        NOTE: sketchy solution to retrieve address. There is a better solution...
+        @addr: str of an address that can be in a format 0xfffff <type of the object
+        at this address>
+        Return: str of just the address
+        """
+        str_addr = str(addr)
+        ending_addr_index = str_addr.find('<')
+        if ending_addr_index == -1:
+                return str(addr)
+        return str_addr[:ending_addr_index].strip()
 def print_usage(obj):
     print(obj.__doc__)
+        print(obj.__doc__)
 def parse(args):
     """
     Split the argument list in string format, where each argument is separated
     by whitespace delimiter, to a list of arguments like argv
     @args: str of arguments
     Return:
         [] if args is an empty string
         list if args is not empty
     """
     # parse the string format of arguments and return a list of arguments
     argv = args.split(' ')
     if len(argv) == 1 and argv[0] == '':
         return []
     return argv
+        """
+        Split the argument list in string format, where each argument is separated
+        by whitespace delimiter, to a list of arguments like argv
+        @args: str of arguments
+        Return:
+                [] if args is an empty string
+                list if args is not empty
+        """
+        # parse the string format of arguments and return a list of arguments
+        argv = args.split(' ')
+        if len(argv) == 1 and argv[0] == '':
+                return []
+        return argv
 def get_cluster_root():
     """
     Return: gdb.Value of globalClusters.root (is an address)
     """
     cluster_root = gdb.parse_and_eval('_X11mainClusterPS7cluster_1')
     if cluster_root.address == 0x0:
         print('No clusters, program terminated')
     return cluster_root
+        """
+        Return: gdb.Value of globalClusters.root (is an address)
+        """
+        cluster_root = gdb.parse_and_eval('_X11mainClusterPS7cluster_1')
+        if cluster_root.address == 0x0:
+                print('No clusters, program terminated')
+        return cluster_root
 def find_curr_thread():
+    # btstr = gdb.execute('bt', to_string = True).splitlines()
+    # if len(btstr) == 0:
+    #     print('error')
+    #     return None
+    # return btstr[0].split('this=',1)[1].split(',')[0].split(')')[0]
+    return None
+        # btstr = gdb.execute('bt', to_string = True).splitlines()
+        # if len(btstr) == 0:
+        #     print('error')
+        #     return None
+        # return btstr[0].split('this=',1)[1].split(',')[0].split(')')[0]
+        return None
+def all_clusters():
+        if not is_cforall():
+                return None
+        cluster_root = get_cluster_root()
+        if cluster_root.address == 0x0:
+                return
+        curr = cluster_root
+        ret = [curr]
+        while True:
+                curr = curr['_X4nodeS26__cluster____dbg_node_cltr_1']['_X4nextPS7cluster_1']
+                if curr == cluster_root:
+                        break
+                ret.append(curr)
+        return ret
 def lookup_cluster(name = None):
     """
     Look up a cluster given its ID
     @name: str
     Return: gdb.Value
     """
     if not is_cforall():
         return None
     root = get_cluster_root()
     if root.address == 0x0:
         return None
     if not name:
         return root
     # lookup for the task associated with the id
     cluster = None
     curr = root
     while True:
         if curr['_X4namePKc_1'].string() == name:
             cluster = curr.address
             break
         curr = curr['_X4nodeS26__cluster____dbg_node_cltr_1']['_X4nextPS7cluster_1']
         if curr == root or curr == 0x0:
             break
     if not cluster:
         print("Cannot find a cluster with the name: {}.".format(name))
         return None
     return cluster
+        """
+        Look up a cluster given its ID
+        @name: str
+        Return: gdb.Value
+        """
+        if not is_cforall():
+                return None
+        root = get_cluster_root()
+        if root.address == 0x0:
+                return None
+        if not name:
+                return root
+        # lookup for the task associated with the id
+        cluster = None
+        curr = root
+        while True:
+                if curr['_X4namePKc_1'].string() == name:
+                        cluster = curr.address
+                        break
+                curr = curr['_X4nodeS26__cluster____dbg_node_cltr_1']['_X4nextPS7cluster_1']
+                if curr == root or curr == 0x0:
+                        break
+        if not cluster:
+                print("Cannot find a cluster with the name: {}.".format(name))
+                return None
+        return cluster
 def lookup_threads_by_cluster(cluster):
         # Iterate through a circular linked list of threads and accumulate them in an array
         threads = []
         cfa_t = get_cfa_types()
         root = cluster['_X7threadsS8__dllist_S7$thread__1']['_X4headPY15__TYPE_generic__1'].cast(cfa_t.thread_ptr)
         if root == 0x0 or root.address == 0x0:
             print('There are no tasks for cluster: {}'.format(cluster))
             return threads
         curr = root
         tid = 0
         sid = -1
         while True:
             t = ThreadInfo(cluster, curr)
             if t.is_system():
                 t.tid = sid
                 sid -= 1
             else:
                 t.tid = tid
                 tid += 1
             threads.append(t)
             curr = curr['node']['next']
             if curr == root or curr == 0x0:
                 break
         return threads
+                # Iterate through a circular linked list of threads and accumulate them in an array
+                threads = []
+                cfa_t = get_cfa_types()
+                root = cluster['_X7threadsS8__dllist_S7$thread__1']['_X4headPY15__TYPE_generic__1'].cast(cfa_t.thread_ptr)
+                if root == 0x0 or root.address == 0x0:
+                        print('There are no tasks for cluster: {}'.format(cluster))
+                        return threads
+                curr = root
+                tid = 0
+                sid = -1
+                while True:
+                        t = ThreadInfo(cluster, curr)
+                        if t.is_system():
+                                t.tid = sid
+                                sid -= 1
+                        else:
+                                t.tid = tid
+                                tid += 1
+                        threads.append(t)
+                        curr = curr['node']['next']
+                        if curr == root or curr == 0x0:
+                                break
+                return threads
 def system_thread(thread):
     return False
+        return False
 def adjust_stack(pc, fp, sp):
     # pop sp, fp, pc from global stack
     gdb.execute('set $pc = {}'.format(pc))
     gdb.execute('set $rbp = {}'.format(fp))
     gdb.execute('set $sp = {}'.format(sp))
+        # pop sp, fp, pc from global stack
+        gdb.execute('set $pc = {}'.format(pc))
+        gdb.execute('set $rbp = {}'.format(fp))
+        gdb.execute('set $sp = {}'.format(sp))
 ############################ COMMAND IMPLEMENTATION #########################
 class Clusters(gdb.Command):
     """Cforall: Display currently known clusters
+        """Cforall: Display currently known clusters
 Usage:
     info clusters                 : print out all the clusters
+        info clusters                 : print out all the clusters
 """
+    def __init__(self):
+        super(Clusters, self).__init__('info clusters', gdb.COMMAND_USER)
+    def print_cluster(self, cluster_name, cluster_address):
+        print('{:>20}  {:>20}'.format(cluster_name, cluster_address))
+    #entry point from gdb
+    def invoke(self, arg, from_tty):
+        if not is_cforall():
+            return
+        if arg:
+            print("info clusters does not take arguments")
+            print_usage(self)
+            return
+        cluster_root = get_cluster_root()
+        if cluster_root.address == 0x0:
+            return
+        curr = cluster_root
+        self.print_cluster('Name', 'Address')
+        while True:
+            self.print_cluster(curr['_X4namePKc_1'].string(), str(curr))
+            curr = curr['_X4nodeS26__cluster____dbg_node_cltr_1']['_X4nextPS7cluster_1']
+            if curr == cluster_root:
+                break
+        print("")
+        def __init__(self):
+                super(Clusters, self).__init__('info clusters', gdb.COMMAND_USER)
+        def print_cluster(self, cluster_name, cluster_address):
+                print('{:>20}  {:>20}'.format(cluster_name, cluster_address))
+        #entry point from gdb
+        def invoke(self, arg, from_tty):
+                if not is_cforall():
+                        return
+                if arg:
+                        print("info clusters does not take arguments")
+                        print_usage(self)
+                        return
+                self.print_cluster('Name', 'Address')
+                for c in all_clusters():
+                        self.print_cluster(c['_X4namePKc_1'].string(), str(c))
+                print("")
 ############
 class Processors(gdb.Command):
     """Cforall: Display currently known processors
+        """Cforall: Display currently known processors
 Usage:
+    info processors                 : print out all the processors in the Main Cluster
+    info processors <cluster_name>  : print out all processors in a given cluster
+        info processors                 : print out all the processors in the Main Cluster
+        info processors all             : print out all processors in all clusters
+        info processors <cluster_name>  : print out all processors in a given cluster
 """
+    def __init__(self):
+        super(Processors, self).__init__('info processors', gdb.COMMAND_USER)
+    def print_processor(self, name, status, pending, address):
+        print('{:>20}  {:>11}  {:>13}  {:>20}'.format(name, status, pending, address))
+    def iterate_procs(self, root, active):
+        if root == 0x0:
+            return
+        cfa_t = get_cfa_types()
+        curr = root
+        while True:
+            processor = curr
+            should_stop = processor['_X12do_terminateVb_1']
+            stop_count  = processor['_X10terminatedS9semaphore_1']['_X5counti_1']
+            if not should_stop:
+                status = 'Active' if active else 'Idle'
+            else:
+                status_str  = 'Last Thread' if stop_count >= 0 else 'Terminating'
+                status      = '{}({},{})'.format(status_str, should_stop, stop_count)
+            self.print_processor(processor['_X4namePKc_1'].string(),
+                    status, str(processor['_X18pending_preemptionb_1']), str(processor)
+                )
+            curr = curr['_X4nodeS28__processor____dbg_node_proc_1']['_X4nextPS9processor_1']
+            if curr == root or curr == 0x0:
+                break
+    #entry point from gdb
+    def invoke(self, arg, from_tty):
+        if not is_cforall():
+            return
+        cluster = lookup_cluster(arg if arg else None)
+        if not cluster:
+            print("No Cluster matching arguments found")
+            return
+        cfa_t = get_cfa_types()
+        print('Cluster: "{}"({})'.format(cluster['_X4namePKc_1'].string(), cluster.cast(cfa_t.cluster_ptr)))
+        active_root = cluster.cast(cfa_t.cluster_ptr) \
+                ['_X5procsS8__dllist_S9processor__1'] \
+                ['_X4headPY15__TYPE_generic__1'] \
+                .cast(cfa_t.processor_ptr)
+        idle_root = cluster.cast(cfa_t.cluster_ptr) \
+                ['_X5idlesS8__dllist_S9processor__1'] \
+                ['_X4headPY15__TYPE_generic__1'] \
+                .cast(cfa_t.processor_ptr)
+        if idle_root != 0x0 or active_root != 0x0:
+            self.print_processor('Name', 'Status', 'Pending Yield', 'Address')
+            self.iterate_procs(active_root, True)
+            self.iterate_procs(idle_root, False)
+        else:
+            print("No processors on cluster")
+        print()
+        def __init__(self):
+                super(Processors, self).__init__('info processors', gdb.COMMAND_USER)
+        def print_processor(self, name, status, pending, address):
+                print('{:>20}  {:>11}  {:>13}  {:>20}'.format(name, status, pending, address))
+        def iterate_procs(self, root, active):
+                if root == 0x0:
+                        return
+                cfa_t = get_cfa_types()
+                curr = root
+                while True:
+                        processor = curr
+                        should_stop = processor['_X12do_terminateVb_1']
+                        stop_count  = processor['_X10terminatedS9semaphore_1']['_X5counti_1']
+                        if not should_stop:
+                                status = 'Active' if active else 'Idle'
+                        else:
+                                status_str  = 'Last Thread' if stop_count >= 0 else 'Terminating'
+                                status      = '{}({},{})'.format(status_str, should_stop, stop_count)
+                        self.print_processor(processor['_X4namePKc_1'].string(),
+                                        status, str(processor['_X18pending_preemptionb_1']), str(processor)
+                                )
+                        curr = curr['_X4nodeS28__processor____dbg_node_proc_1']['_X4nextPS9processor_1']
+                        if curr == root or curr == 0x0:
+                                break
+        #entry point from gdb
+        def invoke(self, arg, from_tty):
+                if not is_cforall():
+                        return
+                if not arg:
+                        clusters = [lookup_cluster(None)]
+                elif arg == "all":
+                        clusters = all_clusters()
+                else:
+                        clusters = [lookup_cluster(arg)]
+                if not clusters:
+                        print("No Cluster matching arguments found")
+                        return
+                cfa_t = get_cfa_types()
+                for cluster in clusters:
+                        print('Cluster: "{}"({})'.format(cluster['_X4namePKc_1'].string(), cluster.cast(cfa_t.cluster_ptr)))
+                        active_root = cluster.cast(cfa_t.cluster_ptr) \
+                                        ['_X5procsS8__dllist_S9processor__1'] \
+                                        ['_X4headPY15__TYPE_generic__1'] \
+                                        .cast(cfa_t.processor_ptr)
+                        idle_root = cluster.cast(cfa_t.cluster_ptr) \
+                                        ['_X5idlesS8__dllist_S9processor__1'] \
+                                        ['_X4headPY15__TYPE_generic__1'] \
+                                        .cast(cfa_t.processor_ptr)
+                        if idle_root != 0x0 or active_root != 0x0:
+                                self.print_processor('Name', 'Status', 'Pending Yield', 'Address')
+                                self.iterate_procs(active_root, True)
+                                self.iterate_procs(idle_root, False)
+                        else:
+                                print("No processors on cluster")
+                print()
 ############
 class Threads(gdb.Command):
     """Cforall: Display currently known threads
+        """Cforall: Display currently known threads
 Usage:
+    cfathreads                           : print Main Cluster threads, application threads only
+    cfathreads all                       : print all clusters, all threads
+    cfathreads <clusterName>             : print cluster threads, application threads only
+    """
+    def __init__(self):
+        # The first parameter of the line below is the name of the command. You
+        # can call it 'uc++ task'
+        super(Threads, self).__init__('info cfathreads', gdb.COMMAND_USER)
+    def print_formatted(self, marked, tid, name, state, address):
+        print('{:>1}  {:>4}  {:>20}  {:>10}  {:>20}'.format('*' if marked else ' ', tid, name, state, address))
+    def print_thread(self, thread, tid, marked):
+        cfa_t = get_cfa_types()
+        self.print_formatted(marked, tid, thread['self_cor']['name'].string(), str(thread['state'].cast(cfa_t.thread_state)), str(thread))
+    def print_formatted_cluster(self, str_format, cluster_name, cluster_addr):
+        print(str_format.format(cluster_name, cluster_addr))
+    def print_threads_by_cluster(self, cluster, print_system = False):
+        # Iterate through a circular linked list of tasks and print out its
+        # name along with address associated to each cluster
+        threads = lookup_threads_by_cluster(cluster)
+        if not threads:
+            return
+        running_thread = find_curr_thread()
+        if running_thread is None:
+            print('Could not identify current thread')
+        self.print_formatted(False, '', 'Name', 'State', 'Address')
+        for t in threads:
+            if not t.is_system() or print_system:
+                self.print_thread(t.value, t.tid, t.value == running_thread if running_thread else False)
+        print()
+    def print_all_threads(self):
+        print("Not implemented")
+    def invoke(self, arg, from_tty):
+        """
+        @arg: str
+        @from_tty: bool
+        """
+        if not is_cforall():
+            return
+        if not arg:
+            cluster = lookup_cluster()
+            if not cluster:
+                print("Could not find Main Cluster")
+                return
+            # only tasks and main
+            self.print_threads_by_cluster(cluster, False)
+        elif arg == 'all':
+            # all threads, all clusters
+            self.print_all_threads()
+        else:
+            cluster = lookup_cluster(arg)
+            if not cluster:
+                print("Could not find cluster '{}'".format(arg))
+                return
+            # all tasks, specified cluster
+            self.print_threads_by_cluster(cluster, True)
+        cfathreads                           : print Main Cluster threads, application threads only
+        cfathreads all                       : print all clusters, all threads
+        cfathreads <clusterName>             : print cluster threads, application threads only
+        """
+        def __init__(self):
+                # The first parameter of the line below is the name of the command. You
+                # can call it 'uc++ task'
+                super(Threads, self).__init__('info cfathreads', gdb.COMMAND_USER)
+        def print_formatted(self, marked, tid, name, state, address):
+                print('{:>1}  {:>4}  {:>20}  {:>10}  {:>20}'.format('*' if marked else ' ', tid, name, state, address))
+        def print_thread(self, thread, tid, marked):
+                cfa_t = get_cfa_types()
+                self.print_formatted(marked, tid, thread['self_cor']['name'].string(), str(thread['state'].cast(cfa_t.thread_state)), str(thread))
+        def print_threads_by_cluster(self, cluster, print_system = False):
+                # Iterate through a circular linked list of tasks and print out its
+                # name along with address associated to each cluster
+                threads = lookup_threads_by_cluster(cluster)
+                if not threads:
+                        return
+                running_thread = find_curr_thread()
+                if running_thread is None:
+                        print('Could not identify current thread')
+                self.print_formatted(False, '', 'Name', 'State', 'Address')
+                for t in threads:
+                        if not t.is_system() or print_system:
+                                self.print_thread(t.value, t.tid, t.value == running_thread if running_thread else False)
+                print()
+        def print_all_threads(self):
+                for c in all_clusters():
+                        self.print_threads_by_cluster(c, False)
+        def invoke(self, arg, from_tty):
+                """
+                @arg: str
+                @from_tty: bool
+                """
+                if not is_cforall():
+                        return
+                if not arg:
+                        cluster = lookup_cluster()
+                        if not cluster:
+                                print("Could not find Main Cluster")
+                                return
+                        # only tasks and main
+                        self.print_threads_by_cluster(cluster, False)
+                elif arg == 'all':
+                        # all threads, all clusters
+                        self.print_all_threads()
+                else:
+                        cluster = lookup_cluster(arg)
+                        if not cluster:
+                                print("Could not find cluster '{}'".format(arg))
+                                return
+                        # all tasks, specified cluster
+                        self.print_threads_by_cluster(cluster, True)
 ############
 class Thread(gdb.Command):
+    def __init__(self):
+        # The first parameter of the line below is the name of the command. You
+        # can call it 'uc++ task'
+        super(Threads, self).__init__('cfathread', gdb.COMMAND_USER)
+    def print_usage(self):
+        print_usage("""
+    cfathread                            : print userCluster tasks, application tasks only
+    cfathread <clusterName>              : print cluster tasks, application tasks only
+    cfathread all                        : print all clusters, all tasks
+    cfathread <id>                       : switch stack to thread id on userCluster
+    cfathread 0x<address>                    : switch stack to thread on any cluster
+    cfathread <id> <clusterName>         : switch stack to thread on specified cluster
+    """)
+    ############################ AUXILIARY FUNCTIONS #########################
+    def print_formatted(self, marked, tid, name, state, address):
+        print('{:>1}  {:>4}  {:>20}  {:>10}  {:>20}'.format('*' if marked else ' ', tid, name, state, address))
+    def print_thread(self, thread, tid, marked):
+        cfa_t = get_cfa_types()
+        self.print_formatted(marked, tid, thread['self_cor']['name'].string(), str(thread['state'].cast(cfa_t.thread_state)), str(thread))
+    def print_formatted_cluster(self, str_format, cluster_name, cluster_addr):
+        print(str_format.format(cluster_name, cluster_addr))
+    def print_tasks_by_cluster_all(self, cluster_address):
+        """
+        Display a list of all info about all available tasks on a particular cluster
+        @cluster_address: gdb.Value
+        """
+        cluster_address = cluster_address.cast(uCPPTypes.ucluster_ptr)
+        task_root = cluster_address['tasksOnCluster']['root']
+        if task_root == 0x0 or task_root.address == 0x0:
+            print('There are no tasks for cluster at address: {}'.format(cluster_address))
+            return
+        self.print_formatted_task('', 'Task Name', 'Address', 'State')
+        curr = task_root
+        task_id = 0
+        systask_id = -1
+        breakpoint_addr = self.find_curr_breakpoint_addr()
+        if breakpoint_addr is None:
+            return
+        while True:
+            global SysTask_Name
+            if (curr['task_']['name'].string() in SysTask_Name):
+                self.print_formatted_tasks(systask_id, breakpoint_addr, curr)
+                systask_id -= 1
+            else:
+                self.print_formatted_tasks(task_id, breakpoint_addr, curr)
+                task_id += 1
+            curr = curr['next'].cast(uCPPTypes.uBaseTaskDL_ptr_type)
+            if curr == task_root:
+                break
+    def print_tasks_by_cluster_address_all(self, cluster_address):
+        """
+        Display a list of all info about all available tasks on a particular cluster
+        @cluster_address: str
+        """
+        # Iterate through a circular linked list of tasks and print out its
+        # name along with address associated to each cluster
+        # convert hex string to hex number
+        try:
+            hex_addr = int(cluster_address, 16)
+        except:
+            self.print_usage()
+            return
+        cluster_address = gdb.Value(hex_addr)
+        if not self.print_tasks_by_cluster_all(cluster_address):
+            return
+    def print_threads_by_cluster(self, cluster, print_system = False):
+        """
+        Display a list of limited info about all available threads on a particular cluster
+        @cluster: str
+        @print_system: bool
+        """
+        # Iterate through a circular linked list of tasks and print out its
+        # name along with address associated to each cluster
+        threads = self.threads_by_cluster(cluster)
+        if not threads:
+            return
+        running_thread = self.find_curr_thread()
+        if running_thread is None:
+            print('Could not identify current thread')
+        self.print_formatted(False, '', 'Name', 'State', 'Address')
+        for t in threads:
+            if not t.is_system() or print_system:
+                self.print_thread(t.value, t.tid, t.value == running_thread if running_thread else False)
+        print()
+    ############################ COMMAND FUNCTIONS #########################
+    def print_all_threads(self):
+        """Iterate through each cluster, iterate through all tasks and  print out info about all the tasks
+        in those clusters"""
+        uCPPTypes = None
+        try:
+            uCPPTypes = get_uCPP_types()
+        except gdb.error:
+            print(not_supported_error_msg)
+            print(gdb.error)
+            return
+        cluster_root = get_cluster_root()
+        if cluster_root.address == 0x0:
+            return
+        curr = cluster_root
+        self.print_formatted_cluster(self.cluster_str_format, 'Cluster Name', 'Address')
+        while True:
+            addr = str(curr['cluster_'].reference_value())[1:]
+            self.print_formatted_cluster(self.cluster_str_format, curr['cluster_']['name'].string(), addr)
+            self.print_tasks_by_cluster_address_all(addr)
+            curr = curr['next'].cast(uCPPTypes.uClusterDL_ptr_type)
+            if curr == cluster_root:
+                break
+    def switchto(self, thread):
+        """Change to a new task by switching to a different stack and manually
+        adjusting sp, fp and pc
+        @task_address: str
+supported format:
+                in hex format
+                    <hex_address>: literal hexadecimal address
+                    Ex: 0xffffff
+                in name of the pointer to the task
+                    "task_name": pointer of the variable name of the cluster
+                        Ex: T* s -> task_name = s
+            Return: gdb.value of the cluster's address
+        """
+        # uCPPTypes = None
+        # try:
+        #     uCPPTypes = get_uCPP_types()
+        # except gdb.error:
+        #     print(not_supported_error_msg)
+        #     print(gdb.error)
+        #     return
+        # # Task address has a format "task_address", which implies that it is the
+        # # name of the variable, and it needs to be evaluated
+        # if task_address.startswith('"') and task_address.endswith('"'):
+        #     task = gdb.parse_and_eval(task_address.replace('"', ''))
+        # else:
+        # # Task address format does not include the quotation marks, which implies
+        # # that it is a hex address
+        #     # convert hex string to hex number
+        #     try:
+        #         hex_addr = int(task_address, 16)
+        #     except:
+        #         self.print_usage()
+        #         return
+        #     task_address = gdb.Value(hex_addr)
+        #     task = task_address.cast(uCPPTypes.uBaseTask_ptr_type)
+        try:
+            if not gdb.lookup_symbol('__cfactx_switch'):
+                print('__cfactx_switch symbol is unavailable')
+                return
+        except:
+            print('here 3')
+        cfa_t = get_cfa_types()
+        state = thread['state'].cast(cfa_t.thread_state)
+        try:
+            if state == gdb.parse_and_eval('Halted'):
+                print('Cannot switch to a terminated thread')
+                return
+            if state == gdb.parse_and_eval('Start'):
+                print('Cannjot switch to a thread not yet run')
+                return
+        except:
+            print("here 2")
+            return
+        context = thread['context']
+        # lookup for sp,fp and uSwitch
+        xsp = context['SP'] + 48
+        xfp = context['FP']
+        # convert string so we can strip out the address
+        try:
+            xpc = get_addr(gdb.parse_and_eval('__cfactx_switch').address + 28)
+        except:
+            print("here")
+            return
+        # must be at frame 0 to set pc register
+        gdb.execute('select-frame 0')
+        # push sp, fp, pc into a global stack
+        global STACK
+        sp = gdb.parse_and_eval('$sp')
+        fp = gdb.parse_and_eval('$fp')
+        pc = gdb.parse_and_eval('$pc')
+        stack_info = StackInfo(sp = sp, fp = fp, pc = pc)
+        STACK.append(stack_info)
+        # update registers for new task
+        print('switching to ')
+        gdb.execute('set $rsp={}'.format(xsp))
+        gdb.execute('set $rbp={}'.format(xfp))
+        gdb.execute('set $pc={}'.format(xpc))
+    def find_matching_gdb_thread_id():
+        """
+        Parse the str from info thread to get the number
+        """
+        info_thread_str = gdb.execute('info thread', to_string=True).splitlines()
+        for thread_str in info_thread_str:
+            if thread_str.find('this={}'.format(task)) != -1:
+                thread_id_pattern = r'^\*?\s+(\d+)\s+Thread'
+                # retrive gdb thread id
+                return re.match(thread_id_pattern, thread_str).group(1)
+            # check if the task is running or not
+            if task_state == gdb.parse_and_eval('uBaseTask::Running'):
+                # find the equivalent thread from info thread
+                gdb_thread_id = find_matching_gdb_thread_id()
+                if gdb_thread_id is None:
+                    print('cannot find the thread id to switch to')
+                    return
+                # switch to that thread based using thread command
+                gdb.execute('thread {}'.format(gdb_thread_id))
+    def switchto_id(self, tid, cluster):
+        """
+        @cluster: cluster object
+        @tid: int
+        """
+        threads = self.threads_by_cluster( cluster )
+        for t in threads:
+            if t.tid == tid:
+                self.switchto(t.value)
+                return
+        print("Cound not find thread by id '{}'".format(tid))
+    def invoke(self, arg, from_tty):
+        """
+        @arg: str
+        @from_tty: bool
+        """
+        if not is_cforall():
+            return
+        argv = parse(arg)
+        print(argv)
+        if len(argv) == 0:
+            """
+            Iterate only Main Thread, print only tasks and main
+            """
+            cluster = lookup_cluster()
+            if not cluster:
+                print("Could not find Main Cluster")
+                return
+            # only tasks and main
+            self.print_threads_by_cluster(cluster, False)
+        elif len(argv) == 1:
+            if argv[0] == 'help':
+                self.print_usage()
+            # push task
+            elif argv[0].isdigit():
+                cluster = lookup_cluster()
+                if not cluster:
+                    print("Could not find Main Cluster")
+                    return
+                try:
+                    tid = int(argv[0])
+                except:
+                    print("'{}' not a valid thread id".format(argv[0]))
+                    self.print_usage()
+                    return
+                 # by id, userCluster
+                self.switchto_id(tid, cluster)
+            elif argv[0].startswith('0x') or argv[0].startswith('0X'):
+                self.switchto(argv[0]) # by address, any cluster
+            # print tasks
+            elif argv[0] == 'all':
+                self.print_all_threads() # all tasks, all clusters
+            else:
+                """
+                Print out all the tasks available in the specified cluster
+                @cluster_name: str
+                """
+                print("cfathread by name")
+                cluster = lookup_cluster(argv[0])
+                if not cluster:
+                    return
+                # all tasks, specified cluster
+                self.print_threads_by_cluster(cluster, True)
+        elif len(argv) == 2:
+            # push task
+            self.pushtask_by_id(argv[0], argv[1]) # by id, specified cluster
+        else:
+            print('Invalid arguments')
+            self.print_usage()
+        """Cforall: Switch to specified user threads
+Usage:
+        cfathread <id>                       : switch stack to thread id on main cluster
+        cfathread 0x<address>                : switch stack to thread on any cluster
+        cfathread <id> <clusterName>         : switch stack to thread on specified cluster
+        """
+        def __init__(self):
+                # The first parameter of the line below is the name of the command. You
+                # can call it 'uc++ task'
+                super(Thread, self).__init__('cfathread', gdb.COMMAND_USER)
+        ############################ AUXILIARY FUNCTIONS #########################
+        def switchto(self, thread):
+                """Change to a new task by switching to a different stack and manually
+                adjusting sp, fp and pc
+                @task_address: str
+supported format:
+                                in hex format
+                                        <hex_address>: literal hexadecimal address
+                                        Ex: 0xffffff
+                                in name of the pointer to the task
+                                        "task_name": pointer of the variable name of the cluster
+                                                Ex: T* s -> task_name = s
+                        Return: gdb.value of the cluster's address
+                """
+                try:
+                        if not gdb.lookup_symbol('__cfactx_switch'):
+                                print('__cfactx_switch symbol is unavailable')
+                                return
+                except:
+                        print('here 3')
+                cfa_t = get_cfa_types()
+                state = thread['state'].cast(cfa_t.thread_state)
+                try:
+                        if state == gdb.parse_and_eval('Halted'):
+                                print('Cannot switch to a terminated thread')
+                                return
+                        if state == gdb.parse_and_eval('Start'):
+                                print('Cannjot switch to a thread not yet run')
+                                return
+                except:
+                        print("here 2")
+                        return
+                context = thread['context']
+                # lookup for sp,fp and uSwitch
+                xsp = context['SP'] + 48
+                xfp = context['FP']
+                # convert string so we can strip out the address
+                try:
+                        xpc = get_addr(gdb.parse_and_eval('__cfactx_switch').address + 28)
+                except:
+                        print("here")
+                        return
+                # must be at frame 0 to set pc register
+                gdb.execute('select-frame 0')
+                # push sp, fp, pc into a global stack
+                global STACK
+                sp = gdb.parse_and_eval('$sp')
+                fp = gdb.parse_and_eval('$fp')
+                pc = gdb.parse_and_eval('$pc')
+                stack_info = StackInfo(sp = sp, fp = fp, pc = pc)
+                STACK.append(stack_info)
+                # update registers for new task
+                print('switching to ')
+                gdb.execute('set $rsp={}'.format(xsp))
+                gdb.execute('set $rbp={}'.format(xfp))
+                gdb.execute('set $pc={}'.format(xpc))
+        def find_matching_gdb_thread_id():
+                """
+                Parse the str from info thread to get the number
+                """
+                info_thread_str = gdb.execute('info thread', to_string=True).splitlines()
+                for thread_str in info_thread_str:
+                        if thread_str.find('this={}'.format(task)) != -1:
+                                thread_id_pattern = r'^\*?\s+(\d+)\s+Thread'
+                                # retrive gdb thread id
+                                return re.match(thread_id_pattern, thread_str).group(1)
+                        # check if the task is running or not
+                        if task_state == gdb.parse_and_eval('uBaseTask::Running'):
+                                # find the equivalent thread from info thread
+                                gdb_thread_id = find_matching_gdb_thread_id()
+                                if gdb_thread_id is None:
+                                        print('cannot find the thread id to switch to')
+                                        return
+                                # switch to that thread based using thread command
+                                gdb.execute('thread {}'.format(gdb_thread_id))
+        def switchto_id(self, tid, cluster):
+                """
+                @cluster: cluster object
+                @tid: int
+                """
+                threads = lookup_threads_by_cluster( cluster )
+                for t in threads:
+                        if t.tid == tid:
+                                self.switchto(t.value)
+                                return
+                print("Cound not find thread by id '{}'".format(tid))
+        def invoke(self, arg, from_tty):
+                """
+                @arg: str
+                @from_tty: bool
+                """
+                if not is_cforall():
+                        return
+                argv = parse(arg)
+                print(argv)
+                if argv[0].isdigit():
+                        cname = " ".join(argv[1:]) if len(argv) > 1 else None
+                        cluster = lookup_cluster(cname)
+                        if not cluster:
+                                print("Could not find cluster '{}'".format(cname if cname else "Main Cluster"))
+                                return
+                        try:
+                                tid = int(argv[0])
+                        except:
+                                print("'{}' not a valid thread id".format(argv[0]))
+                                print_usage(self)
+                                return
+                                # by id, userCluster
+                        self.switchto_id(tid, cluster)
+                elif argv[0].startswith('0x') or argv[0].startswith('0X'):
+                        self.switchto(argv[0]) # by address, any cluster
 ############
 class PrevThread(gdb.Command):
     """Switch back to previous task on the stack"""
     usage_msg = 'prevtask'
     def __init__(self):
         super(PrevThread, self).__init__('prevtask', gdb.COMMAND_USER)
     def invoke(self, arg, from_tty):
         """
         @arg: str
         @from_tty: bool
         """
         global STACK
         if len(STACK) != 0:
             # must be at frame 0 to set pc register
             gdb.execute('select-frame 0')
             # pop stack
             stack_info = STACK.pop()
             pc = get_addr(stack_info.pc)
             sp = stack_info.sp
             fp = stack_info.fp
             # pop sp, fp, pc from global stack
             adjust_stack(pc, fp, sp)
             # must be at C++ frame to access C++ vars
             gdb.execute('frame 1')
         else:
             print('empty stack')
+        """Switch back to previous task on the stack"""
+        usage_msg = 'prevtask'
+        def __init__(self):
+                super(PrevThread, self).__init__('prevtask', gdb.COMMAND_USER)
+        def invoke(self, arg, from_tty):
+                """
+                @arg: str
+                @from_tty: bool
+                """
+                global STACK
+                if len(STACK) != 0:
+                        # must be at frame 0 to set pc register
+                        gdb.execute('select-frame 0')
+                        # pop stack
+                        stack_info = STACK.pop()
+                        pc = get_addr(stack_info.pc)
+                        sp = stack_info.sp
+                        fp = stack_info.fp
+                        # pop sp, fp, pc from global stack
+                        adjust_stack(pc, fp, sp)
+                        # must be at C++ frame to access C++ vars
+                        gdb.execute('frame 1')
+                else:
+                        print('empty stack')
 class ResetOriginFrame(gdb.Command):
     """Reset to the origin frame prior to continue execution again"""
     usage_msg = 'resetOriginFrame'
     def __init__(self):
         super(ResetOriginFrame, self).__init__('reset', gdb.COMMAND_USER)
     def invoke(self, arg, from_tty):
         """
         @arg: str
         @from_tty: bool
         """
         global STACK
         if len(STACK) != 0:
             stack_info = STACK.pop(0)
             STACK.clear()
             pc = get_addr(stack_info.pc)
             sp = stack_info.sp
             fp = stack_info.fp
             # pop sp, fp, pc from global stack
             adjust_stack(pc, fp, sp)
             # must be at C++ frame to access C++ vars
             gdb.execute('frame 1')
         #else:
             #print('reset: empty stack') #probably does not have to print msg
+        """Reset to the origin frame prior to continue execution again"""
+        usage_msg = 'resetOriginFrame'
+        def __init__(self):
+                super(ResetOriginFrame, self).__init__('reset', gdb.COMMAND_USER)
+        def invoke(self, arg, from_tty):
+                """
+                @arg: str
+                @from_tty: bool
+                """
+                global STACK
+                if len(STACK) != 0:
+                        stack_info = STACK.pop(0)
+                        STACK.clear()
+                        pc = get_addr(stack_info.pc)
+                        sp = stack_info.sp
+                        fp = stack_info.fp
+                        # pop sp, fp, pc from global stack
+                        adjust_stack(pc, fp, sp)
+                        # must be at C++ frame to access C++ vars
+                        gdb.execute('frame 1')
+                #else:
+                        #print('reset: empty stack') #probably does not have to print msg
 Clusters()
 …
 PrevThread()
 Threads()
+Thread()
 # Local Variables: #

Context Navigation

Legend:

Download in other formats: