\documentclass[english,aspectratio=169,svgnames,notes=hide,14pt,xcolor={dvipsnames}]{beamer}
\usepackage{graphicx}
\usepackage{epic,eepic}
\usepackage{presentationstyle}

\title{The \CFA Scheduler}
\subtitle{PhD Comprehensive II Research Proposal}
\author{Thierry Delisle}
% \affil[1]{School of Computer Science, University of Waterloo}
% \affil[ ]{\textit {tdelisle@uwaterloo.ca}}

\begin{document}
%==============================
\miniframesoff
\begin{frame}[noframenumbering,plain]
	\titlepage
\end{frame}
%==============================
\section{Introduction}
%------------------------------
\begin{frame}[noframenumbering]
	\tableofcontents
\end{frame}
\miniframeson
%==============================
\AtBeginSection[]{
	\miniframesoff
	\begin{frame}
	\vfill
	\centering
	\begin{beamercolorbox}[sep=8pt,center,shadow=false,rounded=false]{title}
		\usebeamerfont{title}\insertsectionhead\par%
	\end{beamercolorbox}
	\vfill
	\end{frame}
	\miniframeson
}
\section{Concurrency and \CFA}
\begin{frame}{Project}
	\begin{center}
		{\large Produce a scheduler for \CFA that is simple for programmers to understand and offers good general performance.}
	\end{center}
\end{frame}
%------------------------------
\begin{frame}{\CFA}
	\CFA is a modern extension of C.
	It adds to C : overloading, constructors/destructors, polymorphism, and much more.

	~\newline
	For this project, the relevant aspects are:
	\begin{itemize}
		\item Fast and safe system language.
		\item Threading.
		\item Manual memory management.
	\end{itemize}

\end{frame}
%------------------------------
\begin{frame}{Concurrency in \CFA}
	User Level Threading
	\begin{itemize}
		\item M:N threading.
		\item User Level Context Switching causes kernel-threads to run a different user-thread.
	\end{itemize}
	~\newline
	Threads organized in clusters:
	\begin{itemize}
		\item Clusters have their own kernel threads.
		\item Threads in a cluster are on run on the kernel threads of that cluster.
	\end{itemize}
\end{frame}
%------------------------------
\begin{frame}{Concurrency in \CFA}
	\begin{table}
		{\resizebox{1\textwidth}{!}{\input{system.dark.pstex_t}}}
	\end{table}
\end{frame}
%------------------------------
\begin{frame}{Scheduling goal for \CFA}
	{\large
		\begin{center}
			The \CFA scheduler should be \textit{viable} for any workload.
		\end{center}
	}
	~\newline
	This implies:
	\begin{enumerate}
		\item Producing a scheduler with sufficient fairness guarantees.
		\item Handling kernel-threads running out of work.
		\item Handling blocking I/O operations.
	\end{enumerate}
\end{frame}
%==============================
\section{Scheduling in Practice}
%------------------------------
\begin{frame}{In the Wild}
	Schedulers found in production application generally fall into two categories:
	\newline

	\begin{itemize}
		\item Feedback Scheduling\newline
		\item Priority Scheduling (explicit or not)\newline
	\end{itemize}
\end{frame}
%------------------------------
\begin{frame}{Feedback Scheduling}
	Most operating systems based their scheduling on feedback loops.
	~\newline

	The scheduler runs a thread and adjusts some metric to choose when to run it, e.g., least CPU time first.
	~\newline

	Relies on the following assumptions:
	\begin{enumerate}
		\item Threads live long enough for useful feedback information to be to gathered.
		\item Threads belong to multiple users so fairness across threads is insufficient.
	\end{enumerate}
\end{frame}
%------------------------------
\begin{frame}{Priority Scheduling}
	\begin{center}
		{\large
			Runs all ready threads in group \textit{A} before any ready threads in group \textit{B}.
		}
	\end{center}
	\vspace{1em}

	Explicit priorities:
	\begin{itemize}
		\item Threads given a priority at creation, e.g., Thread A has priority 1, Thread B has priority 6.
	\end{itemize}
	\vspace{0.75em}

	Implicit priorities:
	\begin{itemize}
		\item Certain threads are preferred, based on various metrics, e.g., last run, last run on this CPU.
	\end{itemize}
\end{frame}
%------------------------------
\begin{frame}{Priority Scheduling: Work-Stealing}
	Work-Stealing is a very popular strategy.
	\begin{block}{Algorithm}
		\begin{enumerate}
			\item Each processor has a list of ready threads.
			\item Each processor runs threads from its ready queue first.
			\item If a processor's ready queue is empty, attempt to run threads from some other processor's ready queue.
		\end{enumerate}
	\end{block}
	~

	Work-Stealing has implicit priorities: For a given processor, threads on it's queue have higher priority.

	Processors begin busy for long periods can mean starvation.
\end{frame}
%------------------------------
\begin{frame}{Scheduling in Practice: Summary}
	\begin{columns}
		\begin{column}{0.5\textwidth}
			\textbf{Feedback Scheduling}
			\newline

			\begin{itemize}
				\item Inappropriate for short lived threads.
				\item Overkill for cooperating threads.\newline
			\end{itemize}
		\end{column}
		\begin{column}{0.5\textwidth}
			\textbf{Priority Scheduling}
			\newline

			\begin{itemize}
				\item Allows lasting starvation.\newline
				\item Hard to reason about.\newline~\newline
			\end{itemize}
		\end{column}
	\end{columns}

	~\newline
	~\newline
	\CFA would benefit from something different.
\end{frame}
%==============================
\section{Project: Proposal \& Details}
%------------------------------
\begin{frame}{Central Ready-Queue}
	\CFA will have a single ready-queue per cluster.
	\newline

	The ready-queue will be sharded internally to reduce contention.
	\newline

	No strong coupling between internal queues and processors.
	\newline

	Constrasts with work-stealing which has a queue per processor.
	\newline
\end{frame}
%------------------------------
\begin{frame}{Central Ready-Queue}
	\begin{table}
		{\resizebox{0.8\textwidth}{!}{\input{base.dark.pstex_t}}}
	\end{table}
	~
\end{frame}
%------------------------------
\begin{frame}{Central Ready-Queue Challenges}
	\begin{columns}
		\begin{column}{0.55\textwidth}
			Semi-``Empty'' ready-queues means success rate of randomly guessing goes down.
		\end{column}
		\begin{column}{0.45\textwidth}
			\begin{table}
				{\resizebox{1\textwidth}{!}{\input{empty.dark.pstex_t}}}
			\end{table}
		\end{column}
	\end{columns}

	Possible solutions:
	\begin{itemize}
		\item Data structure tracking the work, can be dense or sparse, global or sharded.
		\item Add bias towards certain sub-queues.
	\end{itemize}
\end{frame}
%------------------------------
\begin{frame}{Dynamic Resizing}
	Processors can be added at anytime on a cluster.
	\newline

	The array of queues needs to be adjusted in consequence.
	\newline

	Solution: Global Reader-Writer lock
	\begin{itemize}
		\item Acquire for reading for normal scheduling operations.
		\item Acquire for writing when resizing the array and creating/deleting internal queues.
	\end{itemize}
\end{frame}
%------------------------------
\begin{frame}{Idle Sleep}
	Processors which cannot find threads to run should sleep, using \texttt{pthread\_cond\_wait}, \texttt{sigwaitinfo}, etc.
	\newline

	Scheduling a thread may \textit{need} to wake sleeping processors.
	\begin{itemize}
		\item Threads can be scheduled from processors terminating or running outside the cluster.
		In this case, all processors on the cluster could be sleeping.
	\end{itemize}
	~

	If \textit{some} processors are sleeping, waking more may be wasteful.

	A heuristic for this case is outside the scope of this project.
\end{frame}
%------------------------------
\begin{frame}{Asynchronous I/O}
	\begin{itemize}
		\item I/O Operations should block user-threads rather than kernel-threads. \vspace{1cm}
		\item This requires 3 components:
		\begin{enumerate}
			\item an OS abstraction layer over the asynchronous interface,  \vspace{0.2cm}
			\item an event-engine to (de)multiplex the operations,  \vspace{0.2cm}
			\item and a synchronous interface for users to use.  \vspace{0.2cm}
		\end{enumerate}
	\end{itemize}
\end{frame}
%------------------------------
\begin{frame}{Asynchronous I/O: OS Abstraction}
	\framesubtitle{\vskip0.5mm\large\texttt{select}}
	\vskip5mm

	{\large ``select() allows a program to monitor multiple file descriptors, waiting until one
       or more of the file descriptors become ``ready'' for some class of I/O operation.''}

	\hspace*\fill{\small--- Linux Programmer's Manual}

	\vskip5mm
	\begin{itemize}
		\item[+] moderate overhead per \texttt{syscall}
		\item[-] Relies on \texttt{syscall}s returning \texttt{EWOULDBLOCK}.
	\end{itemize}
\end{frame}
%------------------------------
\begin{frame}{Asynchronous I/O: OS Abstraction}
	\framesubtitle{\vskip0.5mm\large\texttt{epoll}}
	\vskip2mm

	More recent system call with a similar purpose.

	\vskip5mm
	\begin{itemize}
		\item[+] Smaller overhead per \texttt{syscall}.
		\item[+] Shown to work well for sockets.
		\item[-] Still relies on \texttt{syscall}s returning \texttt{EWOULDBLOCK}.
		\item[-] Does not support linux pipes and TTYs.
	\end{itemize}
\end{frame}
%------------------------------
\begin{frame}{Asynchronous I/O: OS Abstraction}
	\framesubtitle{\vskip0.5mm\large Kernel Threads}
	\vskip2mm

	Use a pool of kernel-threads, to which blocking calls are delegated.

	\vskip5mm
	\begin{itemize}
		\item Technique used by many existing systems, e.g., Go, libuv
		\item[+] Definitely works for all \texttt{syscall}s.
		\item[$-$] Can require many kernel threads.
	\end{itemize}
\end{frame}
%------------------------------
\begin{frame}{Asynchronous I/O: OS Abstraction}
	\framesubtitle{\vskip0.5mm\large\texttt{io\_uring}}
	\vskip2mm

	A very recent framework for asynchronous operations available in Linux 5.1 and later.
	Uses two ring buffers to submit operations and poll completions.

	\vskip5mm
	\begin{itemize}
		\item[+] Handles many \texttt{syscall}s.
		\item[+] Does \textit{not} rely on \texttt{syscall}s returning \texttt{EWOULDBLOCK}.
		\item[$-$] Requires synchronization on submission.
		\item[$-$] System call itself is serialized in the kernel.
	\end{itemize}
\end{frame}
%------------------------------
\begin{frame}{Asynchronous I/O: Event Engine}
	An event engine must be built to fit the chosen OS Abstraction.
	\newline

	The engine must park user-threads until operation is completed.
	\newline

	Depending on the chosen abstraction the engine may need to serialize operation submission.
	\newline

	Throughput and latency are important metrics.
\end{frame}
%------------------------------
\begin{frame}{Asynchronous I/O: The interface}
	The Asynchronous I/O needs an interface.
	\newline

	Several options to take into consideration:
	\begin{itemize}
		\item Adding to existing call interface, e.g., \texttt{read} and \texttt{cfaread}.  \vspace{0.2cm}
		\item Replacing existing call interface.  \vspace{0.2cm}
		\item True asynchronous interface, e.g., callbacks, futures.  \vspace{0.2cm}
	\end{itemize}

\end{frame}
%==============================
\section{Conclusion}
%------------------------------
\begin{frame}{Summary}
	Runtime system and scheduling are still open topics.
	\newline
	\newline

	This work offers a novel runtime and scheduling package.
	\newline
	\newline

	Existing work only offers fragments that users must assemble themselves when possible.
\end{frame}
%------------------------------
\begin{frame}{Timeline}
	\begin{tabular}{ || m{0.1mm} m{0.75cm} m{1cm} | l }
		% \hline
		\phantom{100000cm} \phantom{100000cm} \phantom{100000cm} & {\small May Oct} & {\small 2020 2020} & Creation of the performance benchmark. \\
		\hline
		\phantom{100000cm} \phantom{100000cm} \phantom{100000cm} & {\small Nov Mar} & {\small 2020 2021} & Completion of the implementation. \\
		\hline
		\phantom{100000cm} \phantom{100000cm} \phantom{100000cm} & {\small Mar Apr} & {\small 2021 2021} & Final performance experiments. \\
		\hline
		\phantom{100000cm} \phantom{100000cm} \phantom{100000cm} & {\small May Aug} & {\small 2021 2021} & Thesis writing and defense. \\
		% \hline
	\end{tabular}
\end{frame}

%------------------------------
\begin{frame}{}
	\begin{center}
		{\large Questions?}
	\end{center}
\end{frame}
\end{document}