\documentclass[english,aspectratio=169,svgnames,notes=hide,14pt,xcolor={dvipsnames}]{beamer} \usepackage{graphicx} \usepackage{epic,eepic} \usepackage{presentationstyle} \title{The \CFA Scheduler} \subtitle{PhD Comprehensive II Research Proposal} \author{Thierry Delisle} % \affil[1]{School of Computer Science, University of Waterloo} % \affil[ ]{\textit {tdelisle@uwaterloo.ca}} \begin{document} %============================== \miniframesoff \begin{frame}[noframenumbering,plain] \titlepage \end{frame} %============================== \section{Introduction} %------------------------------ \begin{frame}[noframenumbering] \tableofcontents \end{frame} \miniframeson %============================== \AtBeginSection[]{ \miniframesoff \begin{frame} \vfill \centering \begin{beamercolorbox}[sep=8pt,center,shadow=false,rounded=false]{title} \usebeamerfont{title}\insertsectionhead\par% \end{beamercolorbox} \vfill \end{frame} \miniframeson } \section{\CFA and Concurrency} \begin{frame}{\CFA} \end{frame} %------------------------------ \begin{frame}{Concurrency in \CFA} User Level Threading \begin{itemize} \item M:N threading. \item User Level Context Switching causes kernel-threads to run a different user-thread. \end{itemize} ~\newline Threads organized in clusters: \begin{itemize} \item Clusters have their own kernel threads. \item Threads in a cluster are on run on the kernel threads of that cluster. \end{itemize} \end{frame} %------------------------------ \begin{frame}{Concurrency in \CFA} \begin{table} {\resizebox{1\textwidth}{!}{\input{system.dark.pstex_t}}} \end{table} \end{frame} %------------------------------ \begin{frame}{Scheduling goal for \CFA} {\large \begin{center} The \CFA scheduler should be \textit{viable} for any workload. \end{center} } ~\newline This implies: \begin{enumerate} \item Producing a scheduler with sufficient fairness guarantees. \item Handling kernel-threads running out of work. \item Handling blocking I/O operations. \end{enumerate} \end{frame} %============================== \section{Scheduling in Practice} %------------------------------ \begin{frame}{In the Wild} Schedulers found in production application generally fall into two categories: \newline \begin{itemize} \item Feedback Scheduling\newline \item Priority Scheduling (explicit or not)\newline \end{itemize} \end{frame} %------------------------------ \begin{frame}{Feedback Scheduling} Most operating systems based their scheduling on feedback loops. ~\newline The scheduler runs a thread and adjusts some metric to choose when to run it, e.g., least CPU time first. ~\newline Relies on the following assumptions: \begin{enumerate} \item Threads live long enough for useful feedback information to be to gathered. \item Threads belong to multiple users so fairness across threads is insufficient. \end{enumerate} \end{frame} %------------------------------ \begin{frame}{Priority Scheduling} \begin{center} {\large Runs all ready threads in group \textit{A} before any ready threads in group \textit{B}. } \end{center} \vspace{1em} Explicit priorities: \begin{itemize} \item Threads given a priority at creation, e.g., Thread A has priority 1, Thread B has priority 6. \end{itemize} \vspace{0.75em} Implicit priorities: \begin{itemize} \item Certain threads are preferred, based on various metrics, e.g., last run, last run on this CPU. \end{itemize} \end{frame} %------------------------------ \begin{frame}{Priority Scheduling: Work-Stealing} Work-Stealing is a very popular strategy. \begin{block}{Algorithm} \begin{enumerate} \item Each processor has a list of ready threads. \item Each processor runs threads from its ready queue first. \item If a processor's ready queue is empty, attempt to run threads from some other processor's ready queue. \end{enumerate} \end{block} ~ Work-Stealing has implicit priorities: For a given processor, threads on it's queue have higher priority. Processors begin busy for long periods can mean starvation. \end{frame} %============================== \section{Project: Proposal \& Details} %------------------------------ \begin{frame}{Central Ready-Queue} \CFA will have a single ready-queue per cluster. \newline The ready-queue will be sharded internally to reduce contention. \newline No strong coupling between internal queues and processors. \newline Constrasts with work-stealing which has a queue per processor. \newline \end{frame} %------------------------------ \begin{frame}{Central Ready-Queue} \begin{table} {\resizebox{0.8\textwidth}{!}{\input{base.dark.pstex_t}}} \end{table} ~ \end{frame} %------------------------------ \begin{frame}{Central Ready-Queue Challenges} \begin{columns} \begin{column}{0.55\textwidth} Semi-``Empty'' ready-queues means success rate of randomly guessing goes down. \end{column} \begin{column}{0.45\textwidth} \begin{table} {\resizebox{1\textwidth}{!}{\input{empty.dark.pstex_t}}} \end{table} \end{column} \end{columns} Possible solutions: \begin{itemize} \item Data structure tracking the work, can be dense or sparse, global or sharded. \item Add bias towards certain sub-queues. \end{itemize} \end{frame} %------------------------------ \begin{frame}{Dynamic Resizing} Processors can be added at anytime on a cluster. \newline The array of queues needs to be adjusted in consequence. \newline Solution: Global Reader-Writer lock \begin{itemize} \item Acquire for reading for normal scheduling operations. \item Acquire for right when resizing the array and creating/deleting internal queues. \end{itemize} \end{frame} %------------------------------ \begin{frame}{Idle Sleep} Processors which cannot find threads to run should sleep, using \texttt{pthread\_cond\_wait}, \texttt{sigwaitinfo}, etc. \newline Scheduling a thread may \textit{need} to wake sleeping processors. \begin{itemize} \item Threads can be scheduled from processors terminating or running outside the cluster. In this case, all processors on the cluster could be sleeping. \end{itemize} ~ If \textit{some} processors are sleeping, waking more may be wasteful. A heuristic for this case is outside the scope of this project. \end{frame} %------------------------------ \begin{frame}{Asynchronous I/O} \begin{itemize} \item I/O Operations should block user-threads rather than kernel-threads. \vspace{1cm} \item This requires 3 components: \begin{enumerate} \item an OS abstraction layer over the asynchronous interface, \vspace{0.2cm} \item an event-engine to (de)multiplex the operations, \vspace{0.2cm} \item and a synchronous interface for users to use. \vspace{0.2cm} \end{enumerate} \end{itemize} \end{frame} %------------------------------ \begin{frame}{Asynchronous I/O: OS Abstraction} \framesubtitle{\vskip0.5mm\large\texttt{select}} \vskip5mm {\large ``select() allows a program to monitor multiple file descriptors, waiting until one or more of the file descriptors become ``ready'' for some class of I/O operation.''} \hspace*\fill{\small--- Linux Programmer's Manual} \vskip5mm \begin{itemize} \item[+] moderate overhead per \texttt{syscall} \item[-] Relies on \texttt{syscall}s returning \texttt{EWOULDBLOCK}. \end{itemize} \end{frame} %------------------------------ \begin{frame}{Asynchronous I/O: OS Abstraction} \framesubtitle{\vskip0.5mm\large\texttt{epoll}} \vskip2mm More recent system call with a similar purpose. \vskip5mm \begin{itemize} \item[+] Smaller overhead per \texttt{syscall}. \item[+] Shown to work well for sockets. \item[-] Still relies on \texttt{syscall}s returning \texttt{EWOULDBLOCK}. \item[-] Does not support linux pipes and TTYs. \end{itemize} \end{frame} %------------------------------ \begin{frame}{Asynchronous I/O: OS Abstraction} \framesubtitle{\vskip0.5mm\large Kernel Threads} \vskip2mm Use a pool of kernel-threads, to which blocking calls are delegated. \vskip5mm \begin{itemize} \item Technique used by many existing systems, e.g., Go, libuv \item[+] Definitely works for all \texttt{syscall}s. \item[$-$] Can require many kernel threads. \end{itemize} \end{frame} %------------------------------ \begin{frame}{Asynchronous I/O: OS Abstraction} \framesubtitle{\vskip0.5mm\large\texttt{io\_uring}} \vskip2mm A very recent framework for asynchronous operations available in Linux 5.1 and later. Uses two ring buffers to submit operations and poll completions. \vskip5mm \begin{itemize} \item[+] Handles many \texttt{syscall}s. \item[+] Does \textit{not} rely on \texttt{syscall}s returning \texttt{EWOULDBLOCK}. \item[$-$] Requires synchronization on submission. \item[$-$] System call itself is serialized in the kernel. \end{itemize} \end{frame} %------------------------------ \begin{frame}{Asynchronous I/O: Event Engine} An event engine must be built to fit the chosen OS Abstraction. \newline The engine must park user-threads until operation is completed. \newline Depending on the chosen abstraction the engine may need to serialize operation submission. \newline Throughput and latency are important metrics. \end{frame} %------------------------------ \begin{frame}{Asynchronous I/O: The interface} The Asynchronous I/O needs an interface. \newline Several options to take into consideration: \begin{itemize} \item Adding to existing call interface, e.g., \texttt{read} and \texttt{cfaread}. \vspace{0.2cm} \item Replacing existing call interface. \vspace{0.2cm} \item True asynchronous interface, e.g., callbacks, futures. \vspace{0.2cm} \end{itemize} \end{frame} %============================== \section{Conclusion} %------------------------------ \begin{frame}{Summary} Runtime system and scheduling are still open topics. \newline This work offers a novel runtime and scheduling package. \newline Existing work only offers fragments that users must assemble themselves when possible. \end{frame} %------------------------------ \begin{frame}{Timeline} \begin{tabular}{ || m{0.1mm} m{0.75cm} m{1cm} | l } % \hline \phantom{100000cm} \phantom{100000cm} \phantom{100000cm} & {\small May Oct} & {\small 2020 2020} & Creation of the performance benchmark. \\ \hline \phantom{100000cm} \phantom{100000cm} \phantom{100000cm} & {\small Nov Mar} & {\small 2020 2021} & Completion of the implementation. \\ \hline \phantom{100000cm} \phantom{100000cm} \phantom{100000cm} & {\small Mar Apr} & {\small 2021 2021} & Final performance experiments. \\ \hline \phantom{100000cm} \phantom{100000cm} \phantom{100000cm} & {\small May Aug} & {\small 2021 2021} & Thesis writing and defense. \\ % \hline \end{tabular} \end{frame} %------------------------------ \begin{frame}{Timeline} \begin{center} {\large Questions?} \end{center} \end{frame} \end{document}