source: doc/theses/thierry_delisle_PhD/comp_II/presentation.tex @ 7f5683e

ADTarm-ehast-experimentalenumforall-pointer-decayjacob/cs343-translationnew-ast-unique-exprpthread-emulationqualifiedEnum
Last change on this file since 7f5683e was 33c3ded, checked in by Thierry Delisle <tdelisle@…>, 4 years ago

Final presented version

  • Property mode set to 100644
File size: 11.8 KB
Line 
1\documentclass[english,aspectratio=169,svgnames,notes=hide,14pt,xcolor={dvipsnames}]{beamer}
2\usepackage{graphicx}
3\usepackage{epic,eepic}
4\usepackage{presentationstyle}
5
6\title{The \CFA Scheduler}
7\subtitle{PhD Comprehensive II Research Proposal}
8\author{Thierry Delisle}
9% \affil[1]{School of Computer Science, University of Waterloo}
10% \affil[ ]{\textit {tdelisle@uwaterloo.ca}}
11
12\begin{document}
13%==============================
14\miniframesoff
15\begin{frame}[noframenumbering,plain]
16        \titlepage
17\end{frame}
18%==============================
19\section{Introduction}
20%------------------------------
21\begin{frame}[noframenumbering]
22        \tableofcontents
23\end{frame}
24\miniframeson
25%==============================
26\AtBeginSection[]{
27        \miniframesoff
28        \begin{frame}
29        \vfill
30        \centering
31        \begin{beamercolorbox}[sep=8pt,center,shadow=false,rounded=false]{title}
32                \usebeamerfont{title}\insertsectionhead\par%
33        \end{beamercolorbox}
34        \vfill
35        \end{frame}
36        \miniframeson
37}
38\section{Concurrency and \CFA}
39\begin{frame}{Project}
40        \begin{center}
41                {\large Produce a scheduler for \CFA that is simple for programmers to understand and offers good general performance.}
42        \end{center}
43\end{frame}
44%------------------------------
45\begin{frame}{\CFA}
46        \CFA is a modern extension of C.
47        It adds to C : overloading, constructors/destructors, polymorphism, and much more.
48
49        ~\newline
50        For this project, the relevant aspects are:
51        \begin{itemize}
52                \item Fast and safe system language.
53                \item Threading.
54                \item Manual memory management.
55        \end{itemize}
56
57\end{frame}
58%------------------------------
59\begin{frame}{Concurrency in \CFA}
60        User Level Threading
61        \begin{itemize}
62                \item M:N threading.
63                \item User Level Context Switching causes kernel-threads to run a different user-thread.
64        \end{itemize}
65        ~\newline
66        Threads organized in clusters:
67        \begin{itemize}
68                \item Clusters have their own kernel threads.
69                \item Threads in a cluster are on run on the kernel threads of that cluster.
70        \end{itemize}
71\end{frame}
72%------------------------------
73\begin{frame}{Concurrency in \CFA}
74        \begin{table}
75                {\resizebox{1\textwidth}{!}{\input{system.dark.pstex_t}}}
76        \end{table}
77\end{frame}
78%------------------------------
79\begin{frame}{Scheduling goal for \CFA}
80        {\large
81                \begin{center}
82                        The \CFA scheduler should be \textit{viable} for any workload.
83                \end{center}
84        }
85        ~\newline
86        This implies:
87        \begin{enumerate}
88                \item Producing a scheduler with sufficient fairness guarantees.
89                \item Handling kernel-threads running out of work.
90                \item Handling blocking I/O operations.
91        \end{enumerate}
92\end{frame}
93%==============================
94\section{Scheduling in Practice}
95%------------------------------
96\begin{frame}{In the Wild}
97        Schedulers found in production application generally fall into two categories:
98        \newline
99
100        \begin{itemize}
101                \item Feedback Scheduling\newline
102                \item Priority Scheduling (explicit or not)\newline
103        \end{itemize}
104\end{frame}
105%------------------------------
106\begin{frame}{Feedback Scheduling}
107        Most operating systems based their scheduling on feedback loops.
108        ~\newline
109
110        The scheduler runs a thread and adjusts some metric to choose when to run it, e.g., least CPU time first.
111        ~\newline
112
113        Relies on the following assumptions:
114        \begin{enumerate}
115                \item Threads live long enough for useful feedback information to be to gathered.
116                \item Threads belong to multiple users so fairness across threads is insufficient.
117        \end{enumerate}
118\end{frame}
119%------------------------------
120\begin{frame}{Priority Scheduling}
121        \begin{center}
122                {\large
123                        Runs all ready threads in group \textit{A} before any ready threads in group \textit{B}.
124                }
125        \end{center}
126        \vspace{1em}
127
128        Explicit priorities:
129        \begin{itemize}
130                \item Threads given a priority at creation, e.g., Thread A has priority 1, Thread B has priority 6.
131        \end{itemize}
132        \vspace{0.75em}
133
134        Implicit priorities:
135        \begin{itemize}
136                \item Certain threads are preferred, based on various metrics, e.g., last run, last run on this CPU.
137        \end{itemize}
138\end{frame}
139%------------------------------
140\begin{frame}{Priority Scheduling: Work-Stealing}
141        Work-Stealing is a very popular strategy.
142        \begin{block}{Algorithm}
143                \begin{enumerate}
144                        \item Each processor has a list of ready threads.
145                        \item Each processor runs threads from its ready queue first.
146                        \item If a processor's ready queue is empty, attempt to run threads from some other processor's ready queue.
147                \end{enumerate}
148        \end{block}
149        ~
150
151        Work-Stealing has implicit priorities: For a given processor, threads on it's queue have higher priority.
152
153        Processors begin busy for long periods can mean starvation.
154\end{frame}
155%------------------------------
156\begin{frame}{Scheduling in Practice: Summary}
157        \begin{columns}
158                \begin{column}{0.5\textwidth}
159                        \textbf{Feedback Scheduling}
160                        \newline
161
162                        \begin{itemize}
163                                \item Inappropriate for short lived threads.
164                                \item Overkill for cooperating threads.\newline
165                        \end{itemize}
166                \end{column}
167                \begin{column}{0.5\textwidth}
168                        \textbf{Priority Scheduling}
169                        \newline
170
171                        \begin{itemize}
172                                \item Allows lasting starvation.\newline
173                                \item Hard to reason about.\newline~\newline
174                        \end{itemize}
175                \end{column}
176        \end{columns}
177
178        ~\newline
179        ~\newline
180        \CFA would benefit from something different.
181\end{frame}
182%==============================
183\section{Project: Proposal \& Details}
184%------------------------------
185\begin{frame}{Central Ready-Queue}
186        \CFA will have a single ready-queue per cluster.
187        \newline
188
189        The ready-queue will be sharded internally to reduce contention.
190        \newline
191
192        No strong coupling between internal queues and processors.
193        \newline
194
195        Constrasts with work-stealing which has a queue per processor.
196        \newline
197\end{frame}
198%------------------------------
199\begin{frame}{Central Ready-Queue}
200        \begin{table}
201                {\resizebox{0.8\textwidth}{!}{\input{base.dark.pstex_t}}}
202        \end{table}
203        ~
204\end{frame}
205%------------------------------
206\begin{frame}{Central Ready-Queue Challenges}
207        \begin{columns}
208                \begin{column}{0.55\textwidth}
209                        Semi-``Empty'' ready-queues means success rate of randomly guessing goes down.
210                \end{column}
211                \begin{column}{0.45\textwidth}
212                        \begin{table}
213                                {\resizebox{1\textwidth}{!}{\input{empty.dark.pstex_t}}}
214                        \end{table}
215                \end{column}
216        \end{columns}
217
218        Possible solutions:
219        \begin{itemize}
220                \item Data structure tracking the work, can be dense or sparse, global or sharded.
221                \item Add bias towards certain sub-queues.
222        \end{itemize}
223\end{frame}
224%------------------------------
225\begin{frame}{Dynamic Resizing}
226        Processors can be added at anytime on a cluster.
227        \newline
228
229        The array of queues needs to be adjusted in consequence.
230        \newline
231
232        Solution: Global Reader-Writer lock
233        \begin{itemize}
234                \item Acquire for reading for normal scheduling operations.
235                \item Acquire for writing when resizing the array and creating/deleting internal queues.
236        \end{itemize}
237\end{frame}
238%------------------------------
239\begin{frame}{Idle Sleep}
240        Processors which cannot find threads to run should sleep, using \texttt{pthread\_cond\_wait}, \texttt{sigwaitinfo}, etc.
241        \newline
242
243        Scheduling a thread may \textit{need} to wake sleeping processors.
244        \begin{itemize}
245                \item Threads can be scheduled from processors terminating or running outside the cluster.
246                In this case, all processors on the cluster could be sleeping.
247        \end{itemize}
248        ~
249
250        If \textit{some} processors are sleeping, waking more may be wasteful.
251
252        A heuristic for this case is outside the scope of this project.
253\end{frame}
254%------------------------------
255\begin{frame}{Asynchronous I/O}
256        \begin{itemize}
257                \item I/O Operations should block user-threads rather than kernel-threads. \vspace{1cm}
258                \item This requires 3 components:
259                \begin{enumerate}
260                        \item an OS abstraction layer over the asynchronous interface,  \vspace{0.2cm}
261                        \item an event-engine to (de)multiplex the operations,  \vspace{0.2cm}
262                        \item and a synchronous interface for users to use.  \vspace{0.2cm}
263                \end{enumerate}
264        \end{itemize}
265\end{frame}
266%------------------------------
267\begin{frame}{Asynchronous I/O: OS Abstraction}
268        \framesubtitle{\vskip0.5mm\large\texttt{select}}
269        \vskip5mm
270
271        {\large ``select() allows a program to monitor multiple file descriptors, waiting until one
272       or more of the file descriptors become ``ready'' for some class of I/O operation.''}
273
274        \hspace*\fill{\small--- Linux Programmer's Manual}
275
276        \vskip5mm
277        \begin{itemize}
278                \item[+] moderate overhead per \texttt{syscall}
279                \item[-] Relies on \texttt{syscall}s returning \texttt{EWOULDBLOCK}.
280        \end{itemize}
281\end{frame}
282%------------------------------
283\begin{frame}{Asynchronous I/O: OS Abstraction}
284        \framesubtitle{\vskip0.5mm\large\texttt{epoll}}
285        \vskip2mm
286
287        More recent system call with a similar purpose.
288
289        \vskip5mm
290        \begin{itemize}
291                \item[+] Smaller overhead per \texttt{syscall}.
292                \item[+] Shown to work well for sockets.
293                \item[-] Still relies on \texttt{syscall}s returning \texttt{EWOULDBLOCK}.
294                \item[-] Does not support linux pipes and TTYs.
295        \end{itemize}
296\end{frame}
297%------------------------------
298\begin{frame}{Asynchronous I/O: OS Abstraction}
299        \framesubtitle{\vskip0.5mm\large Kernel Threads}
300        \vskip2mm
301
302        Use a pool of kernel-threads, to which blocking calls are delegated.
303
304        \vskip5mm
305        \begin{itemize}
306                \item Technique used by many existing systems, e.g., Go, libuv
307                \item[+] Definitely works for all \texttt{syscall}s.
308                \item[$-$] Can require many kernel threads.
309        \end{itemize}
310\end{frame}
311%------------------------------
312\begin{frame}{Asynchronous I/O: OS Abstraction}
313        \framesubtitle{\vskip0.5mm\large\texttt{io\_uring}}
314        \vskip2mm
315
316        A very recent framework for asynchronous operations available in Linux 5.1 and later.
317        Uses two ring buffers to submit operations and poll completions.
318
319        \vskip5mm
320        \begin{itemize}
321                \item[+] Handles many \texttt{syscall}s.
322                \item[+] Does \textit{not} rely on \texttt{syscall}s returning \texttt{EWOULDBLOCK}.
323                \item[$-$] Requires synchronization on submission.
324                \item[$-$] System call itself is serialized in the kernel.
325        \end{itemize}
326\end{frame}
327%------------------------------
328\begin{frame}{Asynchronous I/O: Event Engine}
329        An event engine must be built to fit the chosen OS Abstraction.
330        \newline
331
332        The engine must park user-threads until operation is completed.
333        \newline
334
335        Depending on the chosen abstraction the engine may need to serialize operation submission.
336        \newline
337
338        Throughput and latency are important metrics.
339\end{frame}
340%------------------------------
341\begin{frame}{Asynchronous I/O: The interface}
342        The Asynchronous I/O needs an interface.
343        \newline
344
345        Several options to take into consideration:
346        \begin{itemize}
347                \item Adding to existing call interface, e.g., \texttt{read} and \texttt{cfaread}\vspace{0.2cm}
348                \item Replacing existing call interface.  \vspace{0.2cm}
349                \item True asynchronous interface, e.g., callbacks, futures.  \vspace{0.2cm}
350        \end{itemize}
351
352\end{frame}
353%==============================
354\section{Conclusion}
355%------------------------------
356\begin{frame}{Summary}
357        Runtime system and scheduling are still open topics.
358        \newline
359        \newline
360
361        This work offers a novel runtime and scheduling package.
362        \newline
363        \newline
364
365        Existing work only offers fragments that users must assemble themselves when possible.
366\end{frame}
367%------------------------------
368\begin{frame}{Timeline}
369        \begin{tabular}{ || m{0.1mm} m{0.75cm} m{1cm} | l }
370                % \hline
371                \phantom{100000cm} \phantom{100000cm} \phantom{100000cm} & {\small May Oct} & {\small 2020 2020} & Creation of the performance benchmark. \\
372                \hline
373                \phantom{100000cm} \phantom{100000cm} \phantom{100000cm} & {\small Nov Mar} & {\small 2020 2021} & Completion of the implementation. \\
374                \hline
375                \phantom{100000cm} \phantom{100000cm} \phantom{100000cm} & {\small Mar Apr} & {\small 2021 2021} & Final performance experiments. \\
376                \hline
377                \phantom{100000cm} \phantom{100000cm} \phantom{100000cm} & {\small May Aug} & {\small 2021 2021} & Thesis writing and defense. \\
378                % \hline
379        \end{tabular}
380\end{frame}
381
382%------------------------------
383\begin{frame}{}
384        \begin{center}
385                {\large Questions?}
386        \end{center}
387\end{frame}
388\end{document}
Note: See TracBrowser for help on using the repository browser.