# source:doc/theses/thierry_delisle_PhD/comp_II/presentation.tex@33f3cfb

arm-ehenumforall-pointer-decayjacob/cs343-translationnew-ast-unique-expr
Last change on this file since 33f3cfb was 8d8ac3b, checked in by Thierry Delisle <tdelisle@…>, 22 months ago

Added first draft of phd defence presentation.
Ran spell checker on CompII.
Formatting changes

• Property mode set to 100644
File size: 10.7 KB
Line
1\documentclass[english,aspectratio=169,svgnames,notes=hide,14pt,xcolor={dvipsnames}]{beamer}
2\usepackage{graphicx}
3\usepackage{epic,eepic}
4\usepackage{presentationstyle}
5
6\title{The \CFA Scheduler}
7\subtitle{PhD Comprehensive II Research Proposal}
8\author{Thierry Delisle}
9% \affil[1]{School of Computer Science, University of Waterloo}
10% \affil[ ]{\textit {tdelisle@uwaterloo.ca}}
11
12\begin{document}
13%==============================
14\miniframesoff
16        \titlepage
17\end{frame}
18%==============================
19\section{Introduction}
20%------------------------------
22        \tableofcontents
23\end{frame}
24\miniframeson
25%==============================
26\AtBeginSection[]{
27        \miniframesoff
28        \begin{frame}
29        \vfill
30        \centering
33        \end{beamercolorbox}
34        \vfill
35        \end{frame}
36        \miniframeson
37}
38\section{\CFA and Concurrency}
39\begin{frame}{\CFA}
40
41\end{frame}
42%------------------------------
43\begin{frame}{Concurrency in \CFA}
45        \begin{itemize}
47                \item User Level Context Switching causes kernel-threads to run a different user-thread.
48        \end{itemize}
49        ~\newline
51        \begin{itemize}
52                \item Clusters have their own kernel threads.
53                \item Threads in a cluster are on run on the kernel threads of that cluster.
54        \end{itemize}
55\end{frame}
56%------------------------------
57\begin{frame}{Concurrency in \CFA}
58        \begin{table}
59                {\resizebox{1\textwidth}{!}{\input{system.dark.pstex_t}}}
60        \end{table}
61\end{frame}
62%------------------------------
63\begin{frame}{Scheduling goal for \CFA}
64        {\large
65                \begin{center}
66                        The \CFA scheduler should be \textit{viable} for any workload.
67                \end{center}
68        }
69        ~\newline
70        This implies:
71        \begin{enumerate}
72                \item Producing a scheduler with sufficient fairness guarantees.
73                \item Handling kernel-threads running out of work.
74                \item Handling blocking I/O operations.
75        \end{enumerate}
76\end{frame}
77%==============================
78\section{Scheduling in Practice}
79%------------------------------
80\begin{frame}{In the Wild}
81        Schedulers found in production application generally fall into two categories:
82        \newline
83
84        \begin{itemize}
85                \item Feedback Scheduling\newline
86                \item Priority Scheduling (explicit or not)\newline
87        \end{itemize}
88\end{frame}
89%------------------------------
90\begin{frame}{Feedback Scheduling}
91        Most operating systems based their scheduling on feedback loops.
92        ~\newline
93
94        The scheduler runs a thread and adjusts some metric to choose when to run it, e.g., least CPU time first.
95        ~\newline
96
97        Relies on the following assumptions:
98        \begin{enumerate}
99                \item Threads live long enough for useful feedback information to be to gathered.
100                \item Threads belong to multiple users so fairness across threads is insufficient.
101        \end{enumerate}
102\end{frame}
103%------------------------------
104\begin{frame}{Priority Scheduling}
105        \begin{center}
106        {\large
108                }
109        \end{center}
110        \vspace{1em}
111
112        Explicit priorities:
113        \begin{itemize}
114                \item Threads given a priority at creation, e.g., Thread A has priority 1, Thread B has priority 6.
115        \end{itemize}
116        \vspace{0.75em}
117
118        Implicit priorities:
119        \begin{itemize}
120                \item Certain threads are preferred, based on various metrics, e.g., last run, last run on this CPU.
121        \end{itemize}
122\end{frame}
123%------------------------------
124\begin{frame}{Priority Scheduling: Work-Stealing}
125        Work-Stealing is a very popular strategy.
126        \begin{block}{Algorithm}
127                \begin{enumerate}
130                        \item If a processor's ready queue is empty, attempt to run threads from some other processor's ready queue.
131                \end{enumerate}
132        \end{block}
133        ~
134
135        Work-Stealing has implicit priorities: For a given processor, threads on it's queue have higher priority.
136
137        Processors begin busy for long periods can mean starvation.
138\end{frame}
139%==============================
140\section{Project: Proposal \& Details}
141%------------------------------
143        \CFA will have a single ready-queue per cluster.
144        \newline
145
146        The ready-queue will be sharded internally to reduce contention.
147        \newline
148
149        No strong coupling between internal queues and processors.
150        \newline
151
152        Constrasts with work-stealing which has a queue per processor.
153        \newline
154\end{frame}
155%------------------------------
157        \begin{table}
158                {\resizebox{0.8\textwidth}{!}{\input{base.dark.pstex_t}}}
159        \end{table}
160        ~
161\end{frame}
162%------------------------------
164        \begin{columns}
165                \begin{column}{0.55\textwidth}
166                        Semi-Empty'' ready-queues means success rate of randomly guessing goes down.
167                \end{column}
168                \begin{column}{0.45\textwidth}
169                        \begin{table}
170                                {\resizebox{1\textwidth}{!}{\input{empty.dark.pstex_t}}}
171                        \end{table}
172                \end{column}
173        \end{columns}
174
175        Possible solutions:
176        \begin{itemize}
177                \item Data structure tracking the work, can be dense or sparse, global or sharded.
178                \item Add bias towards certain sub-queues.
179        \end{itemize}
180\end{frame}
181%------------------------------
182\begin{frame}{Dynamic Resizing}
183        Processors can be added at anytime on a cluster.
184        \newline
185
186        The array of queues needs to be adjusted in consequence.
187        \newline
188
190        \begin{itemize}
191                \item Acquire for reading for normal scheduling operations.
192                \item Acquire for right when resizing the array and creating/deleting internal queues.
193        \end{itemize}
194\end{frame}
195%------------------------------
196\begin{frame}{Idle Sleep}
197        Processors which cannot find threads to run should sleep, using \texttt{pthread\_cond\_wait}, \texttt{sigwaitinfo}, etc.
198        \newline
199
200        Scheduling a thread may \textit{need} to wake sleeping processors.
201        \begin{itemize}
202                \item Threads can be scheduled from processors terminating or running outside the cluster.
203                In this case, all processors on the cluster could be sleeping.
204        \end{itemize}
205        ~
206
207        If \textit{some} processors are sleeping, waking more may be wasteful.
208
209        A heuristic for this case is outside the scope of this project.
210\end{frame}
211%------------------------------
212\begin{frame}{Asynchronous I/O}
213        \begin{itemize}
215                \item This requires 3 components:
216                \begin{enumerate}
217                        \item an OS abstraction layer over the asynchronous interface,  \vspace{0.2cm}
218                        \item an event-engine to (de)multiplex the operations,  \vspace{0.2cm}
219                        \item and a synchronous interface for users to use.  \vspace{0.2cm}
220                \end{enumerate}
221        \end{itemize}
222\end{frame}
223%------------------------------
224\begin{frame}{Asynchronous I/O: OS Abstraction}
225        \framesubtitle{\vskip0.5mm\large\texttt{select}}
226        \vskip5mm
227
228        {\large select() allows a program to monitor multiple file descriptors, waiting until one
229       or more of the file descriptors become ready'' for some class of I/O operation.''}
230
231        \hspace*\fill{\small--- Linux Programmer's Manual}
232
233        \vskip5mm
234        \begin{itemize}
235                \item[+] moderate overhead per \texttt{syscall}
236                \item[-] Relies on \texttt{syscall}s returning \texttt{EWOULDBLOCK}.
237        \end{itemize}
238\end{frame}
239%------------------------------
240\begin{frame}{Asynchronous I/O: OS Abstraction}
241        \framesubtitle{\vskip0.5mm\large\texttt{epoll}}
242        \vskip2mm
243
244        More recent system call with a similar purpose.
245
246        \vskip5mm
247        \begin{itemize}
248                \item[+] Smaller overhead per \texttt{syscall}.
249                \item[+] Shown to work well for sockets.
250                \item[-] Still relies on \texttt{syscall}s returning \texttt{EWOULDBLOCK}.
251                \item[-] Does not support linux pipes and TTYs.
252        \end{itemize}
253\end{frame}
254%------------------------------
255\begin{frame}{Asynchronous I/O: OS Abstraction}
257        \vskip2mm
258
259        Use a pool of kernel-threads, to which blocking calls are delegated.
260
261        \vskip5mm
262        \begin{itemize}
263                \item Technique used by many existing systems, e.g., Go, libuv
264                \item[+] Definitely works for all \texttt{syscall}s.
265                \item[$-$] Can require many kernel threads.
266        \end{itemize}
267\end{frame}
268%------------------------------
269\begin{frame}{Asynchronous I/O: OS Abstraction}
270        \framesubtitle{\vskip0.5mm\large\texttt{io\_uring}}
271        \vskip2mm
272
273        A very recent framework for asynchronous operations available in Linux 5.1 and later.
274        Uses two ring buffers to submit operations and poll completions.
275
276        \vskip5mm
277        \begin{itemize}
278                \item[+] Handles many \texttt{syscall}s.
279                \item[+] Does \textit{not} rely on \texttt{syscall}s returning \texttt{EWOULDBLOCK}.
280                \item[$-$] Requires synchronization on submission.
281                \item[$-$] System call itself is serialized in the kernel.
282        \end{itemize}
283\end{frame}
284%------------------------------
285\begin{frame}{Asynchronous I/O: Event Engine}
286        An event engine must be built to fit the chosen OS Abstraction.
287        \newline
288
289        The engine must park user-threads until operation is completed.
290        \newline
291
292        Depending on the chosen abstraction the engine may need to serialize operation submission.
293        \newline
294
295        Throughput and latency are important metrics.
296\end{frame}
297%------------------------------
298\begin{frame}{Asynchronous I/O: The interface}
299        The Asynchronous I/O needs an interface.
300        \newline
301
302        Several options to take into consideration:
303        \begin{itemize}
305                \item Replacing existing call interface.  \vspace{0.2cm}
306                \item True asynchronous interface, e.g., callbacks, futures.  \vspace{0.2cm}
307        \end{itemize}
308
309\end{frame}
310%==============================
311\section{Conclusion}
312%------------------------------
313\begin{frame}{Summary}
314        Runtime system and scheduling are still open topics.
315        \newline
316
317        This work offers a novel runtime and scheduling package.
318        \newline
319
320        Existing work only offers fragments that users must assemble themselves when possible.
321\end{frame}
322%------------------------------
323\begin{frame}{Timeline}
324        \begin{tabular}{ || m{0.1mm} m{0.75cm} m{1cm} | l }
325                % \hline
326                \phantom{100000cm} \phantom{100000cm} \phantom{100000cm} & {\small May Oct} & {\small 2020 2020} & Creation of the performance benchmark. \\
327                \hline
328                \phantom{100000cm} \phantom{100000cm} \phantom{100000cm} & {\small Nov Mar} & {\small 2020 2021} & Completion of the implementation. \\
329                \hline
330                \phantom{100000cm} \phantom{100000cm} \phantom{100000cm} & {\small Mar Apr} & {\small 2021 2021} & Final performance experiments. \\
331                \hline
332                \phantom{100000cm} \phantom{100000cm} \phantom{100000cm} & {\small May Aug} & {\small 2021 2021} & Thesis writing and defense. \\
333                % \hline
334        \end{tabular}
335\end{frame}
336
337%------------------------------
338\begin{frame}{Timeline}
339        \begin{center}
340                {\large Questions?}
341        \end{center}
342\end{frame}
343\end{document}
Note: See TracBrowser for help on using the repository browser.