Changeset 2ae16219 for doc/papers/concurrency/Paper.tex
- Timestamp:
- Apr 19, 2018, 11:04:47 AM
- Branches:
- ADT, aaron-thesis, arm-eh, ast-experimental, cleanup-dtors, deferred_resn, demangler, enum, forall-pointer-decay, jacob/cs343-translation, jenkins-sandbox, master, new-ast, new-ast-unique-expr, new-env, no_list, persistent-indexer, pthread-emulation, qualifiedEnum, with_gc
- Children:
- c28bcc7
- Parents:
- b2da0574 (diff), 61323a7 (diff)
Note: this is a merge changeset; the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.
- File:
- 1 edited
doc/papers/concurrency/Paper.tex
b2da0574 → 2ae16219

% Latex packages used in the document.

\usepackage{epic,eepic}
\usepackage{xspace}
…
\usepackage{upquote}	% switch curled `'" to straight
\usepackage{listings}	% format program code
\usepackage[labelformat=simple,aboveskip=0pt,farskip=0pt]{subfig}
\renewcommand{\thesubfigure}{(\alph{subfigure})}
\usepackage{siunitx}
\sisetup{ binary-units=true }

\hypersetup{breaklinks=true}
…
\renewcommand{\linenumberfont}{\scriptsize\sffamily}

\renewcommand{\textfraction}{0.0}	% the entire page may be devoted to floats with no text on the page at all

\lefthyphenmin=3	% hyphen only after 3 characters
\righthyphenmin=3

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
…
% Latin abbreviation
\newcommand{\abbrevFont}{\textit}	% set empty for no italics
\@ifundefined{eg}{
\newcommand{\EG}{\abbrevFont{e}.\abbrevFont{g}.}
\newcommand*{\eg}{%
…
	{\@ifnextchar{:}{\EG}%
		{\EG,\xspace}}%
}}{}%
\@ifundefined{ie}{
\newcommand{\IE}{\abbrevFont{i}.\abbrevFont{e}.}
\newcommand*{\ie}{%
…
	{\@ifnextchar{:}{\IE}%
		{\IE,\xspace}}%
}}{}%
\@ifundefined{etc}{
\newcommand{\ETC}{\abbrevFont{etc}}
\newcommand*{\etc}{%
	\@ifnextchar{.}{\ETC}%
	{\ETC.\xspace}%
}}{}%
\@ifundefined{etal}{
\newcommand{\ETAL}{\abbrevFont{et}~\abbrevFont{al}}
\newcommand*{\etal}{%
	\@ifnextchar{.}{\protect\ETAL}%
	{\protect\ETAL.\xspace}%
}}{}%
\@ifundefined{viz}{
\newcommand{\VIZ}{\abbrevFont{viz}}
\newcommand*{\viz}{%
	\@ifnextchar{.}{\VIZ}%
	{\VIZ.\xspace}%
}}{}%
\makeatother
…
\lstdefinelanguage{CFA}[ANSI]{C}{
	morekeywords={
		_Alignas, _Alignof, __alignof, __alignof__, asm, __asm, __asm__, __attribute, __attribute__,
		auto, _Bool, catch, catchResume, choose, _Complex, __complex, __complex__, __const, __const__,
		coroutine, disable, dtype, enable, __extension__, exception, fallthrough, fallthru, finally,
		__float80, float80, __float128, float128, forall, ftype, _Generic, _Imaginary, __imag, __imag__,
		inline, __inline, __inline__, __int128, int128, __label__, monitor, mutex, _Noreturn, one_t, or,
		otype, restrict, __restrict, __restrict__, __signed, __signed__, _Static_assert, thread,
		_Thread_local, throw, throwResume, timeout, trait, try, ttype, typeof, __typeof, __typeof__,
		virtual, __volatile, __volatile__, waitfor, when, with, zero_t},
	moredirectives={defined,include_next}%
}
…
\authormark{Thierry Delisle \textsc{et al}}

\address[1]{\orgdiv{Cheriton School of Computer Science}, \orgname{University of Waterloo}, \orgaddress{\state{Ontario}, \country{Canada}}}

\corres{*Peter A. Buhr, \email{pabuhr{\char`\@}uwaterloo.ca}}
\presentaddress{Cheriton School of Computer Science, University of Waterloo, Waterloo, ON, N2L 3G1, Canada}

…

\keywords{concurrency, parallelism, coroutines, threads, monitors, runtime, C, Cforall}

…

% ======================================================================

This paper provides a minimal concurrency \newterm{Application Program Interface} (API) that is simple, efficient and can be used to build other concurrency features.
While the simplest concurrency system is a thread and a lock, this low-level approach is hard to master.
An easier approach for programmers is to support higher-level constructs as the basis of concurrency.
…
Examples of high-level approaches are task based~\cite{TBB}, message passing~\cite{Erlang,MPI}, and implicit threading~\cite{OpenMP}.

This paper uses the following terminology.
A \newterm{thread} is a fundamental unit of execution that runs a sequence of code and requires a stack to maintain state.
Multiple simultaneous threads give rise to \newterm{concurrency}, which requires locking to ensure safe communication and access to shared data.
% Correspondingly, concurrency is defined as the concepts and challenges that occur when multiple independent (sharing memory, timing dependencies, \etc) concurrent threads are introduced.
\newterm{Locking}, and by extension locks, are defined as a mechanism to prevent progress of threads to provide safety.
\newterm{Parallelism} is running multiple threads simultaneously.
…

Hence, there are two problems to be solved in the design of concurrency for a programming language: concurrency and parallelism.
While these two concepts are often combined, they are in fact distinct, requiring different tools~\cite[\S~2]{Buhr05a}.
Concurrency tools handle synchronization and mutual exclusion, while parallelism tools handle performance, cost and resource utilization.
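To make the distinction concrete, the following sketch contrasts the two kinds of tools using \CFA constructs introduced later in the paper; the @Counter@/@inc@ names are illustrative only:
\begin{cfa}
monitor Counter { int count; };					$\C{// concurrency tool: monitor provides mutual exclusion}$
void inc( Counter & mutex c ) { c.count += 1; }	$\C{// safe against simultaneous callers}$
int main() {
	processor p[3];				$\C{// parallelism tool: add kernel threads, affecting only performance}$
	// ... user threads sharing a Counter and calling inc ...
}
\end{cfa}
Removing the @processor@ declarations changes how fast the program runs, not whether @inc@ is safe; removing @mutex@ breaks safety regardless of the processor count.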
The proposed concurrency API is implemented in a dialect of C, called \CFA.
…
Like C, the basics of \CFA revolve around structures and routines, which are thin abstractions over machine code.
The vast majority of the code produced by the \CFA translator respects memory layouts and calling conventions laid out by C.
Interestingly, while \CFA is not an object-oriented language, lacking the concept of a receiver (\eg {\tt this}), it does have some notion of objects\footnote{C defines the term objects as: ``region of data storage in the execution environment, the contents of which can represent values''~\cite[3.15]{C11}}, most importantly construction and destruction of objects.
Most of the following code examples can be found on the \CFA website~\cite{Cforall}.
…

\subsection{Operators}
Overloading also extends to operators.
The syntax for denoting operator-overloading is to name a routine with the symbol of the operator and question marks where the arguments of the operation appear, \eg:
\begin{cfa}
int ++? (int op);			$\C{// unary prefix increment}$
…

Note that the type used for assertions can be either an @otype@ or a @dtype@.
Types declared as @otype@ refer to ``complete'' objects, \ie objects with a size, a default constructor, a copy constructor, a destructor and an assignment operator.
Using @dtype@, on the other hand, has none of these assumptions but is extremely restrictive: it only guarantees the object is addressable.
…

% ======================================================================
% ======================================================================

At its core, concurrency is based on having multiple call-stacks and scheduling among threads of execution that execute on these stacks.
Multiple call stacks (or contexts) and a single thread of execution do \emph{not} imply concurrency.
Execution with a single thread and multiple stacks where the thread is deterministically self-scheduling across the stacks is called \newterm{coroutining};
execution with a single thread and multiple stacks but where the thread is scheduled by an oracle (non-deterministic from the thread's perspective) across the stacks is called concurrency~\cite[\S~3]{Buhr05a}.
Therefore, a minimal concurrency system can be achieved using coroutines (see Section \ref{coroutine}), which instead of context-switching among each other, always defer to an oracle for where to context-switch next.

While coroutines can execute on the caller's stack-frame, stack-full coroutines allow full generality and are sufficient as the basis for concurrency.
The aforementioned oracle is a scheduler and the whole system now follows a cooperative threading-model (a.k.a. non-preemptive scheduling).
…

\subsection{\protect\CFA's Thread Building Blocks}

One of the important features missing in C is threading\footnote{While the C11 standard defines a ``threads.h'' header, it is minimal and defined as optional.
…

\subsection{Coroutines: A Stepping Stone}\label{coroutine}

While the focus of this proposal is concurrency and parallelism, it is important to address coroutines, which are a significant building block of a concurrency system.
\newterm{Coroutine}s are generalized routines with points where execution is suspended and resumed at a later time.
Suspend/resume is a context switch, and coroutines have other context-management operations.
Many design challenges of threads are partially present in designing coroutines, which makes the design effort relevant.
The core \textbf{api} of coroutines has two features: independent call-stacks and @suspend@/@resume@.

A coroutine handles the class of problems that need to retain state between calls (\eg plugin, device driver, finite-state machine).
For example, a problem made easier with coroutines is unbounded generators, \eg generating an infinite sequence of Fibonacci numbers:
\begin{displaymath}
f(n) = \left \{
\begin{array}{ll}
0 & n = 0 \\
1 & n = 1 \\
f(n-1) + f(n-2) & n \ge 2 \\
\end{array}
\right.
\end{displaymath}
Figure~\ref{f:C-fibonacci} shows conventional approaches for writing a Fibonacci generator in C.
Figure~\ref{f:GlobalVariables} illustrates the following problems:
unencapsulated global variables necessary to retain state between calls;
only one Fibonacci generator can run at a time;
execution state must be explicitly retained.
Figure~\ref{f:ExternalState} addresses these issues:
unencapsulated program global variables become encapsulated structure variables;
multiple Fibonacci generators can run at a time by declaring multiple Fibonacci objects;
explicit execution state is removed by precomputing the first two Fibonacci numbers and returning $f(n-2)$.

\begin{figure}
\centering
\newbox\myboxA
\begin{lrbox}{\myboxA}
\begin{lstlisting}[aboveskip=0pt,belowskip=0pt]
`int f1, f2, state = 1;`	// single global variables
int fib() {
	int fn;
	`switch ( state )` {	// explicit execution state
	  case 1: fn = 0;  f1 = fn;  state = 2;  break;
	  case 2: fn = 1;  f2 = f1;  f1 = fn;  state = 3;  break;
	  case 3: fn = f1 + f2;  f2 = f1;  f1 = fn;  break;
	}
	return fn;
}
int main() {
	for ( int i = 0; i < 10; i += 1 ) {
		printf( "%d\n", fib() );
	}
}
\end{lstlisting}
\end{lrbox}

\newbox\myboxB
\begin{lrbox}{\myboxB}
\begin{lstlisting}[aboveskip=0pt,belowskip=0pt]
#define FIB_INIT `{ 0, 1 }`
typedef struct { int f2, f1; } Fib;
int fib( Fib * f ) {
	int ret = f->f2;
	int fn = f->f1 + f->f2;
	f->f2 = f->f1;  f->f1 = fn;
	return ret;
}
int main() {
	Fib f1 = FIB_INIT, f2 = FIB_INIT;
	for ( int i = 0; i < 10; i += 1 ) {
		printf( "%d %d\n", fib( &f1 ), fib( &f2 ) );
	}
}
\end{lstlisting}
\end{lrbox}

\subfloat[3 States: global variables]{\label{f:GlobalVariables}\usebox\myboxA}
\qquad
\subfloat[1 State: external variables]{\label{f:ExternalState}\usebox\myboxB}
\caption{C Fibonacci Implementations}
\label{f:C-fibonacci}

\bigskip

\newbox\myboxA
\begin{lrbox}{\myboxA}
\begin{lstlisting}[aboveskip=0pt,belowskip=0pt]
`coroutine` Fib { int fn; };
void main( Fib & f ) with( f ) {
	int f1, f2;
	fn = 0;  f1 = fn;  `suspend()`;
	fn = 1;  f2 = f1;  f1 = fn;  `suspend()`;
	for ( ;; ) {
		fn = f1 + f2;  f2 = f1;  f1 = fn;  `suspend()`;
	}
}
int next( Fib & fib ) with( fib ) {
	`resume( fib );`
	return fn;
}
int main() {
	Fib f1, f2;
	for ( int i = 1; i <= 10; i += 1 ) {
		sout | next( f1 ) | next( f2 ) | endl;
	}
}
\end{lstlisting}
\end{lrbox}
\newbox\myboxB
\begin{lrbox}{\myboxB}
\begin{lstlisting}[aboveskip=0pt,belowskip=0pt]
`coroutine` Fib { int ret; };
void main( Fib & f ) with( f ) {
	int fn, f1 = 1, f2 = 0;
	for ( ;; ) {
		ret = f2;
		fn = f1 + f2;  f2 = f1;  f1 = fn;  `suspend();`
	}
}
int next( Fib & fib ) with( fib ) {
	`resume( fib );`
	return ret;
}
\end{lstlisting}
\end{lrbox}
\subfloat[3 States, internal variables]{\label{f:Coroutine3States}\usebox\myboxA}
\qquad
\subfloat[1 State, internal variables]{\label{f:Coroutine1State}\usebox\myboxB}
\caption{\CFA Coroutine Fibonacci Implementations}
\label{f:fibonacci-cfa}
\end{figure}

Figure~\ref{f:Coroutine3States} creates a @coroutine@ type, which provides communication for multiple interface functions, and the \newterm{coroutine main}, which runs on the coroutine stack.
\begin{cfa}
`coroutine C { char c; int i; _Bool s; };`		$\C{// used for communication}$
void ?{}( C & c ) with( c ) { s = false; }		$\C{// constructor}$
void main( C & cor ) with( cor ) {				$\C{// actual coroutine}$
	while ( ! s ) // process c
		if ( v == ... ) s = false;
}
// interface functions
char cont( C & cor, char ch ) { c = ch; resume( cor ); return c; }
_Bool stop( C & cor, int v ) { s = true; i = v; resume( cor ); return s; }
\end{cfa}

Figure~\ref{f:fibonacci-cfa} encapsulates the Fibonacci state in a coroutine, where the coroutine stack holds sufficient state for the next generation.
This solution has the advantage of having very strong decoupling between how the sequence is generated and how it is used.
Indeed, this version is as easy to use as the external-state solution in Figure~\ref{f:ExternalState}, while the implementation is very similar to the global-variable example in Figure~\ref{f:GlobalVariables}.

Figure~\ref{f:fmt-line} shows the @Format@ coroutine for restructuring text into groups of character blocks of fixed size.
The example takes advantage of resuming coroutines in the constructor to simplify the code and highlights the idea that interesting control flow can occur in the constructor.
\begin{figure}
\centering
\begin{cfa}
`coroutine` Format {
	char ch;	$\C{// used for communication}$
	int g, b;	$\C{// global because used in destructor}$
};
void ?{}( Format & fmt ) { `resume( fmt );` }	$\C{// prime (start) coroutine}$
void ^?{}( Format & fmt ) with( fmt ) { if ( g != 0 || b != 0 ) sout | endl; }
void main( Format & fmt ) with( fmt ) {
	for ( ;; ) {	$\C{// for as many characters}$
		for ( g = 0; g < 5; g += 1 ) {	$\C{// groups of 5 blocks}$
			for ( b = 0; b < 4; b += 1 ) {	$\C{// blocks of 4 characters}$
				`suspend();`
				sout | ch;	$\C{// print character}$
			}
			sout | " ";	$\C{// print block separator}$
		}
		sout | endl;	$\C{// print group separator}$
	}
}
void prt( Format & fmt, char ch ) {
	fmt.ch = ch;
	`resume( fmt );`
}
int main() {
	Format fmt;
	char ch;
	for ( ;; ) {	$\C{// read until end of file}$
		sin | ch;	$\C{// read one character}$
		if ( eof( sin ) ) break;	$\C{// eof ?}$
		prt( fmt, ch );	$\C{// push character for formatting}$
	}
}
\end{cfa}
\caption{Formatting text into lines of 5 blocks of 4 characters.}
\label{f:fmt-line}
\end{figure}

\begin{figure}
\centering
\lstset{language=CFA,escapechar={},moredelim=**[is][\protect\color{red}]{`}{`}}
\begin{tabular}{@{}l@{\hspace{2\parindentlnth}}l@{}}
\begin{cfa}
`coroutine` Prod {
	Cons & c;
	int N, money, receipt;
};
void main( Prod & prod ) with( prod ) {
	// 1st resume starts here
	for ( int i = 0; i < N; i += 1 ) {
		int p1 = random( 100 ), p2 = random( 100 );
		sout | p1 | " " | p2 | endl;
		int status = delivery( c, p1, p2 );
		sout | " $" | money | endl | status | endl;
		receipt += 1;
	}
	stop( c );
	sout | "prod stops" | endl;
}
int payment( Prod & prod, int money ) {
	prod.money = money;
	`resume( prod );`
	return prod.receipt;
}
void start( Prod & prod, int N, Cons & c ) {
	&prod.c = &c;
	prod.[N, receipt] = [N, 0];
	`resume( prod );`
}
int main() {
	Prod prod;
	Cons cons = { prod };
	srandom( getpid() );
	start( prod, 5, cons );
}
\end{cfa}
&
\begin{cfa}
`coroutine` Cons {
	Prod & p;
	int p1, p2, status;
	_Bool done;
};
void ?{}( Cons & cons, Prod & p ) {
	&cons.p = &p;
	cons.[status, done] = [0, false];
}
void ^?{}( Cons & cons ) {}
void main( Cons & cons ) with( cons ) {
	// 1st resume starts here
	int money = 1, receipt;
	for ( ; ! done; ) {
		sout | p1 | " " | p2 | endl | " $" | money | endl;
		status += 1;
		receipt = payment( p, money );
		sout | " #" | receipt | endl;
		money += 1;
	}
	sout | "cons stops" | endl;
}
int delivery( Cons & cons, int p1, int p2 ) {
	cons.[p1, p2] = [p1, p2];
	`resume( cons );`
	return cons.status;
}
void stop( Cons & cons ) {
	cons.done = true;
	`resume( cons );`
}
\end{cfa}
\end{tabular}
\caption{Producer / consumer: resume-resume cycle, bi-directional communication}
\label{f:ProdCons}
\end{figure}


\subsubsection{Construction}

One important design challenge for implementing coroutines and threads (shown in section \ref{threads}) is that the runtime system needs to run code after the user-constructor runs to connect the fully constructed object into the system.
In the case of coroutines, this challenge is simpler since there is no non-determinism from preemption or scheduling.
…
}
\end{cfa}
The problem in this example is a storage management issue: the function pointer @_thunk0@ is only valid until the end of the block, which limits the viable solutions because storing the function pointer for too long causes undefined behaviour; \ie the stack-based thunk being destroyed before it can be used.
This challenge is an extension of challenges that come with second-class routines.
Indeed, GCC nested routines also have the limitation that nested routines cannot be passed outside of the declaration scope.
The case of coroutines and threads is simply an extension of this problem to multiple call stacks.


\subsubsection{Alternative: Composition}

One solution to this challenge is to use composition/containment, where coroutine fields are added to manage the coroutine.
…
This opens the door for user errors and requires extra runtime storage to pass at runtime information that can be known statically.


\subsubsection{Alternative: Reserved keyword}

The next alternative is to use language support to annotate coroutines as follows:
\begin{cfa}
coroutine Fibonacci {
…
The reserved keywords are only present to improve ease of use for the common cases.


\subsubsection{Alternative: Lambda Objects}

For coroutines as for threads, many implementations are based on routine pointers or function objects~\cite{Butenhof97, C++14, MS:VisualC++, BoostCoroutines15}.
…
As discussed in section \ref{threads}, this approach is superseded by static approaches in terms of expressivity.


\subsubsection{Alternative: Trait-Based Coroutines}

Finally, the underlying approach, which is the one closest to \CFA idioms, is to use trait-based lazy coroutines.
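The trait itself is elided from this changeset; a minimal sketch of what such a trait looks like (the member names follow the \CFA runtime and should be treated as illustrative) is:
\begin{cfa}
trait is_coroutine( dtype T ) {
	void main( T & this );	$\C{// coroutine body, run on the coroutine stack}$
	coroutine_desc * get_coroutine( T & this );	$\C{// access the coroutine descriptor}$
};
forall( dtype T | is_coroutine(T) ) void resume( T & cor );	$\C{// written once, generic over all conforming types}$
\end{cfa}
Any type satisfying the trait, whether hand-written or generated by the @coroutine@ keyword, can then be resumed generically.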
… … 821 938 The combination of these two approaches allows users new to coroutining and concurrency to have an easy and concise specification, while more advanced users have tighter control on memory layout and initialization. 822 939 823 \s ection{Thread Interface}\label{threads}940 \subsection{Thread Interface}\label{threads} 824 941 The basic building blocks of multithreading in \CFA are \textbf{cfathread}. 825 942 Both user and kernel threads are supported, where user threads are the concurrency mechanism and kernel threads are the parallel mechanism. … … 929 1046 \end{cfa} 930 1047 931 However, one of the drawbacks of this approach is that threads always form a tree where nodes must always outlive their children, i.e.,they are always destroyed in the opposite order of construction because of C scoping rules.1048 However, one of the drawbacks of this approach is that threads always form a tree where nodes must always outlive their children, \ie they are always destroyed in the opposite order of construction because of C scoping rules. 932 1049 This restriction is relaxed by using dynamic allocation, so threads can outlive the scope in which they are created, much like dynamically allocating memory lets objects outlive the scope in which they are created. 933 1050 … … 970 1087 Since many of these challenges appear with the use of mutable shared state, some languages and libraries simply disallow mutable shared state (Erlang~\cite{Erlang}, Haskell~\cite{Haskell}, Akka (Scala)~\cite{Akka}). 971 1088 In these paradigms, interaction among concurrent objects relies on message passing~\cite{Thoth,Harmony,V-Kernel} or other paradigms closely relate to networking concepts (channels~\cite{CSP,Go} for example). 972 However, in languages that use routine calls as their core abstraction mechanism, these approaches force a clear distinction between concurrent and non-concurrent paradigms ( i.e.,message passing versus routine calls).1089 However, in languages that use routine calls as their core abstraction mechanism, these approaches force a clear distinction between concurrent and non-concurrent paradigms (\ie message passing versus routine calls). 973 1090 This distinction in turn means that, in order to be effective, programmers need to learn two sets of design patterns. 974 1091 While this distinction can be hidden away in library code, effective use of the library still has to take both paradigms into account. … … 984 1101 One of the most natural, elegant, and efficient mechanisms for synchronization and communication, especially for shared-memory systems, is the \emph{monitor}. 985 1102 Monitors were first proposed by Brinch Hansen~\cite{Hansen73} and later described and extended by C.A.R.~Hoare~\cite{Hoare74}. 986 Many programming languages--- e.g.,Concurrent Pascal~\cite{ConcurrentPascal}, Mesa~\cite{Mesa}, Modula~\cite{Modula-2}, Turing~\cite{Turing:old}, Modula-3~\cite{Modula-3}, NeWS~\cite{NeWS}, Emerald~\cite{Emerald}, \uC~\cite{Buhr92a} and Java~\cite{Java}---provide monitors as explicit language constructs.1103 Many programming languages---\eg Concurrent Pascal~\cite{ConcurrentPascal}, Mesa~\cite{Mesa}, Modula~\cite{Modula-2}, Turing~\cite{Turing:old}, Modula-3~\cite{Modula-3}, NeWS~\cite{NeWS}, Emerald~\cite{Emerald}, \uC~\cite{Buhr92a} and Java~\cite{Java}---provide monitors as explicit language constructs. 
In addition, operating-system kernels and device drivers have a monitor-like structure, although they often use lower-level primitives such as semaphores or locks to simulate monitors.
For these reasons, this project proposes monitors as the core concurrency construct.


\subsection{Basics}

Non-determinism requires concurrent systems to offer support for mutual-exclusion and synchronization.
Mutual-exclusion is the concept that only a fixed number of threads can access a critical section at any given time, where a critical section is a group of instructions on an associated portion of data that requires the restricted access.
On the other hand, synchronization enforces relative ordering of execution, and synchronization tools provide numerous mechanisms to establish timing relationships among threads.


\subsubsection{Mutual-Exclusion}

As mentioned above, mutual-exclusion is the guarantee that only a fixed number of threads can enter a critical section at once.
However, many solutions exist for mutual exclusion, which vary in terms of performance, flexibility and ease of use.
Methods range from low-level locks, which are fast and flexible but require significant attention to be correct, to higher-level concurrency techniques, which sacrifice some performance in order to improve ease of use.
Ease of use comes by either guaranteeing some problems cannot occur (\eg being deadlock free) or by offering a more explicit coupling between data and corresponding critical section.
For example, the \CC @std::atomic<T>@ offers an easy way to express mutual-exclusion on a restricted set of operations (\eg reading/writing large types atomically).
Another challenge with low-level locks is composability.
Locks have restricted composability because it takes careful organizing for multiple locks to be used while preventing deadlocks.
Easing composability is another feature higher-level mutual-exclusion mechanisms often offer.


\subsubsection{Synchronization}

As with mutual-exclusion, low-level synchronization primitives often offer good performance and good flexibility at the cost of ease of use.
Again, higher-level mechanisms often simplify usage by adding either better coupling between synchronization and data (\eg message passing) or offering a simpler solution to otherwise involved challenges.
As mentioned above, synchronization can be expressed as guaranteeing that event \textit{X} always happens before \textit{Y}.
Most of the time, synchronization happens within a critical section, where threads must acquire mutual-exclusion in a certain order.
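As an illustrative sketch, using the monitor and @condition@ features presented later (the monitor @M@ and its fields are hypothetical), the guarantee that \textit{X} happens before \textit{Y} can be coded as:
\begin{cfa}
monitor M { condition c; bool happened; };
void X( M & mutex m ) {	$\C{// event X: record it, then signal}$
	m.happened = true;
	signal( m.c );
}
void Y( M & mutex m ) {	$\C{// event Y: block until X has happened}$
	if ( ! m.happened ) wait( m.c );
}
\end{cfa}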
… … 1016 1139 Algorithms that use flag variables to detect barging threads are said to be using barging avoidance, while algorithms that baton-pass locks~\cite{Andrews89} between threads instead of releasing the locks are said to be using barging prevention. 1017 1140 1141 1018 1142 % ====================================================================== 1019 1143 % ====================================================================== … … 1049 1173 Another aspect to consider is when a monitor acquires its mutual exclusion. 1050 1174 For example, a monitor may need to be passed through multiple helper routines that do not acquire the monitor mutual-exclusion on entry. 1051 Passthrough can occur for generic helper routines (@swap@, @sort@, etc.) or specific helper routines like the following to implement an atomic counter:1175 Passthrough can occur for generic helper routines (@swap@, @sort@, \etc) or specific helper routines like the following to implement an atomic counter: 1052 1176 1053 1177 \begin{cfa} … … 1207 1331 1208 1332 The call semantics discussed above have one software engineering issue: only a routine can acquire the mutual-exclusion of a set of monitor. \CFA offers the @mutex@ statement to work around the need for unnecessary names, avoiding a major software engineering problem~\cite{2FTwoHardThings}. 1209 Table \ref{ lst:mutex-stmt} shows an example of the @mutex@ statement, which introduces a new scope in which the mutual-exclusion of a set of monitor is acquired.1333 Table \ref{f:mutex-stmt} shows an example of the @mutex@ statement, which introduces a new scope in which the mutual-exclusion of a set of monitor is acquired. 1210 1334 Beyond naming, the @mutex@ statement has no semantic difference from a routine call with @mutex@ parameters. 1211 1335 … … 1237 1361 \end{center} 1238 1362 \caption{Regular call semantics vs. \protect\lstinline|mutex| statement} 1239 \label{ lst:mutex-stmt}1363 \label{f:mutex-stmt} 1240 1364 \end{table} 1241 1365 … … 1286 1410 In addition to mutual exclusion, the monitors at the core of \CFA's concurrency can also be used to achieve synchronization. 1287 1411 With monitors, this capability is generally achieved with internal or external scheduling as in~\cite{Hoare74}. 1288 With \textbf{scheduling} loosely defined as deciding which thread acquires the critical section next, \textbf{internal scheduling} means making the decision from inside the critical section ( i.e., with access to the shared state), while \textbf{external scheduling} means making the decision when entering the critical section (i.e.,without access to the shared state).1412 With \textbf{scheduling} loosely defined as deciding which thread acquires the critical section next, \textbf{internal scheduling} means making the decision from inside the critical section (\ie with access to the shared state), while \textbf{external scheduling} means making the decision when entering the critical section (\ie without access to the shared state). 1289 1413 Since internal scheduling within a single monitor is mostly a solved problem, this paper concentrates on extending internal scheduling to multiple monitors. 1290 1414 Indeed, like the \textbf{bulk-acq} semantics, internal scheduling extends to multiple monitors in a way that is natural to the user but requires additional complexity on the implementation side. … … 1313 1437 There are two details to note here. 
First, @signal@ is a delayed operation; it only unblocks the waiting thread when it reaches the end of the critical section.
This semantics is needed to respect mutual-exclusion, \ie the signaller and signalled thread cannot be in the monitor simultaneously.
The alternative is to return immediately after the call to @signal@, which is significantly more restrictive.
Second, in \CFA, while it is common to store a @condition@ as a field of the monitor, a @condition@ variable can be stored/created independently of a monitor.
…

A larger example is presented to show complex issues for \textbf{bulk-acq} and its implementation options are analyzed.
Figure~\ref{f:int-bulk-cfa} shows an example where \textbf{bulk-acq} adds a significant layer of complexity to the internal signalling semantics, together with the corresponding \CFA code that implements it.
For the purpose of translating the given cfa-code into \CFA-code, any method of introducing a monitor is acceptable, \eg @mutex@ parameters, global variables, pointer parameters, or using locals with the @mutex@ statement.

\begin{figure}
…
\end{cfa}
\end{multicols}
\begin{cfa}[caption={Internal scheduling with \textbf{bulk-acq}},label={f:int-bulk-cfa}]
\end{cfa}
\begin{center}
…
\end{cfa}
\end{multicols}
\begin{cfa}[caption={Equivalent \CFA code for listing \ref{f:int-bulk-cfa}},label={f:int-bulk-cfa}]
\end{cfa}
\begin{multicols}{2}
…
\end{cfa}
\end{multicols}
\begin{cfa}[caption={Figure~\ref{f:int-bulk-cfa}, with delayed signalling comments},label={f:int-secret}]
\end{cfa}
\end{figure}

The complexity begins at code sections 4 and 8 in listing \ref{f:int-bulk-cfa}, which are where the existing semantics of internal scheduling needs to be extended for multiple monitors.
The root of the problem is that \textbf{bulk-acq} is used in a context where one of the monitors is already acquired, which is why it is important to define the behaviour of the previous cfa-code.
When the signaller thread reaches the location where it should ``release @A & B@'' (listing \ref{f:int-bulk-cfa} line \ref{line:releaseFirst}), it must actually transfer ownership of monitor @B@ to the waiting thread.
This ownership transfer is required in order to prevent barging into @B@ by another thread, since both the signalling and signalled threads still need monitor @A@.
There are three options:
…
This solution has the main benefit of transferring ownership of groups of monitors, which simplifies the semantics from multiple objects to a single group of objects, effectively making the existing single-monitor semantic viable by simply changing monitors to monitor groups.
This solution releases the monitors once every monitor in a group can be released.
However, since some monitors are never released (\eg the monitor of a thread), this interpretation means a group might never be released.
A more interesting interpretation is to transfer the group until all its monitors are released, which means the group is not passed further and a thread can retain its locks.

However, listing \ref{f:int-secret} shows this solution can become much more complicated depending on what is executed while secretly holding @B@ at line \ref{line:secret}, while avoiding the need to transfer ownership of a subset of the condition monitors.
Figure~\ref{f:dependency} shows a slightly different example where a third thread is waiting on monitor @A@, using a different condition variable.
Because the third thread is signalled when secretly holding @B@, the goal becomes unreachable.
Depending on the order of signals (listing \ref{f:dependency} lines \ref{line:signal-ab} and \ref{line:signal-a}) two cases can happen:

\paragraph{Case 1: thread $\alpha$ goes first.} In this case, the problem is that monitor @A@ needs to be passed to thread $\beta$ when thread $\alpha$ is done with it.
…
Note that ordering is not determined by a race condition but by whether signalled threads are enqueued in FIFO or FILO order.
However, regardless of the answer, users can move line \ref{line:signal-a} before line \ref{line:signal-ab} and get the reverse effect for listing \ref{f:dependency}.
In both cases, the threads need to be able to distinguish, on a per monitor basis, which ones need to be released and which ones need to be transferred, which means knowing when to release a group becomes complex and inefficient (see next section) and therefore effectively precludes this approach.
…
\end{cfa}
\end{multicols}
\begin{cfa}[caption={Pseudo-code for the three thread example.},label={f:dependency}]
\end{cfa}
\begin{center}
\input{dependency}
\end{center}
\caption{Dependency graph of the statements in listing \ref{f:dependency}}
\label{fig:dependency}
\end{figure}

In listing \ref{f:int-bulk-cfa}, there is a solution that satisfies both barging prevention and mutual exclusion.
If ownership of both monitors is transferred to the waiter when the signaller releases @A & B@, and the waiter then transfers ownership of @A@ back to the signaller when it releases it, then the problem is solved (@B@ is no longer in use at this point).
Dynamically finding the correct order is therefore the second possible solution.
The problem is effectively resolving a dependency graph of ownership requirements.
Here even the simplest of code snippets requires two transfers and has a super-linear complexity.
This complexity can be seen in listing \ref{f:explosion}, which is just a direct extension to three monitors, requires at least three ownership transfers and has multiple solutions.
Furthermore, the presence of multiple solutions for ownership transfer can cause deadlock problems if a specific solution is not consistently picked; in the same way that multiple lock-acquiring orders can cause deadlocks.
\begin{figure}
…
\end{cfa}
\end{multicols}
\begin{cfa}[caption={Extension to three monitors of listing \ref{f:int-bulk-cfa}},label={f:explosion}]
\end{cfa}
\end{figure}

Given the three threads example in listing \ref{f:dependency}, figure \ref{fig:dependency} shows the corresponding dependency graph that results, where every node is a statement of one of the three threads, and the arrows show the dependencies of those statements (\eg $\alpha1$ must happen before $\alpha2$).
The extra challenge is that this dependency graph is effectively post-mortem, but the runtime system needs to be able to build and solve these graphs as the dependencies unfold.
Resolving dependency graphs being a complex and expensive endeavour, this solution is not the preferred one.
… … 1636 1760 \subsubsection{Partial Signalling} \label{partial-sig} 1637 1761 Finally, the solution that is chosen for \CFA is to use partial signalling. 1638 Again using listing \ref{ lst:int-bulk-cfa}, the partial signalling solution transfers ownership of monitor @B@ at lines \ref{line:signal1} to the waiter but does not wake the waiting thread since it is still using monitor @A@.1762 Again using listing \ref{f:int-bulk-cfa}, the partial signalling solution transfers ownership of monitor @B@ at lines \ref{line:signal1} to the waiter but does not wake the waiting thread since it is still using monitor @A@. 1639 1763 Only when it reaches line \ref{line:lastRelease} does it actually wake up the waiting thread. 1640 1764 This solution has the benefit that complexity is encapsulated into only two actions: passing monitors to the next owner when they should be released and conditionally waking threads if all conditions are met. … … 1642 1766 Furthermore, after being fully implemented, this solution does not appear to have any significant downsides. 1643 1767 1644 Using partial signalling, listing \ref{ lst:dependency} can be solved easily:1768 Using partial signalling, listing \ref{f:dependency} can be solved easily: 1645 1769 \begin{itemize} 1646 1770 \item When thread $\gamma$ reaches line \ref{line:release-ab} it transfers monitor @B@ to thread $\alpha$ and continues to hold monitor @A@. … … 1807 1931 This method is more constrained and explicit, which helps users reduce the non-deterministic nature of concurrency. 1808 1932 Indeed, as the following examples demonstrate, external scheduling allows users to wait for events from other threads without the concern of unrelated events occurring. 1809 External scheduling can generally be done either in terms of control flow ( e.g., Ada with @accept@, \uC with @_Accept@) or in terms of data (e.g.,Go with channels).1933 External scheduling can generally be done either in terms of control flow (\eg Ada with @accept@, \uC with @_Accept@) or in terms of data (\eg Go with channels). 1810 1934 Of course, both of these paradigms have their own strengths and weaknesses, but for this project, control-flow semantics was chosen to stay consistent with the rest of the languages semantics. 1811 1935 Two challenges specific to \CFA arise when trying to add external scheduling with loose object definitions and multiple-monitor routines. … … 1873 1997 1874 1998 There are other alternatives to these pictures, but in the case of the left picture, implementing a fast accept check is relatively easy. 1875 Restricted to a fixed number of mutex members, N, the accept check reduces to updating a bitmask when the acceptor queue changes, a check that executes in a single instruction even with a fairly large number ( e.g.,128) of mutex members.1999 Restricted to a fixed number of mutex members, N, the accept check reduces to updating a bitmask when the acceptor queue changes, a check that executes in a single instruction even with a fairly large number (\eg 128) of mutex members. 1876 2000 This approach requires a unique dense ordering of routines with an upper-bound and that ordering must be consistent across translation units. 1877 2001 For OO languages these constraints are common, since objects only offer adding member routines consistently across translation units via inheritance. … … 1883 2007 Generating a mask dynamically means that the storage for the mask information can vary between calls to @waitfor@, allowing for more flexibility and extensions. 
Storing an array of accepted function pointers replaces the single-instruction bitmask comparison with dereferencing a pointer followed by a linear search.
Furthermore, supporting nested external scheduling (\eg listing \ref{f:nest-ext}) may now require additional searches for the @waitfor@ statement to check if a routine is already queued.

\begin{figure}
\begin{cfa}[caption={Example of nested external scheduling},label={f:nest-ext}]
monitor M {};
void foo( M & mutex a ) {}
…
While the set of monitors can be any list of expressions, the function name is more restricted because the compiler validates at compile time the validity of the function type and the parameters used with the @waitfor@ statement.
It checks that the set of monitors passed in matches the requirements for a function call.
Figure~\ref{f:waitfor} shows various usages of the @waitfor@ statement and which are acceptable.
The choice of the function type is made ignoring any non-@mutex@ parameter.
One limitation of the current implementation is that it does not handle overloading, but overloading is possible.
\begin{figure}
\begin{cfa}[caption={Various correct and incorrect uses of the waitfor statement},label={f:waitfor}]
monitor A{};
monitor B{};
…
A @waitfor@ chain can also be followed by a @timeout@, to signify an upper bound on the wait, or an @else@, to signify that the call should be non-blocking, which checks for a matching function call already arrived and otherwise continues.
Any and all of these clauses can be preceded by a @when@ condition to dynamically toggle the accept clauses on or off based on some current state.
Figure~\ref{f:waitfor2} demonstrates several complex masks and some incorrect ones.

\begin{figure}
…
\end{cfa}
\caption{Correct and incorrect uses of the or, else, and timeout clause around a waitfor statement}
\label{f:waitfor2}
\end{figure}

…
However, a more expressive approach is to flip the ordering of execution when waiting for the destructor, meaning that waiting for the destructor allows the destructor to run after the current @mutex@ routine, similarly to how a condition is signalled.
2097 2221 \begin{figure} 2098 \begin{cfa}[caption={Example of an executor which executes action in series until the destructor is called.},label={ lst:dtor-order}]2222 \begin{cfa}[caption={Example of an executor which executes action in series until the destructor is called.},label={f:dtor-order}] 2099 2223 monitor Executer {}; 2100 2224 struct Action; … … 2112 2236 \end{cfa} 2113 2237 \end{figure} 2114 For example, listing \ref{ lst:dtor-order} shows an example of an executor with an infinite loop, which waits for the destructor to break out of this loop.2238 For example, listing \ref{f:dtor-order} shows an example of an executor with an infinite loop, which waits for the destructor to break out of this loop. 2115 2239 Switching the semantic meaning introduces an idiomatic way to terminate a task and/or wait for its termination via destruction. 2116 2240 … … 2128 2252 In this decade, it is no longer reasonable to create a high-performance application without caring about parallelism. 2129 2253 Indeed, parallelism is an important aspect of performance and more specifically throughput and hardware utilization. 2130 The lowest-level approach of parallelism is to use \textbf{kthread} in combination with semantics like @fork@, @join@, etc.2254 The lowest-level approach of parallelism is to use \textbf{kthread} in combination with semantics like @fork@, @join@, \etc. 2131 2255 However, since these have significant costs and limitations, \textbf{kthread} are now mostly used as an implementation tool rather than a user oriented one. 2132 2256 There are several alternatives to solve these issues that all have strengths and weaknesses. … … 2166 2290 While the choice between the three paradigms listed above may have significant performance implications, it is difficult to pin down the performance implications of choosing a model at the language level. 2167 2291 Indeed, in many situations one of these paradigms may show better performance but it all strongly depends on the workload. 2168 Having a large amount of mostly independent units of work to execute almost guarantees equivalent performance across paradigms and that the \textbf{pool}-based system has the best efficiency thanks to the lower memory overhead ( i.e.,no thread stack per job).2292 Having a large amount of mostly independent units of work to execute almost guarantees equivalent performance across paradigms and that the \textbf{pool}-based system has the best efficiency thanks to the lower memory overhead (\ie no thread stack per job). 2169 2293 However, interactions among jobs can easily exacerbate contention. 2170 2294 User-level threads allow fine-grain context switching, which results in better resource utilization, but a context switch is more expensive and the extra control means users need to tweak more variables to get the desired performance. … … 2218 2342 2219 2343 The first step towards the monitor implementation is simple @mutex@ routines. 2220 In the single monitor case, mutual-exclusion is done using the entry/exit procedure in listing \ref{ lst:entry1}.2344 In the single monitor case, mutual-exclusion is done using the entry/exit procedure in listing \ref{f:entry1}. 2221 2345 The entry/exit procedures do not have to be extended to support multiple monitors. 2222 2346 Indeed it is sufficient to enter/leave monitors one-by-one as long as the order is correct to prevent deadlock~\cite{Havender68}. 
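One standard way to realize a correct order, not necessarily the scheme used by the \CFA runtime, is to sort the monitor set by address before entering; the helper names (@enter_monitor@, @enter_all@) and the opaque @monitor_desc@ stand-in are hypothetical.
\begin{cfa}
#include <stdlib.h>
// Sketch only: acquire a monitor group in one globally consistent order
struct monitor_desc;				// stand-in for the runtime monitor type
void enter_monitor( struct monitor_desc * );	// hypothetical single-monitor entry
static int mon_cmp( const void * a, const void * b ) {
	const struct monitor_desc * x = *(const struct monitor_desc * const *)a;
	const struct monitor_desc * y = *(const struct monitor_desc * const *)b;
	return (x > y) - (x < y);		// total order by address
}
void enter_all( struct monitor_desc * mons[], size_t count ) {
	qsort( mons, count, sizeof mons[0], mon_cmp );	// same order at every call site
	for ( size_t i = 0; i < count; i += 1 )
		enter_monitor( mons[i] );	// one-by-one, deadlock free
}
\end{cfa}
Because every bulk acquisition uses the same total order, the circular-wait condition for deadlock can never arise.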
… … 2246 2370 \end{cfa} 2247 2371 \end{multicols} 2248 \begin{cfa}[caption={Initial entry and exit routine for monitors},label={ lst:entry1}]2372 \begin{cfa}[caption={Initial entry and exit routine for monitors},label={f:entry1}] 2249 2373 \end{cfa} 2250 2374 \end{figure} … … 2256 2380 First of all, interaction between @otype@ polymorphism (see Section~\ref{s:ParametricPolymorphism}) and monitors is impossible since monitors do not support copying. 2257 2381 Therefore, the main question is how to support @dtype@ polymorphism. 2258 It is important to present the difference between the two acquiring options: \textbf{callsite-locking} and entry-point locking, i.e.,acquiring the monitors before making a mutex routine-call or as the first operation of the mutex routine-call.2382 It is important to present the difference between the two acquiring options: \textbf{callsite-locking} and entry-point locking, \ie acquiring the monitors before making a mutex routine-call or as the first operation of the mutex routine-call. 2259 2383 For example: 2260 2384 \begin{table} … … 2313 2437 \end{table} 2314 2438 2315 Note the @mutex@ keyword relies on the type system, which means that in cases where a generic monitor-routine is desired, writing the mutex routine is possible with the proper trait, e.g.:2439 Note the @mutex@ keyword relies on the type system, which means that in cases where a generic monitor-routine is desired, writing the mutex routine is possible with the proper trait, \eg: 2316 2440 \begin{cfa} 2317 2441 // Incorrect: T may not be monitor … … 2326 2450 Both entry point and \textbf{callsite-locking} are feasible implementations. 2327 2451 The current \CFA implementation uses entry-point locking because it requires less work when using \textbf{raii}, effectively transferring the burden of implementation to object construction/destruction. 2328 It is harder to use \textbf{raii} for call-site locking, as it does not necessarily have an existing scope that matches exactly the scope of the mutual exclusion, i.e.,the function body.2452 It is harder to use \textbf{raii} for call-site locking, as it does not necessarily have an existing scope that matches exactly the scope of the mutual exclusion, \ie the function body. 2329 2453 For example, the monitor call can appear in the middle of an expression. 2330 2454 Furthermore, entry-point locking requires less code generation since any useful routine is called multiple times but there is only one entry point for many call sites. … … 2359 2483 Specifically, all @pthread@s created also have a stack created with them, which should be used as much as possible. 2360 2484 Normally, coroutines also create their own stack to run on, however, in the case of the coroutines used for processors, these coroutines run directly on the \textbf{kthread} stack, effectively stealing the processor stack. 2361 The exception to this rule is the Main Processor, i.e.,the initial \textbf{kthread} that is given to any program.2485 The exception to this rule is the Main Processor, \ie the initial \textbf{kthread} that is given to any program. 2362 2486 In order to respect C user expectations, the stack of the initial kernel thread, the main stack of the program, is used by the main user thread rather than the main processor, which can grow very large. 
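Returning to entry-point locking, the conceptual expansion the compiler performs can be sketched with a hypothetical \textbf{raii} guard type; @monitor_guard_t@ is illustrative and the real runtime interface differs, while @get_monitor@ is assumed to map a monitor object to its descriptor.
\begin{cfa}
// Sketch only: what `void foo( M & mutex m )' conceptually becomes
struct monitor_guard_t { monitor_desc * m; };
void ?{}( monitor_guard_t & g, monitor_desc * m );	// ctor: acquire the monitor
void ^?{}( monitor_guard_t & g );			// dtor: release the monitor
void foo( M & m ) {
	monitor_guard_t g = { get_monitor( m ) };	// acquired on entry
	// ... original body of foo ...
}	// g is destroyed here, releasing the monitor on every exit path
\end{cfa}
Since the guard lives exactly as long as the function body, construction/destruction supplies the acquire/release pairing for free, which is the implementation saving mentioned above.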
2363 2487 … … 2390 2514 When the preemption system receives a change in preemption, it inserts the time in a sorted order and sets a kernel timer for the closest one, effectively stepping through preemption events on each signal sent by the timer. 2391 2515 These timers use the Linux signal {\tt SIGALRM}, which is delivered to the process rather than the kernel-thread. 2392 This results in an implementation problem, because when delivering signals to a process, the kernel can deliver the signal to any kernel thread for which the signal is not blocked, i.e.:2516 This results in an implementation problem, because when delivering signals to a process, the kernel can deliver the signal to any kernel thread for which the signal is not blocked, \ie: 2393 2517 \begin{quote} 2394 2518 A process-directed signal may be delivered to any one of the threads that does not currently have the signal blocked. … … 2406 2530 However, since the kernel thread handling preemption requires a different signal mask, executing user threads on the kernel-alarm thread can cause deadlocks. 2407 2531 For this reason, the alarm thread is in a tight loop around a system call to @sigwaitinfo@, requiring very little CPU time for preemption. 2408 One final detail about the alarm thread is how to wake it when additional communication is required ( e.g.,on thread termination).2532 One final detail about the alarm thread is how to wake it when additional communication is required (\eg on thread termination). 2409 2533 This unblocking is also done using {\tt SIGALRM}, but sent through the @pthread_sigqueue@. 2410 2534 Indeed, @sigwait@ can differentiate signals sent from @pthread_sigqueue@ from signals sent from alarms or the kernel. … … 2445 2569 \end{figure} 2446 2570 2447 This picture and the proper entry and leave algorithms (see listing \ref{ lst:entry2}) is the fundamental implementation of internal scheduling.2571 This picture and the proper entry and leave algorithms (see listing \ref{f:entry2}) is the fundamental implementation of internal scheduling. 2448 2572 Note that when a thread is moved from the condition to the AS-stack, it is conceptually split into N pieces, where N is the number of monitors specified in the parameter list. 2449 2573 The thread is woken up when all the pieces have popped from the AS-stacks and made active. … … 2478 2602 \end{cfa} 2479 2603 \end{multicols} 2480 \begin{cfa}[caption={Entry and exit routine for monitors with internal scheduling},label={ lst:entry2}]2604 \begin{cfa}[caption={Entry and exit routine for monitors with internal scheduling},label={f:entry2}] 2481 2605 \end{cfa} 2482 2606 \end{figure} 2483 2607 2484 The solution discussed in \ref{intsched} can be seen in the exit routine of listing \ref{ lst:entry2}.2608 The solution discussed in \ref{intsched} can be seen in the exit routine of listing \ref{f:entry2}. 2485 2609 Basically, the solution boils down to having a separate data structure for the condition queue and the AS-stack, and unconditionally transferring ownership of the monitors but only unblocking the thread when the last monitor has transferred ownership. 2486 2610 This solution is deadlock safe as well as preventing any potential barging. … … 2498 2622 The main idea behind them is that, a thread cannot contain an arbitrary number of intrusive ``next'' pointers for linking onto monitors. 
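A sketch of the two structures clarifies the split into N pieces; all field names are hypothetical and the actual runtime layout differs.
\begin{cfa}
#include <stdbool.h>
// Sketch only: one node per waiting thread, one criterion per monitor
struct monitor_desc;  struct thread_desc;	// stand-ins for runtime types
struct condition_criterion {
	struct monitor_desc * target;		// monitor this piece reacquires
	struct condition_node * owner;		// node this piece belongs to
	bool ready;				// popped from its AS-stack yet?
	struct condition_criterion * next;	// single intrusive link per monitor
};
struct condition_node {
	struct thread_desc * waiting_thread;	// thread to wake at the end
	struct condition_criterion * criteria;	// one entry per monitor, N total
	unsigned short count;			// N
	struct condition_node * next;		// intrusive link for the condition queue
};
\end{cfa}
A thread owns one node per @wait@, so only the fixed links above are needed, no matter how many monitors the wait involves.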
2499 2623 The @condition node@ is the data structure that is queued onto a condition variable and, when signalled, the condition queue is popped and each @condition criterion@ is moved to the AS-stack. 2500 Once all the criteria have been popped from their respective AS-stacks, the thread is woken up, which is what is shown in listing \ref{ lst:entry2}.2624 Once all the criteria have been popped from their respective AS-stacks, the thread is woken up, which is what is shown in listing \ref{f:entry2}. 2501 2625 2502 2626 % ====================================================================== … … 2506 2630 % ====================================================================== 2507 2631 Similarly to internal scheduling, external scheduling for multiple monitors relies on the idea that waiting-thread queues are no longer specific to a single monitor, as mentioned in section \ref{extsched}. 2508 For internal scheduling, these queues are part of condition variables, which are still unique for a given scheduling operation ( i.e.,no signal statement uses multiple conditions).2632 For internal scheduling, these queues are part of condition variables, which are still unique for a given scheduling operation (\ie no signal statement uses multiple conditions). 2509 2633 However, in the case of external scheduling, there is no equivalent object which is associated with @waitfor@ statements. 2510 2634 This absence means the queues holding the waiting threads must be stored inside at least one of the monitors that is acquired. … … 2533 2657 Note that if a thread has acquired two monitors but executes a @waitfor@ with only one monitor as a parameter, setting the mask of acceptable routines to both monitors will not cause any problems since the extra monitor will not change ownership regardless. 2534 2658 This becomes relevant when @when@ clauses affect the number of monitors passed to a @waitfor@ statement. 2535 \item The entry/exit routines need to be updated as shown in listing \ref{ lst:entry3}.2659 \item The entry/exit routines need to be updated as shown in listing \ref{f:entry3}. 2536 2660 \end{itemize} 2537 2661 … … 2541 2665 Indeed, when waiting for the destructors, storage is needed for the waiting context and the lifetime of said storage needs to outlive the waiting operation it is needed for. 2542 2666 For regular @waitfor@ statements, the call stack of the routine itself matches this requirement but it is no longer the case when waiting for the destructor since it is pushed on to the AS-stack for later. 
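On the external-scheduling side, the accept state installed by a @waitfor@ can be pictured as a small mask object reachable from each acquired monitor; the layout below (@accepted@, @size@, @waiter@) is hypothetical and merely stands in for the array of accepted function pointers discussed earlier.
\begin{cfa}
// Sketch only: hypothetical accept state installed by a waitfor
struct thread_desc;
struct waitfor_mask {
	void (**accepted)( void );	// routines currently accepted
	short size;			// number of accepted entries
	struct thread_desc * waiter;	// thread blocked on the waitfor
};
\end{cfa}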
2543 The @waitfor@ semantics can then be adjusted correspondingly, as seen in listing \ref{ lst:entry-dtor}2667 The @waitfor@ semantics can then be adjusted correspondingly, as seen in listing \ref{f:entry-dtor} 2544 2668 2545 2669 \begin{figure} … … 2575 2699 \end{cfa} 2576 2700 \end{multicols} 2577 \begin{cfa}[caption={Entry and exit routine for monitors with internal scheduling and external scheduling},label={ lst:entry3}]2701 \begin{cfa}[caption={Entry and exit routine for monitors with internal scheduling and external scheduling},label={f:entry3}] 2578 2702 \end{cfa} 2579 2703 \end{figure} … … 2621 2745 \end{cfa} 2622 2746 \end{multicols} 2623 \begin{cfa}[caption={Pseudo code for the \protect\lstinline|waitfor| routine and the \protect\lstinline|mutex| entry routine for destructors},label={ lst:entry-dtor}]2747 \begin{cfa}[caption={Pseudo code for the \protect\lstinline|waitfor| routine and the \protect\lstinline|mutex| entry routine for destructors},label={f:entry-dtor}] 2624 2748 \end{cfa} 2625 2749 \end{figure} … … 2637 2761 For example, here is a very simple two thread pipeline that could be used for a simulator of a game engine: 2638 2762 \begin{figure} 2639 \begin{cfa}[caption={Toy simulator using \protect\lstinline|thread|s and \protect\lstinline|monitor|s.},label={ lst:engine-v1}]2763 \begin{cfa}[caption={Toy simulator using \protect\lstinline|thread|s and \protect\lstinline|monitor|s.},label={f:engine-v1}] 2640 2764 // Visualization declaration 2641 2765 thread Renderer {} renderer; … … 2669 2793 Luckily, the monitor semantics can also be used to clearly enforce a shutdown order in a concise manner: 2670 2794 \begin{figure} 2671 \begin{cfa}[caption={Same toy simulator with proper termination condition.},label={ lst:engine-v2}]2795 \begin{cfa}[caption={Same toy simulator with proper termination condition.},label={f:engine-v2}] 2672 2796 // Visualization declaration 2673 2797 thread Renderer {} renderer; … … 2718 2842 } 2719 2843 \end{cfa} 2720 This function is called by the kernel to fetch the default preemption rate, where 0 signifies an infinite time-slice, i.e.,no preemption.2721 However, once clusters are fully implemented, it will be possible to create fibers and \textbf{uthread} in the same system, as in listing \ref{ lst:fiber-uthread}2844 This function is called by the kernel to fetch the default preemption rate, where 0 signifies an infinite time-slice, \ie no preemption. 2845 However, once clusters are fully implemented, it will be possible to create fibers and \textbf{uthread} in the same system, as in listing \ref{f:fiber-uthread} 2722 2846 \begin{figure} 2723 2847 \lstset{language=CFA,deletedelim=**[is][]{`}{`}} 2724 \begin{cfa}[caption={Using fibers and \textbf{uthread} side-by-side in \CFA},label={ lst:fiber-uthread}]2848 \begin{cfa}[caption={Using fibers and \textbf{uthread} side-by-side in \CFA},label={f:fiber-uthread}] 2725 2849 // Cluster forward declaration 2726 2850 struct cluster; … … 2831 2955 Yielding causes the thread to context-switch to the scheduler and back, more precisely: from the \textbf{uthread} to the \textbf{kthread} then from the \textbf{kthread} back to the same \textbf{uthread} (or a different one in the general case). 2832 2956 In order to make the comparison fair, coroutines also execute a 2-step context-switch by resuming another coroutine which does nothing but suspending in a tight loop, which is a resume/suspend cycle instead of a yield. 
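The do-nothing partner for the coroutine measurement can be sketched as follows; the timing harness is omitted and the names (@Partner@, @bench@) are hypothetical.
\begin{cfa}
#include <stddef.h>
// Sketch only: partner that suspends immediately, so each resume( p )
// costs one full resume/suspend cycle
coroutine Partner {};
void main( Partner & this ) {
	for ( ;; ) suspend();
}
void bench( Partner & p, size_t times ) {
	for ( size_t i = 0; i < times; i += 1 ) resume( p );
}
\end{cfa}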
2833 Listing \ref{lst:ctx-switch} shows the code for coroutines and threads with the results in table \ref{tab:ctx-switch}.2957 Figure~\ref{f:ctx-switch} shows the code for coroutines and threads with the results in table \ref{tab:ctx-switch}. 2834 2958 All omitted tests are functionally identical to one of these tests. 2835 2959 The difference between coroutines and threads can be attributed to the cost of scheduling. … … 2874 2998 \end{cfa} 2875 2999 \end{multicols} 2876 \begin{cfa}[caption={\CFA benchmark code used to measure context-switches for coroutines and threads.},label={ lst:ctx-switch}]3000 \begin{cfa}[caption={\CFA benchmark code used to measure context-switches for coroutines and threads.},label={f:ctx-switch}] 2877 3001 \end{cfa} 2878 3002 \end{figure} … … 2902 3026 The next interesting benchmark is to measure the overhead to enter/leave a critical-section. 2903 3027 For monitors, the simplest approach is to measure how long it takes to enter and leave a monitor routine. 2904 Listing \ref{lst:mutex} shows the code for \CFA.3028 Figure~\ref{f:mutex} shows the code for \CFA. 2905 3029 To put the results in context, the cost of entering a non-inline function and the cost of acquiring and releasing a @pthread_mutex@ lock is also measured. 2906 3030 The results can be shown in table \ref{tab:mutex}. 2907 3031 2908 3032 \begin{figure} 2909 \begin{cfa}[caption={\CFA benchmark code used to measure mutex routines.},label={ lst:mutex}]3033 \begin{cfa}[caption={\CFA benchmark code used to measure mutex routines.},label={f:mutex}] 2910 3034 monitor M {}; 2911 3035 void __attribute__((noinline)) call( M & mutex m /*, m2, m3, m4*/ ) {} … … 2948 3072 \subsection{Internal Scheduling} 2949 3073 The internal-scheduling benchmark measures the cost of waiting on and signalling a condition variable. 2950 Listing \ref{lst:int-sched} shows the code for \CFA, with results table \ref{tab:int-sched}.3074 Figure~\ref{f:int-sched} shows the code for \CFA, with results table \ref{tab:int-sched}. 2951 3075 As with all other benchmarks, all omitted tests are functionally identical to one of these tests. 2952 3076 2953 3077 \begin{figure} 2954 \begin{cfa}[caption={Benchmark code for internal scheduling},label={ lst:int-sched}]3078 \begin{cfa}[caption={Benchmark code for internal scheduling},label={f:int-sched}] 2955 3079 volatile int go = 0; 2956 3080 condition c; … … 3007 3131 \subsection{External Scheduling} 3008 3132 The Internal scheduling benchmark measures the cost of the @waitfor@ statement (@_Accept@ in \uC). 3009 Listing \ref{lst:ext-sched} shows the code for \CFA, with results in table \ref{tab:ext-sched}.3133 Figure~\ref{f:ext-sched} shows the code for \CFA, with results in table \ref{tab:ext-sched}. 3010 3134 As with all other benchmarks, all omitted tests are functionally identical to one of these tests. 3011 3135 3012 3136 \begin{figure} 3013 \begin{cfa}[caption={Benchmark code for external scheduling},label={ lst:ext-sched}]3137 \begin{cfa}[caption={Benchmark code for external scheduling},label={f:ext-sched}] 3014 3138 volatile int go = 0; 3015 3139 monitor M {}; … … 3061 3185 \end{table} 3062 3186 3187 3063 3188 \subsection{Object Creation} 3064 3189 Finally, the last benchmark measures the cost of creation for concurrent objects. 3065 Listing \ref{lst:creation} shows the code for @pthread@s and \CFA threads, with results shown in table \ref{tab:creation}.3190 Figure~\ref{f:creation} shows the code for @pthread@s and \CFA threads, with results shown in table \ref{tab:creation}. 
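For reference, the @pthread@ half of such a benchmark reduces to the cycle below; the harness is omitted, and joining each thread is an assumption of this sketch rather than necessarily what the benchmark figure does.
\begin{cfa}
#include <pthread.h>
// Sketch only: one create/terminate cycle of a pthread creation benchmark
static void * thread_main( void * arg ) { return NULL; }
void one_cycle( void ) {
	pthread_t handle;
	pthread_create( &handle, NULL, thread_main, NULL );
	pthread_join( handle, NULL );	// fold termination into the measured cost
}
\end{cfa}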
3066 3191 As with all other benchmarks, all omitted tests are functionally identical to one of these tests. 3067 3192 The only note here is that the call stacks of \CFA coroutines are lazily created, therefore without priming the coroutine, the creation cost is very low. … … 3107 3232 \end{center} 3108 3233 \caption{Benchmark code for \protect\lstinline|pthread|s and \CFA to measure object creation} 3109 \label{ lst:creation}3234 \label{f:creation} 3110 3235 \end{figure} 3111 3236 … … 3169 3294 While most of the parallelism tools are aimed at data parallelism and control-flow parallelism, many modern workloads are not bound on computation but on IO operations, a common case being web servers and XaaS (anything as a service). 3170 3295 These types of workloads often require significant engineering around amortizing costs of blocking IO operations. 3171 At its core, non-blocking I/O is an operating system level feature that allows queuing IO operations ( e.g.,network operations) and registering for notifications instead of waiting for requests to complete.3296 At its core, non-blocking I/O is an operating system level feature that allows queuing IO operations (\eg network operations) and registering for notifications instead of waiting for requests to complete. 3172 3297 In this context, the role of the language makes Non-Blocking IO easily available and with low overhead. 3173 3298 The current trend is to use asynchronous programming using tools like callbacks and/or futures and promises, which can be seen in frameworks like Node.js~\cite{NodeJs} for JavaScript, Spring MVC~\cite{SpringMVC} for Java and Django~\cite{Django} for Python. … … 3184 3309 This type of parallelism can be achieved both at the language level and at the library level. 3185 3310 The canonical example of implicit parallelism is parallel for loops, which are the simplest example of a divide and conquer algorithms~\cite{uC++book}. 3186 Table \ref{ lst:parfor} shows three different code examples that accomplish point-wise sums of large arrays.3311 Table \ref{f:parfor} shows three different code examples that accomplish point-wise sums of large arrays. 3187 3312 Note that none of these examples explicitly declare any concurrency or parallelism objects. 3188 3313 … … 3267 3392 \end{center} 3268 3393 \caption{For loop to sum numbers: Sequential, using library parallelism and language parallelism.} 3269 \label{ lst:parfor}3394 \label{f:parfor} 3270 3395 \end{table} 3271 3396
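For concreteness, the sequential baseline of that table is just the familiar loop; the function and array names are hypothetical.
\begin{cfa}
#include <stddef.h>
// Sketch only: point-wise sum, the starting point for both parallel versions
void sum( size_t n, const int a[n], const int b[n], int out[n] ) {
	for ( size_t i = 0; i < n; i += 1 ) out[i] = a[i] + b[i];
}
\end{cfa}
Each iteration is independent, which is what makes the loop a candidate for library- or language-level parallelization without any explicit concurrency objects.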