Changeset c653b37
- Timestamp:
- Jun 28, 2018, 4:04:11 PM
- Branches:
- ADT, aaron-thesis, arm-eh, ast-experimental, cleanup-dtors, deferred_resn, demangler, enum, forall-pointer-decay, jacob/cs343-translation, jenkins-sandbox, master, new-ast, new-ast-unique-expr, no_list, persistent-indexer, pthread-emulation, qualifiedEnum
- Children:
- e3b2474
- Parents:
- a12c81f3 (diff), 944ce47 (diff)
Note: this is a merge changeset; the changes displayed below correspond to the merge itself. Use the (diff) links above to see all the changes relative to each parent.
- Files:
- 3 added
- 2 deleted
- 40 edited
-
doc/bibliography/pl.bib
ra12c81f3 rc653b37 1183 1183 that is ``compiled''. 1184 1184 }, 1185 comment = {1186 Imagine the program, including the subroutines, spread out over a1187 table, with the compiler dropping Jello on the parts as they are1188 compiled. At first little drops appear in seemingly random places.1189 These get bigger and combine with other drops to form growing1190 globs. When two globs meet, ripples will go out through each as1191 they adjust to each other's presence, although the parts of the1192 globs that formed first are less affected by the ripples. When1193 compilation is complete, there is one congealed mass.1194 }1195 1185 } 1196 1186 … … 1388 1378 Process-valued expressions and process variables. Processes have 1389 1379 execution priority: Create {\em process-type-name}(args) [with 1390 priority(p)], 1391 and the priority can be changed on the fly. Complicated guard/ 1392 screen structure on accept: accept {\em transaction}(param names) 1380 priority(p)], and the priority can be changed on the fly. Complicated 1381 guard/screen structure on accept: accept {\em transaction}(param names) 1393 1382 [suchthat (exp)] [by (exp)] [compoundstatement]. Accepts cannot 1394 1383 appear in functions! Can specify timeouts on transaction calls. … … 1833 1822 } 1834 1823 1835 @article{Moore75, 1836 keywords = {approximation methods, integrated circuits}, 1837 contributer = {pabuhr@plg}, 1838 author = {Gordon E. Moore}, 1839 title = {Progress in Digital Integrated Electronics}, 1840 journal = {Technical Digest, International Electron Devices Meeting, IEEE}, 1841 year = 1975, 1842 pages = {11-13}, 1824 @misc{CS343, 1825 keywords = {uC++ teaching}, 1826 contributer = {pabuhr@plg}, 1827 key = {Peter Buhr}, 1828 title = {CS343}, 1829 year = 2017, 1830 howpublished= {\href{https://www.student.cs.uwaterloo.ca/~cs343}{https://\-www.student.cs.uwaterloo.ca/\-~cs343}}, 1843 1831 } 1844 1832 … … 2568 2556 year = 1979, 2569 2557 pages = {24-32} 2558 } 2559 2560 @inproceedings{XaaS, 2561 keywords = {Everything as a Service, Anything as a Service, Cloud computing, SOA}, 2562 contributer = {pabuhr@plg}, 2563 author = {Duan, Yucong and Fu, Guohua and Zhou, Nianjun and Sun, Xiaobing and Narendra, Nanjangud C. and Hu, Bo}, 2564 title = {Everything As a Service (XaaS) on the Cloud: Origins, Current and Future Trends}, 2565 booktitle = {Proceedings of the 2015 IEEE 8th International Conference on Cloud Computing}, 2566 series = {CLOUD'15}, 2567 year = {2015}, 2568 pages = {621--628}, 2569 publisher = {IEEE Computer Society}, 2570 address = {Washington, DC, USA}, 2570 2571 } 2571 2572 … … 2779 2780 title = {Extending Modula-2 to Build Large, Integrated Systems}, 2780 2781 journal = {IEEE Software}, 2781 month = nov, year = 1986, 2782 volume = 3, number = 6, pages = {46-57}, 2782 month = nov, 2783 year = 1986, 2784 volume = 3, 2785 number = 6, 2786 pages = {46-57}, 2783 2787 comment = { 2784 2788 Exceptions can have a parameter. Procedures can declare the … … 4893 4897 } 4894 4898 4895 @ techreport{OpenMP,4899 @manual{OpenMP, 4896 4900 keywords = {concurrency, openmp, spmd}, 4897 4901 contributer = {pabuhr@plg}, 4898 author = {OpenMP Architecture Review Board},4899 title = {OpenMP Application Program Interface, Version 4. 
0},4900 month = jul,4901 year = 201 3,4902 note = {\href{http ://www.openmp.org/mp-documents/OpenMP4.0.0.pdf}{http://\-www.openmp.org/\-mp-documents/\-OpenMP4.0.0.pdf}},4902 key = {OpenMP}, 4903 title = {OpenMP Application Program Interface, Version 4.5}, 4904 month = nov, 4905 year = 2015, 4906 note = {\href{https://www.openmp.org/wp-content/uploads/openmp-4.5.pdf}{https://\-www.openmp.org/\-wp-content/\-uploads/\-openmp-4.5.pdf}}, 4903 4907 } 4904 4908 … … 5753 5757 } 5754 5758 5759 @article{Moore75, 5760 keywords = {approximation methods, integrated circuits}, 5761 contributer = {pabuhr@plg}, 5762 author = {Gordon E. Moore}, 5763 title = {Progress in Digital Integrated Electronics}, 5764 journal = {Technical Digest, International Electron Devices Meeting, IEEE}, 5765 year = 1975, 5766 pages = {11-13}, 5767 } 5768 5755 5769 @article{promises, 5756 5770 keywords = {futures, Argus, call streams, rpc}, 5757 5771 contributer = {gjditchfield@plg}, 5758 5772 author = {Barbara Liskov and Liuba Shrira}, 5759 title = {Promises: Linguistic Support for Efficient Asynchronous 5760 Procedure Calls in Distributed Systems}, 5773 title = {Promises: Linguistic Support for Efficient Asynchronous Procedure Calls in Distributed Systems}, 5761 5774 journal = sigplan, 5762 5775 year = 1988, -
doc/papers/OOPSLA17/Makefile
ra12c81f3 rc653b37 33 33 34 34 DOCUMENT = generic_types.pdf 35 BASE = ${basename ${DOCUMENT}} 35 36 36 37 # Directives # … … 41 42 42 43 clean : 43 @rm -frv ${DOCUMENT} ${ basename ${DOCUMENT}}.ps ${Build}44 @rm -frv ${DOCUMENT} ${BASE}.ps ${Build} 44 45 45 46 # File Dependencies # 46 47 47 ${DOCUMENT} : ${ basename ${DOCUMENT}}.ps48 ${DOCUMENT} : ${BASE}.ps 48 49 ps2pdf $< 49 50 50 ${ basename ${DOCUMENT}}.ps : ${basename ${DOCUMENT}}.dvi51 ${BASE}.ps : ${BASE}.dvi 51 52 dvips ${Build}/$< -o $@ 52 53 53 ${basename ${DOCUMENT}}.dvi : Makefile ${Build} ${GRAPHS} ${PROGRAMS} ${PICTURES} ${FIGURES} ${SOURCES} ../../bibliography/pl.bib 54 ${BASE}.dvi : Makefile ${GRAPHS} ${PROGRAMS} ${PICTURES} ${FIGURES} ${SOURCES} \ 55 ../../bibliography/pl.bib | ${Build} 54 56 # Must have *.aux file containing citations for bibtex 55 57 if [ ! -r ${basename $@}.aux ] ; then ${LaTeX} ${basename $@}.tex ; fi … … 63 65 ## Define the default recipes. 64 66 65 ${Build} :67 ${Build} : 66 68 mkdir -p ${Build} 67 69 … … 69 71 gnuplot -e Build="'${Build}/'" evaluation/timing.gp 70 72 71 %.tex : %.fig 73 %.tex : %.fig | ${Build} 72 74 fig2dev -L eepic $< > ${Build}/$@ 73 75 74 %.ps : %.fig 76 %.ps : %.fig | ${Build} 75 77 fig2dev -L ps $< > ${Build}/$@ 76 78 77 %.pstex : %.fig 79 %.pstex : %.fig | ${Build} 78 80 fig2dev -L pstex $< > ${Build}/$@ 79 81 fig2dev -L pstex_t -p ${Build}/$@ $< > ${Build}/$@_t -
doc/papers/concurrency/Makefile
ra12c81f3 rc653b37 15 15 SOURCES = ${addsuffix .tex, \ 16 16 Paper \ 17 style/style \18 style/cfa-format \19 17 } 20 18 21 19 FIGURES = ${addsuffix .tex, \ 22 monitor \23 ext_monitor \24 20 int_monitor \ 25 21 dependency \ 22 RunTimeStructure \ 26 23 } 27 24 28 25 PICTURES = ${addsuffix .pstex, \ 26 monitor \ 27 ext_monitor \ 29 28 system \ 30 29 monitor_structs \ … … 59 58 dvips ${Build}/$< -o $@ 60 59 61 ${BASE}.dvi : Makefile ${B uild} ${BASE}.out.ps WileyNJD-AMA.bst ${GRAPHS} ${PROGRAMS} ${PICTURES} ${FIGURES} ${SOURCES} \62 annex/local.bib ../../bibliography/pl.bib 60 ${BASE}.dvi : Makefile ${BASE}.out.ps WileyNJD-AMA.bst ${GRAPHS} ${PROGRAMS} ${PICTURES} ${FIGURES} ${SOURCES} \ 61 annex/local.bib ../../bibliography/pl.bib | ${Build} 63 62 # Must have *.aux file containing citations for bibtex 64 63 if [ ! -r ${basename $@}.aux ] ; then ${LaTeX} ${basename $@}.tex ; fi 65 ${BibTeX} ${Build}/${basename $@}64 -${BibTeX} ${Build}/${basename $@} 66 65 # Some citations reference others so run again to resolve these citations 67 66 ${LaTeX} ${basename $@}.tex 68 ${BibTeX} ${Build}/${basename $@}67 -${BibTeX} ${Build}/${basename $@} 69 68 # Run again to finish citations 70 69 ${LaTeX} ${basename $@}.tex … … 72 71 ## Define the default recipes. 73 72 74 ${Build} :73 ${Build} : 75 74 mkdir -p ${Build} 76 75 77 ${BASE}.out.ps :${Build}76 ${BASE}.out.ps : | ${Build} 78 77 ln -fs ${Build}/Paper.out.ps . 79 78 80 WileyNJD-AMA.bst :79 WileyNJD-AMA.bst : 81 80 ln -fs ../AMA/AMA-stix/ama/WileyNJD-AMA.bst . 82 81 83 %.tex : %.fig ${Build}82 %.tex : %.fig | ${Build} 84 83 fig2dev -L eepic $< > ${Build}/$@ 85 84 86 %.ps : %.fig ${Build}85 %.ps : %.fig | ${Build} 87 86 fig2dev -L ps $< > ${Build}/$@ 88 87 89 %.pstex : %.fig ${Build}88 %.pstex : %.fig | ${Build} 90 89 fig2dev -L pstex $< > ${Build}/$@ 91 90 fig2dev -L pstex_t -p ${Build}/$@ $< > ${Build}/$@_t -
doc/papers/concurrency/Paper.tex
ra12c81f3 rc653b37 21 21 \renewcommand{\thesubfigure}{(\Alph{subfigure})} 22 22 \captionsetup{justification=raggedright,singlelinecheck=false} 23 \usepackage{ siunitx}24 \ sisetup{binary-units=true}23 \usepackage{dcolumn} % align decimal points in tables 24 \usepackage{capt-of} 25 25 26 26 \hypersetup{breaklinks=true} … … 235 235 \CFA is a modern, polymorphic, \emph{non-object-oriented} extension of the C programming language. 236 236 This paper discusses the design of the concurrency and parallelism features in \CFA, and the concurrent runtime-system. 237 These features are created from scratch as ISO C lacks concurrency, relying largely on the pthreads library .237 These features are created from scratch as ISO C lacks concurrency, relying largely on the pthreads library for concurrency. 238 238 Coroutines and lightweight (user) threads are introduced into the language. 239 239 In addition, monitors are added as a high-level mechanism for mutual exclusion and synchronization. 240 240 A unique contribution is allowing multiple monitors to be safely acquired simultaneously. 241 241 All features respect the expectations of C programmers, while being fully integrate with the \CFA polymorphic type-system and other language features. 242 Finally, experimental results are presented to compare the performance of the new features with similar mechanisms in other concurrent programming-languages.242 Finally, experimental results show comparable performance of the new features with similar mechanisms in other concurrent programming-languages. 243 243 }% 244 244 … … 257 257 While the simplest concurrency system is a thread and a lock, this low-level approach is hard to master. 258 258 An easier approach for programmers is to support higher-level constructs as the basis of concurrency. 259 Indeed, for highly productive concurrentprogramming, high-level approaches are much more popular~\cite{Hochstein05}.260 Examples of high-level approaches are task (work) based~\cite{TBB}, implicit threading~\cite{OpenMP}, monitors~\cite{Java}, channels~\cite{CSP,Go}, and message passing~\cite{Erlang,MPI}.259 Indeed, for highly-productive concurrent-programming, high-level approaches are much more popular~\cite{Hochstein05}. 260 Examples of high-level approaches are jobs (thread pool)~\cite{TBB}, implicit threading~\cite{OpenMP}, monitors~\cite{Java}, channels~\cite{CSP,Go}, and message passing~\cite{Erlang,MPI}. 261 261 262 262 The following terminology is used. 263 263 A \newterm{thread} is a fundamental unit of execution that runs a sequence of code and requires a stack to maintain state. 264 Multiple simultaneous threads give rise to \newterm{concurrency}, which requires locking to ensure safe communication and access to shared data. 265 % Correspondingly, concurrency is defined as the concepts and challenges that occur when multiple independent (sharing memory, timing dependencies, \etc) concurrent threads are introduced. 264 Multiple simultaneous threads give rise to \newterm{concurrency}, which requires locking to ensure access to shared data and safe communication. 266 265 \newterm{Locking}, and by extension \newterm{locks}, are defined as a mechanism to prevent progress of threads to provide safety. 267 266 \newterm{Parallelism} is running multiple threads simultaneously. 268 267 Parallelism implies \emph{actual} simultaneous execution, where concurrency only requires \emph{apparent} simultaneous execution. 
269 268 As such, parallelism only affects performance, which is observed through differences in space and/or time at runtime. 270 271 269 Hence, there are two problems to be solved: concurrency and parallelism. 272 270 While these two concepts are often combined, they are distinct, requiring different tools~\cite[\S~2]{Buhr05a}. 273 Concurrency tools handle synchronization and mutual exclusion, while parallelism tools handle performance, costand resource utilization.271 Concurrency tools handle mutual exclusion and synchronization, while parallelism tools handle performance, cost, and resource utilization. 274 272 275 273 The proposed concurrency API is implemented in a dialect of C, called \CFA. 276 The paper discusses how the language features are added to the \CFA translator with respect to parsing, semantic , and type checking, and the corresponding high-performance runtime-library to implement the concurrencyfeatures.274 The paper discusses how the language features are added to the \CFA translator with respect to parsing, semantics, and type checking, and the corresponding high-performance runtime-library to implement the concurrent features. 277 275 278 276 … … 282 280 Extended versions and explanation of the following code examples are available at the \CFA website~\cite{Cforall} or in Moss~\etal~\cite{Moss18}. 283 281 284 \CFA is a nextension of ISO-C, and hence, supports all C paradigms.282 \CFA is a non-object-oriented extension of ISO-C, and hence, supports all C paradigms. 285 283 %It is a non-object-oriented system-language, meaning most of the major abstractions have either no runtime overhead or can be opted out easily. 286 Like C, the b asics of \CFA revolve aroundstructures and routines.284 Like C, the building blocks of \CFA are structures and routines. 287 285 Virtually all of the code generated by the \CFA translator respects C memory layouts and calling conventions. 288 While \CFA is not an object-oriented language, lacking the concept of a receiver (\eg @this@) and nominal inheritance-relationships, C does have a notion of objects: ``region of data storage in the execution environment, the contents of which can represent values''~\cite[3.15]{C11}.289 While some \CFA features are common in object-oriented programming -languages, they are an independent capability allowing \CFA to adopt them while retaining a procedural paradigm.286 While \CFA is not object oriented, lacking the concept of a receiver (\eg @this@) and nominal inheritance-relationships, C does have a notion of objects: ``region of data storage in the execution environment, the contents of which can represent values''~\cite[3.15]{C11}. 287 While some \CFA features are common in object-oriented programming, they are independent capabilities allowing \CFA to adopt them while maintaining a procedural paradigm. 290 288 291 289 … … 296 294 int x = 1, y = 2, z = 3; 297 295 int * p1 = &x, ** p2 = &p1, *** p3 = &p2, $\C{// pointers to x}$ 298 `&` r1 = x, `&&` r2 = r1,`&&&` r3 = r2; $\C{// references to x}$296 `&` r1 = x, `&&` r2 = r1, `&&&` r3 = r2; $\C{// references to x}$ 299 297 int * p4 = &z, `&` r4 = z; 300 298 … … 411 409 \end{cquote} 412 410 Overloading is important for \CFA concurrency since the runtime system relies on creating different types to represent concurrency objects. 413 Therefore, overloading is necessary to prevent the need for long prefixes and other naming conventions to prevent name clashes. 414 As seen in Section~\ref{basics}, routine @main@ is heavily overloaded. 
415 416 Variable overloading is useful in the parallel semantics of the @with@ statement for fields with the same name: 411 Therefore, overloading eliminates long prefixes and other naming conventions to prevent name clashes. 412 As seen in Section~\ref{s:Concurrency}, routine @main@ is heavily overloaded. 413 For example, variable overloading is useful in the parallel semantics of the @with@ statement for fields with the same name: 417 414 \begin{cfa} 418 415 struct S { int `i`; int j; double m; } s; … … 428 425 } 429 426 \end{cfa} 430 For parallel semantics, both @s.i@ and @t.i@ are visible the same type, so only @i@ is ambiguous without qualification.427 For parallel semantics, both @s.i@ and @t.i@ are visible with the same type, so only @i@ is ambiguous without qualification. 431 428 432 429 … … 439 436 \begin{tabular}{@{}ll@{\hspace{\parindentlnth}}|@{\hspace{\parindentlnth}}l@{}} 440 437 \begin{cfa} 441 int ++? 442 int ?++ 443 int `?+?` 438 int ++?(int op); 439 int ?++(int op); 440 int `?+?`(int op1, int op2); 444 441 int ?<=?(int op1, int op2); 445 442 int ?=? (int & op1, int op2); … … 470 467 471 468 469 \subsection{Constructors / Destructors} 470 471 Object lifetime is a challenge in non-managed programming languages. 472 \CFA responds with \CC-like constructors and destructors: 473 \begin{cfa} 474 struct VLA { int len, * data; }; $\C{// variable length array of integers}$ 475 void ?{}( VLA & vla ) with ( vla ) { len = 10; data = alloc( len ); } $\C{// default constructor}$ 476 void ?{}( VLA & vla, int size, char fill ) with ( vla ) { len = size; data = alloc( len, fill ); } // initialization 477 void ?{}( VLA & vla, VLA other ) { vla.len = other.len; vla.data = other.data; } $\C{// copy, shallow}$ 478 void ^?{}( VLA & vla ) with ( vla ) { free( data ); } $\C{// destructor}$ 479 { 480 VLA x, y = { 20, 0x01 }, z = y; $\C{// z points to y}$ 481 // $\LstCommentStyle{\color{red}\ \ \ x\{\};\ \ \ \ \ \ \ \ \ y\{ 20, 0x01 \};\ \ \ \ \ \ \ \ \ \ z\{ z, y \};\ \ \ \ \ \ \ implicit calls}$ 482 ^x{}; $\C{// deallocate x}$ 483 x{}; $\C{// reallocate x}$ 484 z{ 5, 0xff }; $\C{// reallocate z, not pointing to y}$ 485 ^y{}; $\C{// deallocate y}$ 486 y{ x }; $\C{// reallocate y, points to x}$ 487 x{}; $\C{// reallocate x, not pointing to y}$ 488 } // $\LstCommentStyle{\color{red}\^{}z\{\};\ \ \^{}y\{\};\ \ \^{}x\{\};\ \ \ implicit calls}$ 489 \end{cfa} 490 Like \CC, construction is implicit on allocation (stack/heap) and destruction is implicit on deallocation. 491 The object and all their fields are constructed/destructed. 492 \CFA also provides @new@ and @delete@, which behave like @malloc@ and @free@, in addition to constructing and destructing objects: 493 \begin{cfa} 494 { 495 ... struct S s = {10}; ... $\C{// allocation, call constructor}$ 496 } $\C{// deallocation, call destructor}$ 497 struct S * s = new(); $\C{// allocation, call constructor}$ 498 ... 499 delete( s ); $\C{// deallocation, call destructor}$ 500 \end{cfa} 501 \CFA concurrency uses object lifetime as a means of mutual exclusion and/or synchronization. 
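To illustrate the lifetime claim above, here is a minimal sketch of a scoped guard; the @lock@ type and the @acquire@/@release@ routines are hypothetical placeholders, not anything defined in this changeset:
\begin{cfa}
lock shared_lock;                                            $\C{// hypothetical lock type/object}$
struct guard { lock * l; };
void ?{}( guard & g, lock & l ) { g.l = &l; acquire( l ); }  $\C{// constructor acquires}$
void ^?{}( guard & g ) { release( *g.l ); }                  $\C{// destructor releases}$
{
	guard g = { shared_lock };                               $\C{// acquired at construction}$
	// ... critical section ...
}                                                            $\C{// released implicitly at scope exit}$
\end{cfa}
Because construction and destruction are implicit at allocation and deallocation, the critical section cannot exit without releasing the lock.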
502 503 472 504 \subsection{Parametric Polymorphism} 473 505 \label{s:ParametricPolymorphism} 474 506 475 The signature feature of \CFA is parametric-polymorphic routines~\cite{ } with routines generalized using a @forall@ clause (giving the language its name), which allow separately compiled routines to support generic usage over multiple types.507 The signature feature of \CFA is parametric-polymorphic routines~\cite{Cforall} with routines generalized using a @forall@ clause (giving the language its name), which allow separately compiled routines to support generic usage over multiple types. 476 508 For example, the following sum routine works for any type that supports construction from 0 and addition: 477 509 \begin{cfa} … … 486 518 int i = sum( sa, 5 ); $\C{// use S's 0 construction and +}$ 487 519 \end{cfa} 520 The builtin type @zero_t@ (and @one_t@) overload constant 0 (and 1) for a new types, where both 0 and 1 have special meaning in C. 488 521 489 522 \CFA provides \newterm{traits} to name a group of type assertions, where the trait name allows specifying the same set of assertions in multiple locations, preventing repetition mistakes at each routine declaration: … … 501 534 502 535 Assertions can be @otype@ or @dtype@. 503 @otype@ refers to a ``complete'' object, \ie an object has a size, default constructor, copy constructor, destructor and an assignment operator.504 @dtype@ only guarantees an objecthas a size and alignment.505 506 Using the return type for discrimination, it is possible to write a type-safe @alloc@ based on the C @malloc@:536 @otype@ refers to a \emph{complete} object, \ie an object has a size, alignment, default constructor, copy constructor, destructor and an assignment operator. 537 @dtype@ refers to an \emph{incomplete} object, \ie, an object only has a size and alignment. 538 539 Using the return type for overload discrimination, it is possible to write a type-safe @alloc@ based on the C @malloc@: 507 540 \begin{cfa} 508 541 forall( dtype T | sized(T) ) T * alloc( void ) { return (T *)malloc( sizeof(T) ); } … … 514 547 515 548 516 \subsection{Constructors / Destructors} 517 518 Object lifetime is a challenge in non-managed programming languages. 519 \CFA responds with \CC-like constructors and destructors: 520 \begin{cfa} 521 struct VLA { int len, * data; }; $\C{// variable length array of integers}$ 522 void ?{}( VLA & vla ) with ( vla ) { len = 10; data = alloc( len ); } $\C{// default constructor}$ 523 void ?{}( VLA & vla, int size, char fill ) with ( vla ) { len = size; data = alloc( len, fill ); } // initialization 524 void ?{}( VLA & vla, VLA other ) { vla.len = other.len; vla.data = other.data; } $\C{// copy, shallow}$ 525 void ^?{}( VLA & vla ) with ( vla ) { free( data ); } $\C{// destructor}$ 526 { 527 VLA x, y = { 20, 0x01 }, z = y; $\C{// z points to y}$ 528 // x{}; y{ 20, 0x01 }; z{ z, y }; 529 ^x{}; $\C{// deallocate x}$ 530 x{}; $\C{// reallocate x}$ 531 z{ 5, 0xff }; $\C{// reallocate z, not pointing to y}$ 532 ^y{}; $\C{// deallocate y}$ 533 y{ x }; $\C{// reallocate y, points to x}$ 534 x{}; $\C{// reallocate x, not pointing to y}$ 535 // ^z{}; ^y{}; ^x{}; 536 } 537 \end{cfa} 538 Like \CC, construction is implicit on allocation (stack/heap) and destruction is implicit on deallocation. 539 The object and all their fields are constructed/destructed. 
540 \CFA also provides @new@ and @delete@, which behave like @malloc@ and @free@, in addition to constructing and destructing objects: 541 \begin{cfa} 542 { struct S s = {10}; $\C{// allocation, call constructor}$ 543 ... 544 } $\C{// deallocation, call destructor}$ 545 struct S * s = new(); $\C{// allocation, call constructor}$ 546 ... 547 delete( s ); $\C{// deallocation, call destructor}$ 548 \end{cfa} 549 \CFA concurrency uses object lifetime as a means of synchronization and/or mutual exclusion. 550 551 552 \section{Concurrency Basics}\label{basics} 549 \section{Concurrency} 550 \label{s:Concurrency} 553 551 554 552 At its core, concurrency is based on multiple call-stacks and scheduling threads executing on these stacks. 555 553 Multiple call stacks (or contexts) and a single thread of execution, called \newterm{coroutining}~\cite{Conway63,Marlin80}, does \emph{not} imply concurrency~\cite[\S~2]{Buhr05a}. 556 In coroutining, the single thread is self-scheduling across the stacks, so execution is deterministic, \ie given fixed inputs, the execution path to the outputs is fixed and predictable. 554 In coroutining, the single thread is self-scheduling across the stacks, so execution is deterministic, \ie the execution path from input to output is fixed and predictable. 557 555 A \newterm{stackless} coroutine executes on the caller's stack~\cite{Python} but this approach is restrictive, \eg preventing modularization and supporting only iterator/generator-style programming; 558 556 a \newterm{stackfull} coroutine executes on its own stack, allowing full generality. 559 Only stackfull coroutines are a stepping-stone to concurrency. 560 561 The transition to concurrency, even for execution with a single thread and multiple stacks, occurs when coroutines also context switch to a scheduling oracle, introducing non-determinism from the coroutine perspective~\cite[\S~3]{Buhr05a}. 557 Only stackfull coroutines are a stepping stone to concurrency. 558 559 The transition to concurrency, even for execution with a single thread and multiple stacks, occurs when coroutines also context switch to a \newterm{scheduling oracle}, introducing non-determinism from the coroutine perspective~\cite[\S~3]{Buhr05a}. 562 560 Therefore, a minimal concurrency system is possible using coroutines (see Section \ref{coroutine}) in conjunction with a scheduler to decide where to context switch next. 563 561 The resulting execution system now follows a cooperative threading-model, called \newterm{non-preemptive scheduling}. … … 591 589 \subsection{Coroutines: A Stepping Stone}\label{coroutine} 592 590 593 591 While the focus of this discussion is concurrency and parallelism, it is important to address coroutines, which are a significant building block of a concurrency system. 591 While the focus of this discussion is concurrency and parallelism, it is important to address coroutines, which are a significant building block of a concurrency system (but not concurrent among themselves). 594 592 Coroutines are generalized routines allowing execution to be temporarily suspended and later resumed. 595 593 Hence, unlike a normal routine, a coroutine may not terminate when it returns to its caller, allowing it to be restarted with the values and execution location present at the point of suspension. 596 594 Therefore, the core \CFA coroutine-API has two fundamental features: independent call-stacks and @suspend@/@resume@ operations.
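A minimal skeleton of just these two features, using only the API named above (the type @C@ and routine @go@ are illustrative names, not part of the diff):
\begin{cfa}
coroutine C {};
void main( C & c ) {       $\C{// coroutine main, runs on c's own stack}$
	suspend();             $\C{// context switch back to the last resumer}$
}
void go( C & c ) {
	resume( c );           $\C{// first resume: create the stack, start main}$
	resume( c );           $\C{// restart main after its suspend; main then returns}$
}
\end{cfa}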
599 597 600 For example, a problem made easier with coroutines is unbounded generators, \eg generating an infinite sequence of Fibonacci numbers , where Figure~\ref{f:C-fibonacci} shows conventional approaches for writing a Fibonacci generator in C.598 For example, a problem made easier with coroutines is unbounded generators, \eg generating an infinite sequence of Fibonacci numbers 601 599 \begin{displaymath} 602 600 \mathsf{fib}(n) = \left \{ … … 608 606 \right. 609 607 \end{displaymath} 610 Figure~\ref{f:GlobalVariables} illustrates the following problems: 611 unique unencapsulated global variables necessary to retain state between calls; 612 only one Fibonacci generator; 613 execution state must be explicitly retained via explicit state variables. 614 Figure~\ref{f:ExternalState} addresses these issues: 615 unencapsulated program global variables become encapsulated structure variables; 616 unique global variables are replaced by multiple Fibonacci objects; 617 explicit execution state is removed by precomputing the first two Fibonacci numbers and returning $\mathsf{fib}(n-2)$. 608 where Figure~\ref{f:C-fibonacci} shows conventional approaches for writing a Fibonacci generator in C. 609 Figure~\ref{f:GlobalVariables} illustrates the following problems: unique unencapsulated global variables necessary to retain state between calls, only one Fibonacci generator, and execution state must be explicitly retained via explicit state variables. 610 Figure~\ref{f:ExternalState} addresses these issues: unencapsulated program global variables become encapsulated structure variables, unique global variables are replaced by multiple Fibonacci objects, and explicit execution state is removed by precomputing the first two Fibonacci numbers and returning $\mathsf{fib}(n-2)$. 618 611 619 612 \begin{figure} … … 723 716 \subfloat[1 State, internal variables]{\label{f:Coroutine1State}\usebox\myboxB} 724 717 \caption{\CFA Coroutine Fibonacci Implementations} 725 \label{f: fibonacci-cfa}718 \label{f:cfa-fibonacci} 726 719 \end{figure} 727 720 728 721 Using a coroutine, it is possible to express the Fibonacci formula directly without any of the C problems. 729 Figure~\ref{f:Coroutine3States} creates a @coroutine@ type: 730 \begin{cfa} 731 `coroutine` Fib { int fn; }; 732 \end{cfa} 733 which provides communication, @fn@, for the \newterm{coroutine main}, @main@, which runs on the coroutine stack, and possibly multiple interface routines @next@. 722 Figure~\ref{f:Coroutine3States} creates a @coroutine@ type, @`coroutine` Fib { int fn; }@, which provides communication, @fn@, for the \newterm{coroutine main}, @main@, which runs on the coroutine stack, and possibly multiple interface routines, \eg @next@. 734 723 Like the structure in Figure~\ref{f:ExternalState}, the coroutine type allows multiple instances, where instances of this type are passed to the (overloaded) coroutine main. 735 The coroutine main's stack holds the state for the next generation, @f1@ and @f2@, and the code has the three suspend points, representing the three states in the Fibonacci formula, to context switch back to the caller's resume.724 The coroutine main's stack holds the state for the next generation, @f1@ and @f2@, and the code has the three suspend points, representing the three states in the Fibonacci formula, to context switch back to the caller's @resume@. 
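For reference, a typical driver for such a generator, assuming the @Fib@ type and @next@ interface routine just described (since \CFA is a C superset, plain @printf@ suffices):
\begin{cfa}
int main() {
	Fib f1, f2;                                       $\C{// two independent generators}$
	for ( int i = 0; i < 10; i += 1 ) {
		printf( "%d %d\n", next( f1 ), next( f2 ) );  $\C{// resume each coroutine for its next value}$
	}
}
\end{cfa}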
736 725 The interface routine @next@, takes a Fibonacci instance and context switches to it using @resume@; 737 726 on restart, the Fibonacci field, @fn@, contains the next value in the sequence, which is returned. 738 The first @resume@ is special because it cocalls the coroutine at its coroutine main and allocates thestack;727 The first @resume@ is special because it allocates the coroutine stack and cocalls its coroutine main on that stack; 739 728 when the coroutine main returns, its stack is deallocated. 740 729 Hence, @Fib@ is an object at creation, transitions to a coroutine on its first resume, and transitions back to an object when the coroutine main finishes. … … 758 747 \end{quote} 759 748 The example takes advantage of resuming a coroutine in the constructor to prime the loops so the first character sent for formatting appears inside the nested loops. 760 The destruction provides a newline if formatted text ends with a full line.749 The destruction provides a newline, if formatted text ends with a full line. 761 750 Figure~\ref{f:CFmt} shows the C equivalent formatter, where the loops of the coroutine are flatten (linearized) and rechecked on each call because execution location is not retained between calls. 751 (Linearized code is the bane of device drivers.) 762 752 763 753 \begin{figure} … … 843 833 \end{figure} 844 834 845 The previous examples are \newterm{asymmetric (semi) coroutine}s because one coroutine always calls a resuming routine for another coroutine, and the resumed coroutine always suspends back to its last resumer, similar to call/return for normal routines 846 However, there is no stack growth because @resume@/@suspend@ context switch to existing stack-frames rather than create new ones.847 \newterm{Symmetric (full) coroutine}s have a coroutine call a resuming routine for another coroutine, which eventually forms a resuming-call cycle.835 The previous examples are \newterm{asymmetric (semi) coroutine}s because one coroutine always calls a resuming routine for another coroutine, and the resumed coroutine always suspends back to its last resumer, similar to call/return for normal routines. 836 However, @resume@ and @suspend@ context switch among existing stack-frames, rather than create new ones so there is no stack growth. 837 \newterm{Symmetric (full) coroutine}s have a coroutine call to a resuming routine for another coroutine, and its coroutine main has a call to another resuming routine, which eventually forms a resuming-call cycle. 848 838 (The trivial cycle is a coroutine resuming itself.) 849 839 This control flow is similar to recursion for normal routines, but again there is no stack growth from the context switch. … … 927 917 Figure~\ref{f:ProdCons} shows a producer/consumer symmetric-coroutine performing bi-directional communication. 928 918 Since the solution involves a full-coroutining cycle, the program main creates one coroutine in isolation, passes this coroutine to its partner, and closes the cycle at the call to @start@. 929 The @start@ routine communicates both the number of elements to be produced and the consumer into the producer's coroutine 919 The @start@ routine communicates both the number of elements to be produced and the consumer into the producer's coroutine-structure. 930 920 Then the @resume@ to @prod@ creates @prod@'s stack with a frame for @prod@'s coroutine main at the top, and context switches to it. 
931 921 @prod@'s coroutine main starts, creates local variables that are retained between coroutine activations, and executes $N$ iterations, each generating two random values, calling the consumer to deliver the values, and printing the status returned from the consumer. … … 933 923 The producer call to @delivery@ transfers values into the consumer's communication variables, resumes the consumer, and returns the consumer status. 934 924 For the first resume, @cons@'s stack is initialized, creating local variables retained between subsequent activations of the coroutine. 935 The consumer iterates until the @done@ flag is set, prints, increments status, and calls back to the producer via @payment@, and on return from @payment@, prints the receipt from the producer and increments @money@ (inflation). 936 The call from the consumer to the @payment@ introduces the cycle between producer and consumer. 925 The consumer iterates until the @done@ flag is set, prints the values delivered by the producer, increments status, and calls back to the producer via @payment@, and on return from @payment@, prints the receipt from the producer and increments @money@ (inflation). 926 The call from the consumer to @payment@ introduces the cycle between producer and consumer. 937 927 When @payment@ is called, the consumer copies values into the producer's communication variable and a resume is executed. 938 928 The context switch restarts the producer at the point where it was last context switched, so it continues in @delivery@ after the resume. … … 959 949 960 950 Object-oriented inheritance provides extra fields and code in a restricted context, but it requires programmers to explicitly perform the inheritance: 961 \begin{cfa} 962 struct mycoroutine $\textbf{\textsf{inherits}}$ baseCoroutine { ... } 951 \begin{cfa}[morekeywords={class,inherits}] 952 class mycoroutine inherits baseCoroutine { ... } 963 953 \end{cfa} 964 954 and the programming language (and possibly its tool set, \eg debugger) may need to understand @baseCoroutine@ because of the stack. 965 Furthermore, the execution of constructors/destructors is in the wrong order for certain operations, \eg for threads; 966 \eg, if the thread is implicitly started, it must start \emph{after} all constructors, because the thread relies on a completely initialized object, but the inherited constructor runs \emph{before} the derived. 955 Furthermore, the execution of constructors/destructors is in the wrong order for certain operations. 956 For example, for threads if the thread is implicitly started, it must start \emph{after} all constructors, because the thread relies on a completely initialized object, but the inherited constructor runs \emph{before} the derived. 967 957 968 958 An alternative is composition: … … 984 974 symmetric_coroutine<>::yield_type 985 975 \end{cfa} 986 976 Similarly, the canonical threading paradigm is often based on routine pointers, \eg @pthread@~\cite{pthreads}, \Csharp~\cite{Csharp}, Go~\cite{Go}, and Scala~\cite{Scala}. 976 Similarly, the canonical threading paradigm is often based on routine pointers, \eg @pthreads@~\cite{Butenhof97}, \Csharp~\cite{Csharp}, Go~\cite{Go}, and Scala~\cite{Scala}. 987 977 However, the generic thread-handle (identifier) is limited (few operations), unless it is wrapped in a custom type. 988 978 \begin{cfa} … … 1001 991 Note, the type @coroutine_t@ must be an abstract handle to the coroutine, because the coroutine descriptor and its stack are non-copyable.
1002 992 Copying the coroutine descriptor results in copies being out of date with the current state of the stack. 1003 Correspondingly, copying the stack results in copies being out of date with coroutine descriptor, and pointers in the stack being out of date to data on the stack. 993 Correspondingly, copying the stack results in copies being out of date with the coroutine descriptor, and pointers in the stack being out of date to data on the stack. 1004 994 (There is no mechanism in C to find all stack-specific pointers and update them as part of a copy.) 1005 995 … … 1015 1005 Furthermore, implementing coroutines without language support also displays the power of a programming language. 1016 1006 While this is ultimately the option used for idiomatic \CFA code, coroutines and threads can still be constructed without using the language support. 1017 The reserved keyword eases use for the common cases. 1007 The reserved keyword simply eases use for the common cases. 1018 1008 1019 1009 Part of the mechanism to generalize coroutines is using a \CFA trait, which defines a coroutine as anything satisfying the trait @is_coroutine@, and this trait is used to restrict coroutine-manipulation routines: … … 1030 1020 The @main@ routine has no return value or additional parameters because the coroutine type allows an arbitrary number of interface routines with corresponding arbitrary typed input/output values versus fixed ones. 1031 1021 The generic routines @suspend@ and @resume@ can be redefined, but any object passed to them is a coroutine since it must satisfy the @is_coroutine@ trait to compile. 1032 The advantage of this approach is that users can easily create different types of coroutines, for example, changing the memory layout of a coroutine is trivial when implementing the @get_coroutine@ routine, and possibly redefining @suspend@ and @resume@. 1022 The advantage of this approach is that users can easily create different types of coroutines, \eg changing the memory layout of a coroutine is trivial when implementing the @get_coroutine@ routine, and possibly redefining @suspend@ and @resume@. 1033 1023 The \CFA keyword @coroutine@ implicitly implements the getter and forward declarations required for implementing the coroutine main: 1034 1024 \begin{cquote} … … 1094 1084 \end{tabular} 1095 1085 \end{cquote} 1096 (The qualifier @mutex@ for the destructor parameter is discussed in Section~\ref{s:Monitors}.) 1086 (The qualifier @mutex@ for the destructor parameter is discussed in Section~\ref{s:Monitor}.) 1097 1087 Like a coroutine, the statically-typed @main@ routine is the starting point (first stack frame) of a user thread.
1098 1088 The difference is that a coroutine borrows a thread from its caller, so the first thread resuming a coroutine creates an instance of @main@; 1099 1089 whereas, a user thread receives its own thread from the runtime system, which starts in @main@ at some point after the thread constructor is run.\footnote{ 1100 The \lstinline@main@ routine is already a special routine in C (where the program begins), so it is a natural extension of the semantics to use overloading to declare mains for different coroutines/threads (the normal main being the main of the initial thread).} 1090 The \lstinline@main@ routine is already a special routine in C, \ie where the program's initial thread begins, so it is a natural extension of this semantics to use overloading to declare \lstinline@main@s for user coroutines and threads.} 1101 1091 No return value or additional parameters are necessary for this routine because the task type allows an arbitrary number of interface routines with corresponding arbitrary typed input/output values. … … 1175 1165 The program uses heap-based threads because each thread needs different constructor values. 1176 1166 (Python provides a simple iteration mechanism to initialize array elements to different values allowing stack allocation.) 1177 The allocation/deallocation pattern appears unusual because allocated objects are immediately deleted without any intervening code. 1167 The allocation/deallocation pattern appears unusual because allocated objects are immediately deallocated without any intervening code. 1178 1168 However, for threads, the deletion provides implicit synchronization, which is the intervening code. 1179 1169 While the subtotals are added in linear order rather than completion order, which slightly inhibits concurrency, the computation is restricted by the critical-path thread (\ie the thread that takes the longest), and so any inhibited concurrency is very small as totalling the subtotals is trivial. … … 1189 1179 void main( Adder & adder ) with( adder ) { 1190 1180 subtotal = 0; 1191 for ( int c = 0; c < cols; c += 1 ) { 1192 subtotal += row[c]; 1193 } 1181 for ( int c = 0; c < cols; c += 1 ) { subtotal += row[c]; } 1194 1182 } 1195 1183 int main() { … … 1216 1204 1217 1205 Uncontrolled non-deterministic execution is meaningless. 1218 To reestablish meaningful execution requires mechanisms to reintroduce determinism (\ie restrict non-determinism), called mutual exclusion and synchronization, where mutual exclusion is an access-control mechanism on data shared by threads, and synchronization is a timing relationship among threads~\cite[\S~4]{Buhr05a}. 1206 To reestablish meaningful execution requires mechanisms to reintroduce determinism, \ie restrict non-determinism, called mutual exclusion and synchronization, where mutual exclusion is an access-control mechanism on data shared by threads, and synchronization is a timing relationship among threads~\cite[\S~4]{Buhr05a}. 1219 1207 Since many deterministic challenges appear with the use of mutable shared state, some languages/libraries disallow it, \eg Erlang~\cite{Erlang}, Haskell~\cite{Haskell}, Akka~\cite{Akka} (Scala).
In these paradigms, interaction among concurrent objects is performed by stateless message-passing~\cite{Thoth,Harmony,V-Kernel} or other paradigms closely related to networking concepts (\eg channels~\cite{CSP,Go}). 1221 However, in call/return-based languages, these approaches force a clear distinction (\ie introduce a new programming paradigm) between regular and concurrent computation (\ie routine call versus message passing). 1208 In these paradigms, interaction among concurrent objects is performed by stateless message-passing~\cite{Thoth,Harmony,V-Kernel} or other paradigms closely related to networking concepts, \eg channels~\cite{CSP,Go}. 1209 However, in call/return-based languages, these approaches force a clear distinction, \ie introduce a new programming paradigm between regular and concurrent computation, \eg routine call versus message passing. 1222 1210 Hence, a programmer must learn and manipulate two sets of design patterns. 1223 1211 While this distinction can be hidden away in library code, effective use of the library still has to take both paradigms into account. 1224 1212 In contrast, approaches based on stateful models more closely resemble the standard call/return programming-model, resulting in a single programming paradigm. 1225 1213 1226 At the lowest level, concurrent control is implemented by atomic operations, upon which different kinds of locks mechanism are constructed, \eg semaphores~\cite{Dijkstra68b}, barriers, and path expressions~\cite{Campbell74}. 1214 At the lowest level, concurrent control is implemented by atomic operations, upon which different kinds of locking mechanisms are constructed, \eg semaphores~\cite{Dijkstra68b}, barriers, and path expressions~\cite{Campbell74}. 1227 1215 However, for productivity it is always desirable to use the highest-level construct that provides the necessary efficiency~\cite{Hochstein05}. 1228 1216 A newer approach for restricting non-determinism is transactional memory~\cite{Herlihy93}. … … 1244 1232 However, many solutions exist for mutual exclusion, which vary in terms of performance, flexibility and ease of use. 1245 1233 Methods range from low-level locks, which are fast and flexible but require significant attention for correctness, to higher-level concurrency techniques, which sacrifice some performance to improve ease of use. 1246 Ease of use comes by either guaranteeing some problems cannot occur (\eg deadlock free), or by offering a more explicit coupling between shared data and critical section. 1247 For example, the \CC @std::atomic<T>@ offers an easy way to express mutual-exclusion on a restricted set of operations (\eg reading/writing) for numerical types. 1234 Ease of use comes by either guaranteeing some problems cannot occur, \eg deadlock free, or by offering a more explicit coupling between shared data and critical section. 1235 For example, the \CC @std::atomic<T>@ offers an easy way to express mutual-exclusion on a restricted set of operations, \eg reading/writing, for numerical types. 1248 1236 However, a significant challenge with locks is composability because it takes careful organization for multiple locks to be used while preventing deadlock. 1249 1237 Easing composability is another feature higher-level mutual-exclusion mechanisms can offer. … … 1254 1242 Synchronization enforces relative ordering of execution, and synchronization tools provide numerous mechanisms to establish these timing relationships.
1255 1243 Low-level synchronization primitives offer good performance and flexibility at the cost of ease of use; 1256 higher-level mechanisms often simplify usage by adding better coupling between synchronization and data (\eg message passing), or offering a simpler solution to otherwise involved challenges, \eg barrier lock.1244 higher-level mechanisms often simplify usage by adding better coupling between synchronization and data, \eg message passing, or offering a simpler solution to otherwise involved challenges, \eg barrier lock. 1257 1245 Often synchronization is used to order access to a critical section, \eg ensuring a reader thread is the next kind of thread to enter a critical section. 1258 1246 If a writer thread is scheduled for next access, but another reader thread acquires the critical section first, that reader has \newterm{barged}. … … 1265 1253 1266 1254 1267 \section{Monitor s}1268 \label{s:Monitor s}1255 \section{Monitor} 1256 \label{s:Monitor} 1269 1257 1270 1258 A \textbf{monitor} is a set of routines that ensure mutual exclusion when accessing shared state. … … 1272 1260 The strong association with the call/return paradigm eases programmability, readability and maintainability, at a slight cost in flexibility and efficiency. 1273 1261 1274 Note, like coroutines/threads, both locks and monitors require an abstract handle to reference them, because at their core, both mechanisms are manipulating non-copyable shared 1262 Note, like coroutines/threads, both locks and monitors require an abstract handle to reference them, because at their core, both mechanisms are manipulating non-copyable shared-state. 1275 1263 Copying a lock is insecure because it is possible to copy an open lock and then use the open copy when the original lock is closed to simultaneously access the shared data. 1276 1264 Copying a monitor is secure because both the lock and shared data are copies, but copying the shared data is meaningless because it no longer represents a unique entity. … … 1326 1314 \end{cfa} 1327 1315 1328 Mandatory monitor qualifiers have the benefit of being self-document ed, but requiring both @mutex@ and \lstinline[morekeywords=nomutex]@nomutex@ for all monitor parameter is redundant.1316 Mandatory monitor qualifiers have the benefit of being self-documenting, but requiring both @mutex@ and \lstinline[morekeywords=nomutex]@nomutex@ for all monitor parameter is redundant. 1329 1317 Instead, one of qualifier semantics can be the default, and the other required. 1330 For example, assume the safe @mutex@ option for a monitor parameterbecause assuming \lstinline[morekeywords=nomutex]@nomutex@ may cause subtle errors.1331 On the other hand, assuming \lstinline[morekeywords=nomutex]@nomutex@ is the \emph{normal} parameter behaviour, stating explicitly ``this parameter is not special''.1318 For example, assume the safe @mutex@ qualifier for all monitor parameters because assuming \lstinline[morekeywords=nomutex]@nomutex@ may cause subtle errors. 1319 On the other hand, assuming \lstinline[morekeywords=nomutex]@nomutex@ is the \emph{normal} parameter behaviour, stating explicitly \emph{this parameter is not special}. 1332 1320 Providing a default qualifier implies knowing whether a parameter is a monitor. 1333 1321 Since \CFA relies heavily on traits as an abstraction mechanism, the distinction between a type that is a monitor and a type that looks like a monitor can become blurred. 
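For orientation, the basic monitor pattern under discussion is sketched below with illustrative names (@Acct@, @deposit@):
\begin{cfa}
monitor Acct { int balance; };
void deposit( Acct & mutex a, int amt ) {  $\C{// mutex parameter: monitor lock acquired on entry}$
	a.balance += amt;
}                                          $\C{// lock released on return}$
\end{cfa}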
… … 1375 1363 \end{cfa} 1376 1364 (While object-oriented monitors can be extended with a mutex qualifier for multiple-monitor members, no prior example of this feature could be found.) 1377 In practice, writing multi-locking routines that do not deadlock sis tricky.1365 In practice, writing multi-locking routines that do not deadlock is tricky. 1378 1366 Having language support for such a feature is therefore a significant asset for \CFA. 1379 1367 1380 1368 The capability to acquire multiple locks before entering a critical section is called \newterm{bulk acquire}. 1381 In previous example, \CFA guarantees the order of acquisition is consistent across calls to different routines using the same monitors as arguments.1369 In the previous example, \CFA guarantees the order of acquisition is consistent across calls to different routines using the same monitors as arguments. 1382 1370 This consistent ordering means acquiring multiple monitors is safe from deadlock. 1383 1371 However, users can force the acquiring order. … … 1395 1383 In the calls to @bar@ and @baz@, the monitors are acquired in opposite order. 1396 1384 1397 However, such use leads to lock acquiring order problems resulting in deadlock~\cite{Lister77}, where detecting it requires dynamic ally tracking of monitor calls, and dealing with it requires implementrollback semantics~\cite{Dice10}.1385 However, such use leads to lock acquiring order problems resulting in deadlock~\cite{Lister77}, where detecting it requires dynamic tracking of monitor calls, and dealing with it requires rollback semantics~\cite{Dice10}. 1398 1386 In \CFA, safety is guaranteed by using bulk acquire of all monitors to shared objects, whereas other monitor systems provide no aid. 1399 1387 While \CFA provides only a partial solution, the \CFA partial solution handles many useful cases. … … 1406 1394 } 1407 1395 \end{cfa} 1408 This example shows a trivial solution to the bank-account transfer problem ~\cite{BankTransfer}.1396 This example shows a trivial solution to the bank-account transfer problem. 1409 1397 Without multi- and bulk acquire, the solution to this problem requires careful engineering. 1410 1398 … … 1415 1403 Like Java, \CFA offers an alternative @mutex@ statement to reduce refactoring and naming. 1416 1404 \begin{cquote} 1417 \begin{tabular}{@{}c|@{\hspace{\parindentlnth}}c@{}} 1418 routine call & @mutex@ statement \\ 1405 \begin{tabular}{@{}l@{\hspace{3\parindentlnth}}l@{}} 1419 1406 \begin{cfa} 1420 1407 monitor M {}; … … 1436 1423 1437 1424 \end{cfa} 1425 \\ 1426 \multicolumn{1}{c}{\textbf{routine call}} & \multicolumn{1}{c}{\lstinline@mutex@ \textbf{statement}} 1438 1427 \end{tabular} 1439 1428 \end{cquote} 1440 1429 1441 1430 1442 \section{ InternalScheduling}1443 \label{s: InternalScheduling}1431 \section{Scheduling} 1432 \label{s:Scheduling} 1444 1433 1445 1434 While monitor mutual-exclusion provides safe access to shared data, the monitor data may indicate that a thread accessing it cannot proceed. … … 1450 1439 \newterm{Internal scheduling} is characterized by each thread entering the monitor and making an individual decision about proceeding or blocking, while \newterm{external scheduling} is characterized by an entering thread making a decision about proceeding for itself and on behalf of other threads attempting entry. 
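A condensed one-slot version of the internal-scheduling pattern described next (a sketch, not the generic buffer of the figures):
\begin{cfa}
monitor Buffer { condition full, empty; int elem; bool has; };
void insert( Buffer & mutex b, int v ) {
	if ( b.has ) wait( b.full );     $\C{// block producer, implicitly release monitor lock}$
	b.elem = v; b.has = true;
	signal( b.empty );               $\C{// unblock a waiting consumer, if any}$
}
int remove( Buffer & mutex b ) {
	if ( ! b.has ) wait( b.empty );  $\C{// block consumer until an element arrives}$
	b.has = false;
	signal( b.full );                $\C{// unblock a waiting producer, if any}$
	return b.elem;
}
\end{cfa}
Because \CFA monitors prevent barging, the waits can be @if@ statements rather than loops.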
1451 1440 1452 Figure~\ref{f:BBInt} shows a \CFA bounded-buffer with internal scheduling, where producers/consumers enter the monitor, see the buffer is full/empty, and block on an appropriate condition lock, @full@/@empty@. 1441 Figure~\ref{f:BBInt} shows a \CFA generic bounded-buffer with internal scheduling, where producers/consumers enter the monitor, see the buffer is full/empty, and block on an appropriate condition lock, @full@/@empty@. 1453 1442 The @wait@ routine atomically blocks the calling thread and implicitly releases the monitor lock(s) for all monitors in the routine's parameter list. 1454 1443 The appropriate condition lock is signalled to unblock an opposite kind of thread after an element is inserted/removed from the buffer. 1455 1444 Signalling is unconditional, because signalling an empty condition lock does nothing. 1445 1456 1446 Signalling semantics cannot have the signaller and signalled thread in the monitor simultaneously, which means: 1457 1447 \begin{enumerate} … … 1463 1453 The signalling thread blocks but is marked for urgent unblocking at the next scheduling point and the signalled thread continues. 1464 1454 \end{enumerate} 1465 The first approach is too restrictive, as it precludes solving a reasonable class of problems (\eg dating service). 1455 The first approach is too restrictive, as it precludes solving a reasonable class of problems, \eg dating service (see Figure~\ref{f:DatingService}). 1466 1456 \CFA supports the next two semantics as both are useful. 1467 1457 Finally, while it is common to store a @condition@ as a field of the monitor, in \CFA, a @condition@ variable can be created/stored independently. 1468 Furthermore, a condition variable is tied to a \emph{group} of monitors on first use (called \newterm{branding}), which means that using internal scheduling with distinct sets of monitors requires one condition variable per set of monitors. 1458 Furthermore, a condition variable is tied to a \emph{group} of monitors on first use, called \newterm{branding}, which means that using internal scheduling with distinct sets of monitors requires one condition variable per set of monitors. 1469 1459 1470 1460 \begin{figure} … … 1535 1525 \end{figure} 1536 1526 1537 Figure~\ref{f:BBExt} shows a \CFA bounded-buffer with external scheduling, where producers/consumers detecting a full/empty buffer block and prevent more producers/consumers from entering the monitor until the buffer has a free/empty slot. 1527 Figure~\ref{f:BBExt} shows a \CFA generic bounded-buffer with external scheduling, where producers/consumers detecting a full/empty buffer block and prevent more producers/consumers from entering the monitor until the buffer has a free/empty slot. 1538 1528 External scheduling is controlled by the @waitfor@ statement, which atomically blocks the calling thread, releases the monitor lock, and restricts the routine calls that can next acquire mutual exclusion. 1539 1529 If the buffer is full, only calls to @remove@ can acquire the buffer, and if the buffer is empty, only calls to @insert@ can acquire the buffer. 1540 Threads making calls to routines that are currently excluded block outside (external) of the monitor on a calling queue, versus blocking on condition queues inside (internal) of the monitor. 1530 Threads making calls to routines that are currently excluded, block outside (external) of the monitor on a calling queue, versus blocking on condition queues inside (internal) of the monitor.
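The same one-slot buffer in external-scheduling style, using @waitfor@ (again a sketch; the prototype is declared so the accepted routine is visible):
\begin{cfa}
monitor Buffer { int elem; bool has; };
int remove( Buffer & mutex b );           $\C{// prototype visible for waitfor}$
void insert( Buffer & mutex b, int v ) {
	if ( b.has ) waitfor( remove, b );    $\C{// accept only remove until the slot empties}$
	b.elem = v; b.has = true;
}
int remove( Buffer & mutex b ) {
	if ( ! b.has ) waitfor( insert, b );  $\C{// accept only insert until the slot fills}$
	b.has = false;
	return b.elem;
}
\end{cfa}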
% External scheduling is more constrained and explicit, which helps programmers reduce the non-deterministic nature of concurrency. 1532 External scheduling allows users to wait for events from other threads without concern of unrelated events occurring. 1533 The mechanism can be done in terms of control flow, \eg Ada @accept@ or \uC @_Accept@, or in terms of data, \eg Go channels. 1534 While both mechanisms have strengths and weaknesses, this project uses a control-flow mechanism to stay consistent with other language semantics. 1535 Two challenges specific to \CFA for external scheduling are loose object-definitions (see Section~\ref{s:LooseObjectDefinitions}) and multiple-monitor routines (see Section~\ref{s:Multi-MonitorScheduling}). 1541 1536 1542 1537 For internal scheduling, non-blocking signalling (as in the producer/consumer example) is used when the signaller is providing the cooperation for a waiting thread; 1543 1538 the signaller enters the monitor and changes state, detects a waiting thread that can use the state, performs a non-blocking signal on the condition queue for the waiting thread, and exits the monitor to run concurrently. 1544 The waiter unblocks next, takes the state, and exits the monitor. 1539 The waiter unblocks next from the urgent queue, uses/takes the state, and exits the monitor. 1545 1540 Blocking signalling is the reverse, where the waiter is providing the cooperation for the signalling thread; 1546 1541 the signaller enters the monitor, detects a waiting thread providing the necessary state, performs a blocking signal to place it on the urgent queue and unblock the waiter. 1547 The waiter changes state and exits the monitor, and the signaller unblocks next from the urgent queue to take the state. 1548 1549 Figure~\ref{f:DatingService} shows a dating service demonstrating the two forms of signalling: non-blocking and blocking. 1542 The waiter changes state and exits the monitor, and the signaller unblocks next from the urgent queue to use/take the state. 1543 1544 Figure~\ref{f:DatingService} shows a dating service demonstrating non-blocking and blocking signalling. 1550 1545 The dating service matches girl and boy threads with matching compatibility codes so they can exchange phone numbers. 1551 1546 A thread blocks until an appropriate partner arrives. 1552 The complexity is exchanging phone number in the monitor, 1553 While the non-barging monitor prevents a caller from stealing a phone number, the monitor mutual-exclusion property 1554 1555 The dating service is an example of a monitor that cannot be written using external scheduling because: 1556 1557 The example in table \ref{tbl:datingservice} highlights the difference in behaviour. 1558 As mentioned, @signal@ only transfers ownership once the current critical section exits; this behaviour requires additional synchronization when a two-way handshake is needed. 1559 To avoid this explicit synchronization, the @condition@ type offers the @signal_block@ routine, which handles the two-way handshake as shown in the example. 1560 This feature removes the need for a second condition variables and simplifies programming. 1561 Like every other monitor semantic, @signal_block@ uses barging prevention, which means mutual-exclusion is baton-passed both on the front end and the back end of the call to @signal_block@, meaning no other thread can acquire the monitor either before or after the call. 1547 The complexity is exchanging phone numbers in the monitor because of the mutual-exclusion property.
1548 For signal scheduling, the @exchange@ condition is necessary to block the thread finding the match, while the matcher unblocks to take the opposite number, post its phone number, and unblock the partner.
1549 For signal-block scheduling, the implicit urgent-queue replaces the explicit @exchange@-condition and @signal_block@ puts the finding thread on the urgent queue and unblocks the matcher.
1550
1551 The dating service is an example of a monitor that cannot be written using external scheduling because it requires knowledge of calling parameters to make scheduling decisions, and parameters of waiting threads are unavailable;
1552 as well, an arriving thread may not find a partner and must wait, which requires a condition variable, and condition variables imply internal scheduling.
1562 1553
1563 1554 \begin{figure}
…
1576 1567 wait( Girls[ccode] );
1577 1568 GirlPhNo = phNo;
1578 exchange.signal();
1569 `exchange.signal();`
1579 1570 } else {
1580 1571 GirlPhNo = phNo;
1581 signal( Boys[ccode] );
1582 exchange.wait();
1572 `signal( Boys[ccode] );`
1573 `exchange.wait();`
1583 1574 } // if
1584 1575 return BoyPhNo;
…
1606 1597 } else {
1607 1598 GirlPhNo = phNo; // make phone number available
1608 signal_block( Boys[ccode] ); // restart boy
1599 `signal_block( Boys[ccode] );` // restart boy
1609 1600
1610 1601 } // if
…
1655 1646 }
1656 1647 \end{cfa}
1657 must have acquired monitor locks that are greater than or equal to the number of locks for the waiting thread signalled from the front of the condition queue.
1658 In general, the signaller does not know the order of waiting threads, so in general, it must acquire the maximum number of mutex locks for the worst-case waiting thread.
1648 must have acquired monitor locks that are greater than or equal to the number of locks for the waiting thread signalled from the condition queue.
1659 1649
1660 1650 Similarly, for @waitfor( rtn )@, the default semantics is to atomically block the acceptor and release all acquired mutex types in the parameter list, \ie @waitfor( rtn, m1, m2 )@.
1661 1651 To override the implicit multi-monitor wait, specific mutex parameter(s) can be specified, \eg @waitfor( rtn, m1 )@.
1662 Waitfor statically verifies the released monitors are the same as the acquired mutex-parameters of the given routine or routine pointer.
1652 @waitfor@ statically verifies the released monitors are the same as the acquired mutex-parameters of the given routine or routine pointer.
1663 1653 To statically verify the released monitors match with the accepted routine's mutex parameters, the routine (pointer) prototype must be accessible.
1654
1655 When an overloaded routine appears in a @waitfor@ statement, calls to any routine with that name are accepted.
1656 The rationale is that members with the same name should perform a similar function, and therefore, all should be eligible to accept a call.
1657 As always, overloaded routines can be disambiguated using a cast:
1658 \begin{cfa}
1659 void rtn( M & mutex m );
1660 `int` rtn( M & mutex m );
1661 waitfor( (`int` (*)( M & mutex ))rtn, m1, m2 );
1662 \end{cfa}
1663
1664 1665 The ability to release a subset of acquired monitors can result in a \newterm{nested monitor}~\cite{Lister77} deadlock.
1666
1667 1666 void foo( M & mutex m1, M & mutex m2 ) {
1668 1667 ... wait( `e, m1` ); ...
$\C{// release m1, keeping m2 acquired}$
1669 void baz( M & mutex m1, M & mutex m2 ) { $\C{// must acquire m1 and m2}$
1668 void bar( M & mutex m1, M & mutex m2 ) { $\C{// must acquire m1 and m2}$
1670 1669 ... signal( `e` ); ...
1671 1670 \end{cfa}
1672 The @wait@ only releases @m1@ so the signalling thread cannot acquire both @m1@ and @m2@ to enter @baz@ to get to the @signal@.
1671 The @wait@ only releases @m1@ so the signalling thread cannot acquire both @m1@ and @m2@ to enter @bar@ to get to the @signal@.
1673 1672 While deadlock issues can occur with multiple/nesting acquisition, this issue results from the fact that locks, and by extension monitors, are not perfectly composable.
1674 1673
…
1682 1681 For example, there are no loops in either bounded buffer solution in Figure~\ref{f:GenericBoundedBuffer}.
1683 1682 Supporting barging prevention as well as extending internal scheduling to multiple monitors is the main source of complexity in the design and implementation of \CFA concurrency.
1683 Furthermore, \CFA concurrency has no spurious wakeup~\cite[\S~9]{Buhr05a}, which eliminates an implicit form of barging.
1684
1684 1685
…
1755 1755 However, Figure~\ref{f:OtherWaitingThread} shows this solution is complex depending on other waiters, resulting in choices when the signaller finishes the inner mutex-statement.
1756 1756 The signaller can retain @m2@ until completion of the outer mutex statement and pass the locks to waiter W1, or it can pass @m2@ to waiter W2 after completing the inner mutex-statement, while continuing to hold @m1@.
1757 In the latter case, waiter W2 must eventually pass @m2@ to waiter W1, which is complex because W2 may have waited before W1 so it is unaware of W1.
1757 In the latter case, waiter W2 must eventually pass @m2@ to waiter W1, which is complex because W1 may have waited before W2, so W2 is unaware of it.
1758 1758 Furthermore, there is an execution sequence where the signaller always finds waiter W2, and hence, waiter W1 starves.
1759 1759
1760 1760 While a number of approaches were examined~\cite[\S~4.3]{Delisle18}, the solution chosen for \CFA is a novel technique called \newterm{partial signalling}.
1761 Signalled threads are moved to an urgent queue and the waiter at the front defines the set of monitors necessary for it to unblock.
1761 Signalled threads are moved to the urgent queue and the waiter at the front defines the set of monitors necessary for it to unblock.
1762 1762 Partial signalling transfers ownership of monitors to the front waiter.
1763 1763 When the signaller thread exits or waits in the monitor, the front waiter is unblocked if all its monitors are released.
1764 1764 This solution has the benefit that complexity is encapsulated into only two actions: passing monitors to the next owner when they should be released and conditionally waking threads if all conditions are met.
1765 1765
1766 \begin{comment}
1767 Figure~\ref{f:dependency} shows a slightly different example where a third thread is waiting on monitor @A@, using a different condition variable.
1768 Because the third thread is signalled when secretly holding @B@, the goal becomes unreachable.
1769 Depending on the order of signals (listing \ref{f:dependency} line \ref{line:signal-ab} and \ref{line:signal-a}) two cases can happen:
1770
1771 \paragraph{Case 1: thread $\alpha$ goes first.} In this case, the problem is that monitor @A@ needs to be passed to thread $\beta$ when thread $\alpha$ is done with it.
1772 \paragraph{Case 2: thread $\beta$ goes first.} In this case, the problem is that monitor @B@ needs to be retained and passed to thread $\alpha$ along with monitor @A@, which can be done directly or possibly using thread $\beta$ as an intermediate. 1773 \\ 1774 1775 Note that ordering is not determined by a race condition but by whether signalled threads are enqueued in FIFO or FILO order. 1776 However, regardless of the answer, users can move line \ref{line:signal-a} before line \ref{line:signal-ab} and get the reverse effect for listing \ref{f:dependency}. 1777 1778 In both cases, the threads need to be able to distinguish, on a per monitor basis, which ones need to be released and which ones need to be transferred, which means knowing when to release a group becomes complex and inefficient (see next section) and therefore effectively precludes this approach. 1779 1780 1781 \subsubsection{Dependency graphs} 1782 1783 \begin{figure} 1784 \begin{multicols}{3} 1785 Thread $\alpha$ 1786 \begin{cfa}[numbers=left, firstnumber=1] 1787 acquire A 1788 acquire A & B 1789 wait A & B 1790 release A & B 1791 release A 1792 \end{cfa} 1793 \columnbreak 1794 Thread $\gamma$ 1795 \begin{cfa}[numbers=left, firstnumber=6, escapechar=|] 1796 acquire A 1797 acquire A & B 1798 |\label{line:signal-ab}|signal A & B 1799 |\label{line:release-ab}|release A & B 1800 |\label{line:signal-a}|signal A 1801 |\label{line:release-a}|release A 1802 \end{cfa} 1803 \columnbreak 1804 Thread $\beta$ 1805 \begin{cfa}[numbers=left, firstnumber=12, escapechar=|] 1806 acquire A 1807 wait A 1808 |\label{line:release-aa}|release A 1809 \end{cfa} 1810 \end{multicols} 1811 \begin{cfa}[caption={Pseudo-code for the three thread example.},label={f:dependency}] 1812 \end{cfa} 1813 \begin{center} 1814 \input{dependency} 1815 \end{center} 1816 \caption{Dependency graph of the statements in listing \ref{f:dependency}} 1817 \label{fig:dependency} 1818 \end{figure} 1819 1820 In listing \ref{f:int-bulk-cfa}, there is a solution that satisfies both barging prevention and mutual exclusion. 1821 If ownership of both monitors is transferred to the waiter when the signaller releases @A & B@ and then the waiter transfers back ownership of @A@ back to the signaller when it releases it, then the problem is solved (@B@ is no longer in use at this point). 1822 Dynamically finding the correct order is therefore the second possible solution. 1823 The problem is effectively resolving a dependency graph of ownership requirements. 1824 Here even the simplest of code snippets requires two transfers and has a super-linear complexity. 1825 This complexity can be seen in listing \ref{f:explosion}, which is just a direct extension to three monitors, requires at least three ownership transfer and has multiple solutions. 1826 Furthermore, the presence of multiple solutions for ownership transfer can cause deadlock problems if a specific solution is not consistently picked; In the same way that multiple lock acquiring order can cause deadlocks. 
1827 \begin{figure} 1828 \begin{multicols}{2} 1829 \begin{cfa} 1830 acquire A 1831 acquire B 1832 acquire C 1833 wait A & B & C 1834 release C 1835 release B 1836 release A 1837 \end{cfa} 1838 1839 \columnbreak 1840 1841 \begin{cfa} 1842 acquire A 1843 acquire B 1844 acquire C 1845 signal A & B & C 1846 release C 1847 release B 1848 release A 1849 \end{cfa} 1850 \end{multicols} 1851 \begin{cfa}[caption={Extension to three monitors of listing \ref{f:int-bulk-cfa}},label={f:explosion}] 1852 \end{cfa} 1853 \end{figure} 1854 1855 Given the three threads example in listing \ref{f:dependency}, figure \ref{fig:dependency} shows the corresponding dependency graph that results, where every node is a statement of one of the three threads, and the arrows the dependency of that statement (\eg $\alpha1$ must happen before $\alpha2$). 1856 The extra challenge is that this dependency graph is effectively post-mortem, but the runtime system needs to be able to build and solve these graphs as the dependencies unfold. 1857 Resolving dependency graphs being a complex and expensive endeavour, this solution is not the preferred one. 1858 1859 \subsubsection{Partial Signalling} \label{partial-sig} 1860 \end{comment} 1861 1862 1863 \section{External scheduling} \label{extsched} 1864 1865 An alternative to internal scheduling is external scheduling (see Table~\ref{tbl:sched}). 1866 1867 \begin{comment} 1868 \begin{table} 1869 \begin{tabular}{|c|c|c|} 1870 Internal Scheduling & External Scheduling & Go\\ 1871 \hline 1872 \begin{uC++}[tabsize=3] 1873 _Monitor Semaphore { 1874 condition c; 1875 bool inUse; 1876 public: 1877 void P() { 1878 if(inUse) 1879 wait(c); 1880 inUse = true; 1881 } 1882 void V() { 1883 inUse = false; 1884 signal(c); 1885 } 1886 } 1887 \end{uC++}&\begin{uC++}[tabsize=3] 1888 _Monitor Semaphore { 1889 1890 bool inUse; 1891 public: 1892 void P() { 1893 if(inUse) 1894 _Accept(V); 1895 inUse = true; 1896 } 1897 void V() { 1898 inUse = false; 1899 1900 } 1901 } 1902 \end{uC++}&\begin{Go}[tabsize=3] 1903 type MySem struct { 1904 inUse bool 1905 c chan bool 1906 } 1907 1908 // acquire 1909 func (s MySem) P() { 1910 if s.inUse { 1911 select { 1912 case <-s.c: 1913 } 1914 } 1915 s.inUse = true 1916 } 1917 1918 // release 1919 func (s MySem) V() { 1920 s.inUse = false 1921 1922 // This actually deadlocks 1923 // when single thread 1924 s.c <- false 1925 } 1926 \end{Go} 1927 \end{tabular} 1928 \caption{Different forms of scheduling.} 1929 \label{tbl:sched} 1930 \end{table} 1931 \end{comment} 1932 1933 This method is more constrained and explicit, which helps users reduce the non-deterministic nature of concurrency. 1934 Indeed, as the following examples demonstrate, external scheduling allows users to wait for events from other threads without the concern of unrelated events occurring. 1935 External scheduling can generally be done either in terms of control flow (\eg Ada with @accept@, \uC with @_Accept@) or in terms of data (\eg Go with channels). 1936 Of course, both of these paradigms have their own strengths and weaknesses, but for this project, control-flow semantics was chosen to stay consistent with the rest of the languages semantics. 1937 Two challenges specific to \CFA arise when trying to add external scheduling with loose object definitions and multiple-monitor routines. 1938 The previous example shows a simple use @_Accept@ versus @wait@/@signal@ and its advantages. 
1939 Note that while other languages often use @accept@/@select@ as the core external scheduling keyword, \CFA uses @waitfor@ to prevent name collisions with existing socket \textbf{api}s.
1940
1941 For the @P@ member above using internal scheduling, the call to @wait@ only guarantees that @V@ is the last routine to access the monitor, allowing a third routine, say @isInUse()@, acquire mutual exclusion several times while routine @P@ is waiting.
1942 On the other hand, external scheduling guarantees that while routine @P@ is waiting, no other routine than @V@ can acquire the monitor.
1943
1944 % ======================================================================
1945 % ======================================================================
1766 1946
1767 \subsection{Loose Object Definitions}
1947 % ======================================================================
1948 % ======================================================================
1949 In \uC, a monitor class declaration includes an exhaustive list of monitor operations.
1950 Since \CFA is not object oriented, monitors become both more difficult to implement and less clear for a user:
1951
1952 \begin{cfa}
1953 monitor A {};
1954
1955 void f(A & mutex a);
1956 void g(A & mutex a) {
1957 	waitfor(f); // Obvious which f() to wait for
1958 }
1959
1960 void f(A & mutex a, int); // New different F added in scope
1961 void h(A & mutex a) {
1962 	waitfor(f); // Less obvious which f() to wait for
1963 }
1964 \end{cfa}
1965
1966 Furthermore, external scheduling is an example where implementation constraints become visible from the interface.
1967 Here is the cfa-code for the entering phase of a monitor:
1968 \begin{center}
1969 \begin{tabular}{l}
1970 \begin{cfa}
1971 if monitor is free
1972 	enter
1973 elif already own the monitor
1974 	continue
1975 elif monitor accepts me
1976 	enter
1977 else
1978 	block
1979 \end{cfa}
1980 \end{tabular}
1981 \end{center}
1768 \label{s:LooseObjectDefinitions}
1769
1770 In an object-oriented programming-language, a class includes an exhaustive list of operations.
1771 However, new members can be added via static inheritance or dynamic members, \eg JavaScript~\cite{JavaScript}.
1772 Similarly, monitor routines can be added at any time in \CFA, making it less clear for programmers and more difficult to implement.
1773 \begin{cfa}
1774 monitor M {};
1775 void `f`( M & mutex m );
1776 void g( M & mutex m ) { waitfor( `f` ); } $\C{// clear which f}$
1777 void `f`( M & mutex m, int ); $\C{// different f}$
1778 void h( M & mutex m ) { waitfor( `f` ); } $\C{// unclear which f}$
1779 \end{cfa}
1780 Hence, the cfa-code for entering a monitor looks like:
1781 \begin{cfa}
1782 if ( $\textrm{\textit{monitor is free}}$ ) $\LstCommentStyle{// \color{red}enter}$
1783 else if ( $\textrm{\textit{already own monitor}}$ ) $\LstCommentStyle{// \color{red}continue}$
1784 else if ( $\textrm{\textit{monitor accepts me}}$ ) $\LstCommentStyle{// \color{red}enter}$
1785 else $\LstCommentStyle{// \color{red}block}$
1786 \end{cfa}
1787 For the first two conditions, it is easy to implement a check that can evaluate the condition in a few instructions.
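For instance, a minimal sketch of these two checks (the descriptor fields and @active_thread@ helper are hypothetical, shown only to illustrate the fast path):
\begin{cfa}
// hypothetical monitor-descriptor fields: owning thread and recursion counter
struct monitor_desc { struct thread_desc * owner; unsigned int recursion; };
void enter( monitor_desc & this ) with( this ) {
	if ( owner == NULL ) {						// monitor free ?
		owner = active_thread();				// enter
		recursion = 1;
	} else if ( owner == active_thread() ) {	// already own the monitor ?
		recursion += 1;							// continue
	} // otherwise: check accepted routines, else block (elided)
}
\end{cfa}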
1983 However, a fast check for @monitor accepts me@ is much harder to implement depending on the constraints put on the monitors.
1984 Indeed, monitors are often expressed as an entry queue and some acceptor queue as in Figure~\ref{fig:ClassicalMonitor}.
1788 However, a fast check for \emph{monitor accepts me} is much harder to implement depending on the constraints put on the monitors.
1789 Figure~\ref{fig:ClassicalMonitor} shows monitors are often expressed as an entry (calling) queue, some acceptor queues, and an urgent stack/queue.
1985 1790
1986 1791 \begin{figure}
1987 1792 \centering
1988 \subfloat[Classical Monitor] {
1793 \subfloat[Classical monitor] {
1989 1794 \label{fig:ClassicalMonitor}
1990 {\resizebox{0.45\textwidth}{!}{\input{monitor}}}
1795 {\resizebox{0.45\textwidth}{!}{\input{monitor.pstex_t}}}
1991 1796 }% subfloat
1992 \qquad
1797 \quad
1993 \subfloat[bulk acquire Monitor] {
1798 \subfloat[Bulk acquire monitor] {
1994 1799 \label{fig:BulkMonitor}
1995 {\resizebox{0.45\textwidth}{!}{\input{ext_monitor}}}
1800 {\resizebox{0.45\textwidth}{!}{\input{ext_monitor.pstex_t}}}
1996 1801 }% subfloat
1997 \caption{External Scheduling Monitor}
1802 \caption{Monitor Implementation}
1803 \label{f:MonitorImplementation}
1998 1804 \end{figure}
1999 1805
2000 There are other alternatives to these pictures, but in the case of the left picture, implementing a fast accept check is relatively easy.
2001 Restricted to a fixed number of mutex members, N, the accept check reduces to updating a bitmask when the acceptor queue changes, a check that executes in a single instruction even with a fairly large number (\eg 128) of mutex members.
2002 This approach requires a unique dense ordering of routines with an upper-bound and that ordering must be consistent across translation units.
2003 For OO languages these constraints are common, since objects only offer adding member routines consistently across translation units via inheritance.
2004 However, in \CFA users can extend objects with mutex routines that are only visible in certain translation unit.
2005 This means that establishing a program-wide dense-ordering among mutex routines can only be done in the program linking phase, and still could have issues when using dynamically shared objects.
2006
2007 The alternative is to alter the implementation as in Figure~\ref{fig:BulkMonitor}.
2008 Here, the mutex routine called is associated with a thread on the entry queue while a list of acceptable routines is kept separate.
2009 Generating a mask dynamically means that the storage for the mask information can vary between calls to @waitfor@, allowing for more flexibility and extensions.
2010 Storing an array of accepted routine pointers replaces the single instruction bitmask comparison with dereferencing a pointer followed by a linear search.
2011 Furthermore, supporting nested external scheduling (\eg listing \ref{f:nest-ext}) may now require additional searches for the @waitfor@ statement to check if a routine is already queued.
1806 For a fixed (small) number of mutex routines (\eg 128), the accept check reduces to a bitmask of allowed callers, which can be checked with a single instruction.
1807 This approach requires a unique dense ordering of routines with a small upper-bound and the ordering must be consistent across translation units.
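For example, with a dense numbering of mutex members, the accept check might reduce to a single shift and mask (a hypothetical sketch; the names are illustrative, not the runtime's actual representation):
\begin{cfa}
unsigned long long accept_mask;			// one bit per mutex member, set while the member is accepted
bool accepts( unsigned int member_id ) {	// member_id comes from the dense ordering, < 64 here
	return (accept_mask >> member_id) & 1;	// single shift/mask test
}
\end{cfa}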
1808 For object-oriented languages these constraints are common, but \CFA mutex routines can be added in any scope and are only visible in certain translation units, precluding program-wide dense-ordering among mutex routines.
1809
1810 Figure~\ref{fig:BulkMonitor} shows the \CFA monitor implementation.
1811 The mutex routine called is associated with each thread on the entry queue, while a list of acceptable routines is kept separately.
1812 The accepted list is a variable-sized array of accepted routine pointers, so the single instruction bitmask comparison is replaced by dereferencing a pointer followed by a (usually short) linear search.
1813
1814
1815 \subsection{Multi-Monitor Scheduling}
1816 \label{s:Multi-MonitorScheduling}
1817
1818 External scheduling, like internal scheduling, becomes significantly more complex for multi-monitor semantics.
1819 Even in the simplest case, new semantics need to be established.
1820 \begin{cfa}
1821 monitor M {};
1822 void f( M & mutex m1 );
1823 void g( M & mutex m1, M & mutex m2 ) {
1824 	waitfor( f ); $\C{\color{red}// pass m1 or m2 to f?}$
1825 }
1826 \end{cfa}
1827 The solution is for the programmer to disambiguate:
1828 \begin{cfa}
1829 	waitfor( f, m2 ); $\C{\color{red}// wait for call to f with argument m2}$
1830 \end{cfa}
1831 Routine @g@ has acquired both locks, so when routine @f@ is called, the lock for monitor @m2@ is passed from @g@ to @f@, while @g@ still holds lock @m1@.
1832 This behaviour can be extended to the multi-monitor @waitfor@ statement.
1833 \begin{cfa}
1834 monitor M {};
1835 void f( M & mutex m1, M & mutex m2 );
1836 void g( M & mutex m1, M & mutex m2 ) {
1837 	waitfor( f, m1, m2 ); $\C{\color{red}// wait for call to f with arguments m1 and m2}$
1838 }
1839 \end{cfa}
1840 Again, the set of monitors passed to the @waitfor@ statement must be entirely contained in the set of monitors already acquired by the accepting routine.
1841
1842 Note, for internal and external scheduling with multiple monitors, a signalling or accepting thread must match exactly, \ie partial matching results in waiting.
1843 \begin{cquote}
1844 \lstDeleteShortInline@%
1845 \begin{tabular}{@{}l@{\hspace{\parindentlnth}}|@{\hspace{\parindentlnth}}l@{}}
1846 \begin{cfa}
1847 monitor M1 {} m11, m12;
1848 monitor M2 {} m2;
1849 condition c;
1850 void f( M1 & mutex m1, M2 & mutex m2 ) {
1851 	signal( c );
1852 }
1853 void g( M1 & mutex m1, M2 & mutex m2 ) {
1854 	wait( c );
1855 }
1856 g( `m11`, m2 ); // block on accept
1857 f( `m12`, m2 ); // cannot fulfil
1858 \end{cfa}
1859 &
1860 \begin{cfa}
1861 monitor M1 {} m11, m12;
1862 monitor M2 {} m2;
1863
1864 void f( M1 & mutex m1, M2 & mutex m2 ) {
1865
1866 }
1867 void g( M1 & mutex m1, M2 & mutex m2 ) {
1868 	waitfor( f, m1, m2 );
1869 }
1870 g( `m11`, m2 ); // block on accept
1871 f( `m12`, m2 ); // cannot fulfil
1872 \end{cfa}
1873 \end{tabular}
1874 \lstMakeShortInline@%
1875 \end{cquote}
1876 For both internal and external scheduling, the set of monitors is disjoint so unblocking is impossible.
1877
1878
1879 \subsection{Extended \protect\lstinline@waitfor@}
1880
1881 The extended form of the @waitfor@ statement conditionally accepts one of a group of mutex routines and allows a specific action to be performed \emph{after} the mutex routine finishes.
1882 \begin{cfa}
1883 `when` ( $\emph{conditional-expression}$ ) $\C{// optional guard}$
1884 	waitfor( $\emph{mutex-member-name}$ )
1885 		$\emph{statement}$ $\C{// action after call}$
1886 `or` `when` ( $\emph{conditional-expression}$ ) $\C{// optional guard}$
1887 	waitfor( $\emph{mutex-member-name}$ )
1888 		$\emph{statement}$ $\C{// action after call}$
1889 `or` ... $\C{// list of waitfor clauses}$
1890 `when` ( $\emph{conditional-expression}$ ) $\C{// optional guard}$
1891 	`timeout` $\C{// optional terminating timeout clause}$
1892 		$\emph{statement}$ $\C{// action after timeout}$
1893 `when` ( $\emph{conditional-expression}$ ) $\C{// optional guard}$
1894 	`else` $\C{// optional terminating clause}$
1895 		$\emph{statement}$ $\C{// action when no immediate calls}$
1896 \end{cfa}
1897 For a @waitfor@ clause to be executed, its @when@ must be true and an outstanding call to its corresponding member(s) must exist.
1898 The \emph{conditional-expression} of a @when@ may call a routine, but the routine must not block or context switch.
1899 If there are several mutex calls that can be accepted, selection occurs top-to-bottom in the @waitfor@ clauses versus non-deterministically.
1900 If some accept guards are true and there are no outstanding calls to these members, the acceptor is accept-blocked until a call to one of these members is made.
1901 If all the accept guards are false, the statement does nothing, unless there is a terminating @else@ clause with a true guard, which is executed instead.
1902 Hence, the terminating @else@ clause allows a conditional attempt to accept a call without blocking.
1903 If there is a @timeout@ clause, it provides an upper bound on waiting, and can only appear with a conditional @else@, otherwise the timeout cannot be triggered.
1904 In all cases, the statement following a clause is executed \emph{after} that clause is triggered, so the program knows which of the clauses executed.
1905
1906 Note, a group of conditional @waitfor@ clauses is \emph{not} the same as a group of @if@ statements, \eg:
1907 \begin{cfa}
1908 if ( C1 ) waitfor( mem1 );					when ( C1 ) waitfor( mem1 );
1909 else if ( C2 ) waitfor( mem2 );				or when ( C2 ) waitfor( mem2 );
1910 \end{cfa}
1911 The left example accepts only @mem1@ if @C1@ is true or only @mem2@ if @C2@ is true.
1912 The right example accepts either @mem1@ or @mem2@ if @C1@ and @C2@ are true.
1913
1914 An interesting use of @waitfor@ is accepting the @mutex@ destructor to know when an object is deallocated.
1915 \begin{cfa}
1916 void insert( Buffer(T) & mutex buffer, T elem ) with( buffer ) {
1917 	if ( count == BufferSize )
1918 		waitfor( remove, buffer ) {
1919 			elements[back] = elem;
1920 			back = ( back + 1 ) % BufferSize;
1921 			count += 1;
1922 		} or `waitfor( ^?{}, buffer )` throw insertFail;
1923 }
1924 \end{cfa}
1925 When the buffer is deallocated, the current waiter is unblocked and informed, so it can perform an appropriate action.
1926 However, the basic @waitfor@ semantics do not support this functionality, since using an object after its destructor is called is undefined.
1927 Therefore, to make this useful capability work, the semantics for accepting the destructor is the same as @signal@, \ie the call to the destructor is placed on the urgent queue and the acceptor continues execution, which throws an exception to the acceptor and then the caller is unblocked from the urgent queue to deallocate the object.
1928 Accepting the destructor is an idiomatic way to terminate a thread in \CFA.
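As a further sketch, the extended @waitfor@ composes a guard, multiple accept clauses, and a timeout (the monitor and routine names here are hypothetical):
\begin{cfa}
monitor M {};
void f( M & mutex m );
void g( M & mutex m );
void dispatch( M & mutex m, bool important ) {
	when ( important ) waitfor( f, m ) {	// accept f only when important is true
		sout | "f accepted" | endl;
	} or waitfor( g, m ) {					// g is always acceptable
		sout | "g accepted" | endl;
	} or timeout( 10`s ) {					// upper bound on waiting
		sout | "no call within 10 seconds" | endl;
	}
}
\end{cfa}
Because selection is top-to-bottom, an outstanding call to @f@ is preferred over one to @g@ when both guards hold.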
1929 1930 1931 \subsection{\protect\lstinline@mutex@ Threads} 1932 1933 Threads in \CFA are monitors to allow direct communication among threads, \ie threads can have mutex routines that are called by other threads. 1934 Hence, all monitor features are available when using threads. 1935 Figure~\ref{f:pingpong} shows an example of two threads directly calling each other and accepting calls from each other in a cycle. 1936 Note, both ping/pong threads are globally declared, @pi@/@po@, and hence, start (and possibly complete) before the program main starts. 2012 1937 2013 1938 \begin{figure} 2014 \begin{cfa}[caption={Example of nested external scheduling},label={f:nest-ext}] 2015 monitor M {}; 2016 void foo( M & mutex a ) {} 2017 void bar( M & mutex b ) { 2018 // Nested in the waitfor(bar, c) call 2019 waitfor(foo, b); 2020 } 2021 void baz( M & mutex c ) { 2022 waitfor(bar, c); 2023 } 2024 2025 \end{cfa} 1939 \lstDeleteShortInline@% 1940 \begin{cquote} 1941 \begin{cfa} 1942 thread Ping {} pi; 1943 thread Pong {} po; 1944 void ping( Ping & mutex ) {} 1945 void pong( Pong & mutex ) {} 1946 int main() {} 1947 \end{cfa} 1948 \begin{tabular}{@{}l@{\hspace{3\parindentlnth}}l@{}} 1949 \begin{cfa} 1950 void main( Ping & pi ) { 1951 for ( int i = 0; i < 10; i += 1 ) { 1952 `waitfor( ping, pi );` 1953 `pong( po );` 1954 } 1955 } 1956 \end{cfa} 1957 & 1958 \begin{cfa} 1959 void main( Pong & po ) { 1960 for ( int i = 0; i < 10; i += 1 ) { 1961 `ping( pi );` 1962 `waitfor( pong, po );` 1963 } 1964 } 1965 \end{cfa} 1966 \end{tabular} 1967 \lstMakeShortInline@% 1968 \end{cquote} 1969 \caption{Threads ping/pong using external scheduling} 1970 \label{f:pingpong} 2026 1971 \end{figure} 2027 1972 2028 Note that in the right picture, tasks need to always keep track of the monitors associated with mutex routines, and the routine mask needs to have both a routine pointer and a set of monitors, as is discussed in the next section. 2029 These details are omitted from the picture for the sake of simplicity. 2030 2031 At this point, a decision must be made between flexibility and performance. 2032 Many design decisions in \CFA achieve both flexibility and performance, for example polymorphic routines add significant flexibility but inlining them means the optimizer can easily remove any runtime cost. 2033 Here, however, the cost of flexibility cannot be trivially removed. 2034 In the end, the most flexible approach has been chosen since it allows users to write programs that would otherwise be hard to write. 2035 This decision is based on the assumption that writing fast but inflexible locks is closer to a solved problem than writing locks that are as flexible as external scheduling in \CFA. 2036 2037 % ====================================================================== 2038 % ====================================================================== 2039 \subsection{Multi-Monitor Scheduling} 2040 % ====================================================================== 2041 % ====================================================================== 2042 2043 External scheduling, like internal scheduling, becomes significantly more complex when introducing multi-monitor syntax. 
2044 Even in the simplest possible case, some new semantics needs to be established: 2045 \begin{cfa} 2046 monitor M {}; 2047 2048 void f(M & mutex a); 2049 2050 void g(M & mutex b, M & mutex c) { 2051 waitfor(f); // two monitors M => unknown which to pass to f(M & mutex) 2052 } 2053 \end{cfa} 2054 The obvious solution is to specify the correct monitor as follows: 2055 2056 \begin{cfa} 2057 monitor M {}; 2058 2059 void f(M & mutex a); 2060 2061 void g(M & mutex a, M & mutex b) { 2062 // wait for call to f with argument b 2063 waitfor(f, b); 2064 } 2065 \end{cfa} 2066 This syntax is unambiguous. 2067 Both locks are acquired and kept by @g@. 2068 When routine @f@ is called, the lock for monitor @b@ is temporarily transferred from @g@ to @f@ (while @g@ still holds lock @a@). 2069 This behaviour can be extended to the multi-monitor @waitfor@ statement as follows. 2070 2071 \begin{cfa} 2072 monitor M {}; 2073 2074 void f(M & mutex a, M & mutex b); 2075 2076 void g(M & mutex a, M & mutex b) { 2077 // wait for call to f with arguments a and b 2078 waitfor(f, a, b); 2079 } 2080 \end{cfa} 2081 2082 Note that the set of monitors passed to the @waitfor@ statement must be entirely contained in the set of monitors already acquired in the routine. @waitfor@ used in any other context is undefined behaviour. 2083 2084 An important behaviour to note is when a set of monitors only match partially: 2085 2086 \begin{cfa} 2087 mutex struct A {}; 2088 2089 mutex struct B {}; 2090 2091 void g(A & mutex a, B & mutex b) { 2092 waitfor(f, a, b); 2093 } 2094 2095 A a1, a2; 2096 B b; 2097 2098 void foo() { 2099 g(a1, b); // block on accept 2100 } 2101 2102 void bar() { 2103 f(a2, b); // fulfill cooperation 2104 } 2105 \end{cfa} 2106 While the equivalent can happen when using internal scheduling, the fact that conditions are specific to a set of monitors means that users have to use two different condition variables. 2107 In both cases, partially matching monitor sets does not wakeup the waiting thread. 2108 It is also important to note that in the case of external scheduling the order of parameters is irrelevant; @waitfor(f,a,b)@ and @waitfor(f,b,a)@ are indistinguishable waiting condition. 2109 2110 % ====================================================================== 2111 % ====================================================================== 2112 \subsection{\protect\lstinline|waitfor| Semantics} 2113 % ====================================================================== 2114 % ====================================================================== 2115 2116 Syntactically, the @waitfor@ statement takes a routine identifier and a set of monitors. 2117 While the set of monitors can be any list of expressions, the routine name is more restricted because the compiler validates at compile time the validity of the routine type and the parameters used with the @waitfor@ statement. 2118 It checks that the set of monitors passed in matches the requirements for a routine call. 2119 Figure~\ref{f:waitfor} shows various usages of the waitfor statement and which are acceptable. 2120 The choice of the routine type is made ignoring any non-@mutex@ parameter. 2121 One limitation of the current implementation is that it does not handle overloading, but overloading is possible. 1973 \section{Parallelism} 1974 1975 Historically, computer performance was about processor speeds. 
1976 However, with heat dissipation being a direct consequence of speed increase, parallelism has become the new source for increased performance~\cite{Sutter05, Sutter05b}.
1977 Now, high-performance applications must care about parallelism, which requires concurrency.
1978 The lowest-level approach to parallelism is to use \newterm{kernel threads} in combination with semantics like @fork@, @join@, \etc.
1979 However, kernel threads are better as an implementation tool because of complexity and higher cost.
1980 Therefore, different abstractions are often layered onto kernel threads to simplify them, \eg pthreads.
1981
1982
1983 \subsection{User Threads with Preemption}
1984
1985 A direct improvement on kernel threads is user threads, \eg Erlang~\cite{Erlang} and \uC~\cite{uC++book}.
1986 This approach provides an interface that matches the language paradigms, more control over concurrency by the language runtime, and an abstract (and portable) interface to the underlying kernel threads across operating systems.
1987 In many cases, user threads can be used on a much larger scale (100,000 threads).
1988 Like kernel threads, user threads support preemption, which maximizes nondeterminism, but introduces the same concurrency errors: race, livelock, starvation, and deadlock.
1989 \CFA adopts user-threads as they represent the truest realization of concurrency and can build any other concurrency approach, \eg thread pools and actors~\cite{Actors}.
1990
1991
1992 \subsection{User Threads without Preemption (Fiber)}
1993 \label{s:fibers}
1994
1995 A variant of user thread is \newterm{fibers}, which removes preemption, \eg Go~\cite{Go}.
1996 Like functional programming, which removes mutation and its associated problems, removing preemption from concurrency reduces nondeterminism, hence race and deadlock errors are more difficult to generate.
1997 However, preemption is necessary for concurrency that relies on spinning, so there is a class of problems that cannot be programmed without preemption.
1998
1999
2000 \subsection{Thread Pools}
2001
2002 In contrast to direct threading are indirect \newterm{thread pools}, where small jobs (work units) are inserted into a work pool for execution.
2003 If the jobs are dependent, \ie interact, there is an implicit/explicit dependency graph that ties them together.
2004 While removing direct concurrency, and hence the amount of context switching, thread pools significantly limit the interaction that can occur among jobs.
2005 Indeed, jobs should not block because that also blocks the underlying thread, which effectively means the CPU utilization, and therefore throughput, suffers.
2006 While it is possible to tune the thread pool with sufficient threads, it becomes difficult to obtain high throughput and good core utilization as job interaction increases.
2007 As well, concurrency errors return, which thread pools are supposed to mitigate.
2008 Intel's TBB library~\cite{TBB} is the gold standard for thread pools.
2009
2010
2011 \section{\protect\CFA Runtime Structure}
2012
2013 Figure~\ref{f:RunTimeStructure} illustrates the runtime structure of a \CFA program.
2014 In addition to the new kinds of objects introduced by \CFA, there are two more runtime entities used to control parallel execution: cluster and (virtual) processor.
2015 An executing thread is illustrated by its containment in a processor.
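A hypothetical declarative sketch, assuming cluster and processor objects follow \CFA's RAII style (the constructor parameter is illustrative, not the defined API):
\begin{cfa}
cluster clus;						// hypothetical: create an additional cluster
processor p1( clus ), p2( clus );	// hypothetical: two virtual processors executing clus's threads
// threads created on clus would then be scheduled across p1 and p2
\end{cfa}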
2016 2122 2017 \begin{figure} 2123 \begin{cfa}[caption={Various correct and incorrect uses of the waitfor statement},label={f:waitfor}] 2124 monitor A{}; 2125 monitor B{}; 2126 2127 void f1( A & mutex ); 2128 void f2( A & mutex, B & mutex ); 2129 void f3( A & mutex, int ); 2130 void f4( A & mutex, int ); 2131 void f4( A & mutex, double ); 2132 2133 void foo( A & mutex a1, A & mutex a2, B & mutex b1, B & b2 ) { 2134 A * ap = & a1; 2135 void (*fp)( A & mutex ) = f1; 2136 2137 waitfor(f1, a1); // Correct : 1 monitor case 2138 waitfor(f2, a1, b1); // Correct : 2 monitor case 2139 waitfor(f3, a1); // Correct : non-mutex arguments are ignored 2140 waitfor(f1, *ap); // Correct : expression as argument 2141 2142 waitfor(f1, a1, b1); // Incorrect : Too many mutex arguments 2143 waitfor(f2, a1); // Incorrect : Too few mutex arguments 2144 waitfor(f2, a1, a2); // Incorrect : Mutex arguments don't match 2145 waitfor(f1, 1); // Incorrect : 1 not a mutex argument 2146 waitfor(f9, a1); // Incorrect : f9 routine does not exist 2147 waitfor(*fp, a1 ); // Incorrect : fp not an identifier 2148 waitfor(f4, a1); // Incorrect : f4 ambiguous 2149 2150 waitfor(f2, a1, b2); // Undefined behaviour : b2 not mutex 2151 } 2152 \end{cfa} 2018 \centering 2019 \input{RunTimeStructure} 2020 \caption{\CFA Runtime Structure} 2021 \label{f:RunTimeStructure} 2153 2022 \end{figure} 2154 2023 2155 Finally, for added flexibility, \CFA supports constructing a complex @waitfor@ statement using the @or@, @timeout@ and @else@. 2156 Indeed, multiple @waitfor@ clauses can be chained together using @or@; this chain forms a single statement that uses baton pass to any routine that fits one of the routine+monitor set passed in. 2157 To enable users to tell which accepted routine executed, @waitfor@s are followed by a statement (including the null statement @;@) or a compound statement, which is executed after the clause is triggered. 2158 A @waitfor@ chain can also be followed by a @timeout@, to signify an upper bound on the wait, or an @else@, to signify that the call should be non-blocking, which checks for a matching routine call already arrived and otherwise continues. 2159 Any and all of these clauses can be preceded by a @when@ condition to dynamically toggle the accept clauses on or off based on some current state. 2160 Figure~\ref{f:waitfor2} demonstrates several complex masks and some incorrect ones. 
2161 2162 \begin{figure} 2163 \lstset{language=CFA,deletedelim=**[is][]{`}{`}} 2164 \begin{cfa} 2165 monitor A{}; 2166 2167 void f1( A & mutex ); 2168 void f2( A & mutex ); 2169 2170 void foo( A & mutex a, bool b, int t ) { 2171 waitfor(f1, a); $\C{// Correct : blocking case}$ 2172 2173 waitfor(f1, a) { $\C{// Correct : block with statement}$ 2174 sout | "f1" | endl; 2175 } 2176 waitfor(f1, a) { $\C{// Correct : block waiting for f1 or f2}$ 2177 sout | "f1" | endl; 2178 } or waitfor(f2, a) { 2179 sout | "f2" | endl; 2180 } 2181 waitfor(f1, a); or else; $\C{// Correct : non-blocking case}$ 2182 2183 waitfor(f1, a) { $\C{// Correct : non-blocking case}$ 2184 sout | "blocked" | endl; 2185 } or else { 2186 sout | "didn't block" | endl; 2187 } 2188 waitfor(f1, a) { $\C{// Correct : block at most 10 seconds}$ 2189 sout | "blocked" | endl; 2190 } or timeout( 10`s) { 2191 sout | "didn't block" | endl; 2192 } 2193 // Correct : block only if b == true if b == false, don't even make the call 2194 when(b) waitfor(f1, a); 2195 2196 // Correct : block only if b == true if b == false, make non-blocking call 2197 waitfor(f1, a); or when(!b) else; 2198 2199 // Correct : block only of t > 1 2200 waitfor(f1, a); or when(t > 1) timeout(t); or else; 2201 2202 // Incorrect : timeout clause is dead code 2203 waitfor(f1, a); or timeout(t); or else; 2204 2205 // Incorrect : order must be waitfor [or waitfor... [or timeout] [or else]] 2206 timeout(t); or waitfor(f1, a); or else; 2207 } 2208 \end{cfa} 2209 \caption{Correct and incorrect uses of the or, else, and timeout clause around a waitfor statement} 2210 \label{f:waitfor2} 2211 \end{figure} 2212 2213 % ====================================================================== 2214 % ====================================================================== 2215 \subsection{Waiting For The Destructor} 2216 % ====================================================================== 2217 % ====================================================================== 2218 An interesting use for the @waitfor@ statement is destructor semantics. 2219 Indeed, the @waitfor@ statement can accept any @mutex@ routine, which includes the destructor (see section \ref{data}). 2220 However, with the semantics discussed until now, waiting for the destructor does not make any sense, since using an object after its destructor is called is undefined behaviour. 2221 The simplest approach is to disallow @waitfor@ on a destructor. 2222 However, a more expressive approach is to flip ordering of execution when waiting for the destructor, meaning that waiting for the destructor allows the destructor to run after the current @mutex@ routine, similarly to how a condition is signalled. 2223 \begin{figure} 2224 \begin{cfa}[caption={Example of an executor which executes action in series until the destructor is called.},label={f:dtor-order}] 2225 monitor Executer {}; 2226 struct Action; 2227 2228 void ^?{} (Executer & mutex this); 2229 void execute(Executer & mutex this, const Action & ); 2230 void run (Executer & mutex this) { 2231 while(true) { 2232 waitfor(execute, this); 2233 or waitfor(^?{} , this) { 2234 break; 2235 } 2236 } 2237 } 2238 \end{cfa} 2239 \end{figure} 2240 For example, listing \ref{f:dtor-order} shows an example of an executor with an infinite loop, which waits for the destructor to break out of this loop. 2241 Switching the semantic meaning introduces an idiomatic way to terminate a task and/or wait for its termination via destruction. 
2242 2243 2244 % ###### # ###### # # # ####### # ### ##### # # 2245 % # # # # # # # # # # # # # # # ## ## 2246 % # # # # # # # # # # # # # # # # # # 2247 % ###### # # ###### # # # # ##### # # ##### # # # 2248 % # ####### # # ####### # # # # # # # # 2249 % # # # # # # # # # # # # # # # # 2250 % # # # # # # # ####### ####### ####### ####### ### ##### # # 2251 \section{Parallelism} 2252 Historically, computer performance was about processor speeds and instruction counts. 2253 However, with heat dissipation being a direct consequence of speed increase, parallelism has become the new source for increased performance~\cite{Sutter05, Sutter05b}. 2254 In this decade, it is no longer reasonable to create a high-performance application without caring about parallelism. 2255 Indeed, parallelism is an important aspect of performance and more specifically throughput and hardware utilization. 2256 The lowest-level approach of parallelism is to use \textbf{kthread} in combination with semantics like @fork@, @join@, \etc. 2257 However, since these have significant costs and limitations, \textbf{kthread} are now mostly used as an implementation tool rather than a user oriented one. 2258 There are several alternatives to solve these issues that all have strengths and weaknesses. 2259 While there are many variations of the presented paradigms, most of these variations do not actually change the guarantees or the semantics, they simply move costs in order to achieve better performance for certain workloads. 2260 2261 \section{Paradigms} 2262 \subsection{User-Level Threads} 2263 A direct improvement on the \textbf{kthread} approach is to use \textbf{uthread}. 2264 These threads offer most of the same features that the operating system already provides but can be used on a much larger scale. 2265 This approach is the most powerful solution as it allows all the features of multithreading, while removing several of the more expensive costs of kernel threads. 2266 The downside is that almost none of the low-level threading problems are hidden; users still have to think about data races, deadlocks and synchronization issues. 2267 These issues can be somewhat alleviated by a concurrency toolkit with strong guarantees, but the parallelism toolkit offers very little to reduce complexity in itself. 2268 2269 Examples of languages that support \textbf{uthread} are Erlang~\cite{Erlang} and \uC~\cite{uC++book}. 2270 2271 \subsection{Fibers : User-Level Threads Without Preemption} \label{fibers} 2272 A popular variant of \textbf{uthread} is what is often referred to as \textbf{fiber}. 2273 However, \textbf{fiber} do not present meaningful semantic differences with \textbf{uthread}. 2274 The significant difference between \textbf{uthread} and \textbf{fiber} is the lack of \textbf{preemption} in the latter. 2275 Advocates of \textbf{fiber} list their high performance and ease of implementation as major strengths, but the performance difference between \textbf{uthread} and \textbf{fiber} is controversial, and the ease of implementation, while true, is a weak argument in the context of language design. 2276 Therefore this proposal largely ignores fibers. 2277 2278 An example of a language that uses fibers is Go~\cite{Go} 2279 2280 \subsection{Jobs and Thread Pools} 2281 An approach on the opposite end of the spectrum is to base parallelism on \textbf{pool}. 2282 Indeed, \textbf{pool} offer limited flexibility but at the benefit of a simpler user interface. 
2283 In \textbf{pool} based systems, users express parallelism as units of work, called jobs, and a dependency graph (either explicit or implicit) that ties them together. 2284 This approach means users need not worry about concurrency but significantly limit the interaction that can occur among jobs. 2285 Indeed, any \textbf{job} that blocks also block the underlying worker, which effectively means the CPU utilization, and therefore throughput, suffers noticeably. 2286 It can be argued that a solution to this problem is to use more workers than available cores. 2287 However, unless the number of jobs and the number of workers are comparable, having a significant number of blocked jobs always results in idles cores. 2288 2289 The gold standard of this implementation is Intel's TBB library~\cite{TBB}. 2290 2291 \subsection{Paradigm Performance} 2292 While the choice between the three paradigms listed above may have significant performance implications, it is difficult to pin down the performance implications of choosing a model at the language level. 2293 Indeed, in many situations one of these paradigms may show better performance but it all strongly depends on the workload. 2294 Having a large amount of mostly independent units of work to execute almost guarantees equivalent performance across paradigms and that the \textbf{pool}-based system has the best efficiency thanks to the lower memory overhead (\ie no thread stack per job). 2295 However, interactions among jobs can easily exacerbate contention. 2296 User-level threads allow fine-grain context switching, which results in better resource utilization, but a context switch is more expensive and the extra control means users need to tweak more variables to get the desired performance. 2297 Finally, if the units of uninterrupted work are large, enough the paradigm choice is largely amortized by the actual work done. 2298 2299 \section{The \protect\CFA\ Kernel : Processors, Clusters and Threads}\label{kernel} 2300 A \textbf{cfacluster} is a group of \textbf{kthread} executed in isolation. \textbf{uthread} are scheduled on the \textbf{kthread} of a given \textbf{cfacluster}, allowing organization between \textbf{uthread} and \textbf{kthread}. 2301 It is important that \textbf{kthread} belonging to a same \textbf{cfacluster} have homogeneous settings, otherwise migrating a \textbf{uthread} from one \textbf{kthread} to the other can cause issues. 2302 A \textbf{cfacluster} also offers a pluggable scheduler that can optimize the workload generated by the \textbf{uthread}. 2303 2304 \textbf{cfacluster} have not been fully implemented in the context of this paper. 2305 Currently \CFA only supports one \textbf{cfacluster}, the initial one. 2306 2307 \subsection{Future Work: Machine Setup}\label{machine} 2308 While this was not done in the context of this paper, another important aspect of clusters is affinity. 2309 While many common desktop and laptop PCs have homogeneous CPUs, other devices often have more heterogeneous setups. 2310 For example, a system using \textbf{numa} configurations may benefit from users being able to tie clusters and/or kernel threads to certain CPU cores. 2311 OS support for CPU affinity is now common~\cite{affinityLinux, affinityWindows, affinityFreebsd, affinityNetbsd, affinityMacosx}, which means it is both possible and desirable for \CFA to offer an abstraction mechanism for portable CPU affinity. 
2312 2313 \subsection{Paradigms}\label{cfaparadigms} 2314 Given these building blocks, it is possible to reproduce all three of the popular paradigms. 2315 Indeed, \textbf{uthread} is the default paradigm in \CFA. 2316 However, disabling \textbf{preemption} on a cluster means threads effectively become fibers. 2317 Since several \textbf{cfacluster} with different scheduling policy can coexist in the same application, this allows \textbf{fiber} and \textbf{uthread} to coexist in the runtime of an application. 2318 Finally, it is possible to build executors for thread pools from \textbf{uthread} or \textbf{fiber}, which includes specialized jobs like actors~\cite{Actors}. 2319 2320 2321 2322 \section{Behind the Scenes} 2323 There are several challenges specific to \CFA when implementing concurrency. 2324 These challenges are a direct result of bulk acquire and loose object definitions. 2325 These two constraints are the root cause of most design decisions in the implementation. 2326 Furthermore, to avoid contention from dynamically allocating memory in a concurrent environment, the internal-scheduling design is (almost) entirely free of mallocs. 2327 This approach avoids the chicken and egg problem~\cite{Chicken} of having a memory allocator that relies on the threading system and a threading system that relies on the runtime. 2328 This extra goal means that memory management is a constant concern in the design of the system. 2329 2330 The main memory concern for concurrency is queues. 2331 All blocking operations are made by parking threads onto queues and all queues are designed with intrusive nodes, where each node has pre-allocated link fields for chaining, to avoid the need for memory allocation. 2332 Since several concurrency operations can use an unbound amount of memory (depending on bulk acquire), statically defining information in the intrusive fields of threads is insufficient.The only way to use a variable amount of memory without requiring memory allocation is to pre-allocate large buffers of memory eagerly and store the information in these buffers. 2333 Conveniently, the call stack fits that description and is easy to use, which is why it is used heavily in the implementation of internal scheduling, particularly variable-length arrays. 2334 Since stack allocation is based on scopes, the first step of the implementation is to identify the scopes that are available to store the information, and which of these can have a variable-length array. 2335 The threads and the condition both have a fixed amount of memory, while @mutex@ routines and blocking calls allow for an unbound amount, within the stack size. 2336 2337 Note that since the major contributions of this paper are extending monitor semantics to bulk acquire and loose object definitions, any challenges that are not resulting of these characteristics of \CFA are considered as solved problems and therefore not discussed. 2338 2339 % ====================================================================== 2340 % ====================================================================== 2341 \section{Mutex Routines} 2342 % ====================================================================== 2343 % ====================================================================== 2344 2345 The first step towards the monitor implementation is simple @mutex@ routines. 2346 In the single monitor case, mutual-exclusion is done using the entry/exit procedure in listing \ref{f:entry1}. 
2347 The entry/exit procedures do not have to be extended to support multiple monitors. 2348 Indeed it is sufficient to enter/leave monitors one-by-one as long as the order is correct to prevent deadlock~\cite{Havender68}. 2349 In \CFA, ordering of monitor acquisition relies on memory ordering. 2350 This approach is sufficient because all objects are guaranteed to have distinct non-overlapping memory layouts and mutual-exclusion for a monitor is only defined for its lifetime, meaning that destroying a monitor while it is acquired is undefined behaviour. 2351 When a mutex call is made, the concerned monitors are aggregated into a variable-length pointer array and sorted based on pointer values. 2024 2025 \subsection{Cluster} 2026 \label{s:RuntimeStructureCluster} 2027 2028 A \newterm{cluster} is a collection of threads and virtual processors (abstract kernel-thread) that execute the threads (like a virtual machine). 2029 The purpose of a cluster is to control the amount of parallelism that is possible among threads, plus scheduling and other execution defaults. 2030 The default cluster-scheduler is single-queue multi-server, which provides automatic load-balancing of threads on processors. 2031 However, the scheduler is pluggable, supporting alternative schedulers. 2032 If several clusters exist, both threads and virtual processors, can be explicitly migrated from one cluster to another. 2033 No automatic load balancing among clusters is performed by \CFA. 2034 2035 When a \CFA program begins execution, it creates a user cluster with a single processor and a special processor to handle preemption that does not execute user threads. 2036 The user cluster is created to contain the application user-threads. 2037 Having all threads execute on the one cluster often maximizes utilization of processors, which minimizes runtime. 2038 However, because of limitations of the underlying operating system, heterogeneous hardware, or scheduling requirements (real-time), it is sometimes necessary to have multiple clusters. 2039 2040 2041 \subsection{Virtual Processor} 2042 \label{s:RuntimeStructureProcessor} 2043 2044 A virtual processor is implemented by a kernel thread (\eg UNIX process), which is subsequently scheduled for execution on a hardware processor by the underlying operating system. 2045 Programs may use more virtual processors than hardware processors. 2046 On a multiprocessor, kernel threads are distributed across the hardware processors resulting in virtual processors executing in parallel. 2047 (It is possible to use affinity to lock a virtual processor onto a particular hardware processor~\cite{affinityLinux, affinityWindows, affinityFreebsd, affinityNetbsd, affinityMacosx}, which is used when caching issues occur or for heterogeneous hardware processors.) 2048 The \CFA runtime attempts to block unused processors and unblock processors as the system load increases; 2049 balancing the workload with processors is difficult. 2050 Preemption occurs on virtual processors rather than user threads, via operating-system interrupts. 2051 Thus virtual processors execute user threads, where preemption frequency applies to a virtual processor, so preemption occurs randomly across the executed user threads. 2052 Turning off preemption transforms user threads into fibers. 2053 2054 2055 \subsection{Debug Kernel} 2056 2057 There are two versions of the \CFA runtime kernel: debug and non-debug. 
2058 The debugging version has many runtime checks and internal assertions, \eg stack (non-writable) guard page, and checks for stack overflow whenever context switches occur among coroutines and threads, which catches most stack overflows.
2059 After a program is debugged, the non-debugging version can be used to decrease space and increase performance.
2060
2061
2062 \section{Implementation}
2063
2064 Currently, \CFA has fixed-sized stacks, where the stack size can be set at coroutine/thread creation but with no subsequent growth.
2065 Schemes exist for dynamic stack-growth, such as stack copying and chained stacks.
2066 However, stack copying requires pointer adjustment to items on the stack, which is impossible without some form of garbage collection.
2067 As well, chained stacks require all modules be recompiled to use this feature, which breaks backward compatibility with existing C libraries.
2068 In the long term, it is likely C libraries will migrate to stack chaining to support concurrency, at only a minimal cost to sequential programs.
2069 Nevertheless, experience teaching \uC~\cite{CS343} shows fixed-sized stacks are rarely an issue in most concurrent programs.
2070
2071 A primary implementation challenge is avoiding contention from dynamically allocating memory because of bulk acquire, \eg the internal-scheduling design is (almost) free of allocations.
2072 All blocking operations are made by parking threads onto queues; therefore, all queues are designed with intrusive nodes, where each node has preallocated link fields for chaining.
2073 Furthermore, several bulk-acquire operations need a variable amount of memory.
2074 This storage is allocated at the base of a thread's stack before blocking, which means programmers must add a small amount of extra space for stacks.
2075
2076 In \CFA, ordering of monitor acquisition relies on memory ordering to prevent deadlock~\cite{Havender68}, because all objects are guaranteed to have distinct non-overlapping memory layouts, and mutual-exclusion for a monitor is only defined for its lifetime.
2077 When a mutex call is made, pointers to the concerned monitors are aggregated into a variable-length array and sorted.
2352 2078 This array persists for the entire duration of the mutual-exclusion and its ordering is reused extensively.
2384 It is important to present the difference between the two acquiring options: \textbf{callsite-locking} and entry-point locking, \ie acquiring the monitors before making a mutex routine-call or as the first operation of the mutex routine-call. 2385 For example: 2386 \begin{table} 2387 \begin{center} 2388 \begin{tabular}{|c|c|c|} 2389 Mutex & \textbf{callsite-locking} & \textbf{entry-point-locking} \\ 2390 call & cfa-code & cfa-code \\ 2391 \hline 2392 \begin{cfa}[tabsize=3] 2393 void foo(monitor& mutex a){ 2394 2395 // Do Work 2396 //... 2397 2398 } 2399 2400 void main() { 2401 monitor a; 2402 2403 foo(a); 2404 2405 } 2406 \end{cfa} & \begin{cfa}[tabsize=3] 2407 foo(& a) { 2408 2409 // Do Work 2410 //... 2411 2412 } 2413 2414 main() { 2415 monitor a; 2416 acquire(a); 2417 foo(a); 2418 release(a); 2419 } 2420 \end{cfa} & \begin{cfa}[tabsize=3] 2421 foo(& a) { 2422 acquire(a); 2423 // Do Work 2424 //... 2425 release(a); 2426 } 2427 2428 main() { 2429 monitor a; 2430 2431 foo(a); 2432 2433 } 2434 \end{cfa} 2435 \end{tabular} 2436 \end{center} 2437 \caption{Call-site vs entry-point locking for mutex calls} 2438 \label{tbl:locking-site} 2439 \end{table} 2440 2441 Note the @mutex@ keyword relies on the type system, which means that in cases where a generic monitor-routine is desired, writing the mutex routine is possible with the proper trait, \eg: 2442 \begin{cfa} 2443 // Incorrect: T may not be monitor 2444 forall(dtype T) 2445 void foo(T * mutex t); 2446 2447 // Correct: this routine only works on monitors (any monitor) 2448 forall(dtype T | is_monitor(T)) 2449 void bar(T * mutex t)); 2450 \end{cfa} 2451 2452 Both entry point and \textbf{callsite-locking} are feasible implementations. 2453 The current \CFA implementation uses entry-point locking because it requires less work when using \textbf{raii}, effectively transferring the burden of implementation to object construction/destruction. 2454 It is harder to use \textbf{raii} for call-site locking, as it does not necessarily have an existing scope that matches exactly the scope of the mutual exclusion, \ie the routine body. 2455 For example, the monitor call can appear in the middle of an expression. 2456 Furthermore, entry-point locking requires less code generation since any useful routine is called multiple times but there is only one entry point for many call sites. 2457 2458 % ====================================================================== 2459 % ====================================================================== 2460 \section{Threading} \label{impl:thread} 2461 % ====================================================================== 2462 % ====================================================================== 2463 2464 Figure \ref{fig:system1} shows a high-level picture if the \CFA runtime system in regards to concurrency. 2465 Each component of the picture is explained in detail in the flowing sections. 2466 2467 \begin{figure} 2468 \begin{center} 2469 {\resizebox{\textwidth}{!}{\input{system.pstex_t}}} 2470 \end{center} 2471 \caption{Overview of the entire system} 2472 \label{fig:system1} 2473 \end{figure} 2474 2475 \subsection{Processors} 2476 Parallelism in \CFA is built around using processors to specify how much parallelism is desired. \CFA processors are object wrappers around kernel threads, specifically @pthread@s in the current implementation of \CFA. 2477 Indeed, any parallelism must go through operating-system libraries. 
2478 However, \textbf{uthread} are still the main source of concurrency, processors are simply the underlying source of parallelism. 2479 Indeed, processor \textbf{kthread} simply fetch a \textbf{uthread} from the scheduler and run it; they are effectively executers for user-threads. 2480 The main benefit of this approach is that it offers a well-defined boundary between kernel code and user code, for example, kernel thread quiescing, scheduling and interrupt handling. 2481 Processors internally use coroutines to take advantage of the existing context-switching semantics. 2482 2483 \subsection{Stack Management} 2484 One of the challenges of this system is to reduce the footprint as much as possible. 2485 Specifically, all @pthread@s created also have a stack created with them, which should be used as much as possible. 2486 Normally, coroutines also create their own stack to run on, however, in the case of the coroutines used for processors, these coroutines run directly on the \textbf{kthread} stack, effectively stealing the processor stack. 2487 The exception to this rule is the Main Processor, \ie the initial \textbf{kthread} that is given to any program. 2488 In order to respect C user expectations, the stack of the initial kernel thread, the main stack of the program, is used by the main user thread rather than the main processor, which can grow very large. 2489 2490 \subsection{Context Switching} 2491 As mentioned in section \ref{coroutine}, coroutines are a stepping stone for implementing threading, because they share the same mechanism for context-switching between different stacks. 2492 To improve performance and simplicity, context-switching is implemented using the following assumption: all context-switches happen inside a specific routine call. 2493 This assumption means that the context-switch only has to copy the callee-saved registers onto the stack and then switch the stack registers with the ones of the target coroutine/thread. 2494 Note that the instruction pointer can be left untouched since the context-switch is always inside the same routine 2495 Threads, however, do not context-switch between each other directly. 2496 They context-switch to the scheduler. 2497 This method is called a 2-step context-switch and has the advantage of having a clear distinction between user code and the kernel where scheduling and other system operations happen. 2498 Obviously, this doubles the context-switch cost because threads must context-switch to an intermediate stack. 2499 The alternative 1-step context-switch uses the stack of the ``from'' thread to schedule and then context-switches directly to the ``to'' thread. 2500 However, the performance of the 2-step context-switch is still superior to a @pthread_yield@ (see section \ref{results}). 2501 Additionally, for users in need for optimal performance, it is important to note that having a 2-step context-switch as the default does not prevent \CFA from offering a 1-step context-switch (akin to the Microsoft @SwitchToFiber@~\cite{switchToWindows} routine). 2502 This option is not currently present in \CFA, but the changes required to add it are strictly additive. 2503 2504 \subsection{Preemption} \label{preemption} 2505 Finally, an important aspect for any complete threading system is preemption. 
2506 As mentioned in section \ref{basics}, preemption introduces an extra degree of uncertainty, which enables users to have multiple threads interleave transparently, rather than having to cooperate among threads for proper scheduling and CPU distribution.
2507 Indeed, preemption is desirable because it adds a degree of isolation among threads.
2508 In a fully cooperative system, any thread that runs a long loop can starve other threads, while in a preemptive system, starvation can still occur but it does not rely on every thread having to yield or block on a regular basis, which reduces significantly a programmer burden.
2509 Obviously, preemption is not optimal for every workload.
2510 However any preemptive system can become a cooperative system by making the time slices extremely large.
2511 Therefore, \CFA uses a preemptive threading system.
2512
2513 Preemption in \CFA\footnote{Note that the implementation of preemption is strongly tied with the underlying threading system.
2514 For this reason, only the Linux implementation is cover, \CFA does not run on Windows at the time of writting} is based on kernel timers, which are used to run a discrete-event simulation.
2515 Every processor keeps track of the current time and registers an expiration time with the preemption system.
2516 When the preemption system receives a change in preemption, it inserts the time in a sorted order and sets a kernel timer for the closest one, effectively stepping through preemption events on each signal sent by the timer.
2517 These timers use the Linux signal {\tt SIGALRM}, which is delivered to the process rather than the kernel-thread.
2518 This results in an implementation problem, because when delivering signals to a process, the kernel can deliver the signal to any kernel thread for which the signal is not blocked, \ie:
2079
2080 To improve performance and simplicity, context switching occurs inside a routine call, so only callee-saved registers are copied onto the stack and then the stack register is switched;
2081 the corresponding registers are then restored for the other context.
2082 Note, the instruction pointer is untouched since the context switch is always inside the same routine.
2083 Unlike coroutines, threads do not context switch among each other;
2084 they context switch to the cluster scheduler.
2085 This method is a 2-step context-switch and provides a clear distinction between user and kernel code, where scheduling and other system operations happen.
2086 The alternative 1-step context-switch uses the \emph{from} thread's stack to schedule and then context-switches directly to the \emph{to} thread's stack.
2087 Experimental results (not presented) show the performance of these two approaches is virtually equivalent, because the 1-step performance is dominated by a lock instruction to prevent a race condition.
2088
2089 All kernel threads (@pthreads@) are created with a stack.
2090 Each \CFA virtual processor is implemented as a coroutine and these coroutines run directly on the kernel-thread stack, effectively stealing this stack.
2091 The exception to this rule is the program main, \ie the initial kernel thread that is given to any program.
2092 In order to respect C expectations, the stack of the initial kernel thread is used by program main rather than the main processor, allowing it to grow dynamically as in a normal C program.
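The save/switch/restore sequence can be illustrated with the POSIX @ucontext@ facility: @swapcontext@ performs the same steps, albeit saving more state (including the signal mask) and hence running slower than the runtime's small assembler routine.
The following self-contained C program is therefore an analogy for the \CFA context switch, not the actual implementation:
\begin{cfa}
#include <stdio.h>
#include <ucontext.h>

static ucontext_t main_ctx, coro_ctx;			// saved registers + stack pointer for each context
static char coro_stack[16 * 1024];				// fixed-size stack, as in the \CFA runtime

void coro() {
	for ( int i = 0; i < 3; i += 1 ) {
		printf( "resumed %d\n", i );
		swapcontext( &coro_ctx, &main_ctx );	// suspend: save this context, restore the resumer's
	}
}
int main() {
	getcontext( &coro_ctx );					// initialize a context, then redirect it to coro
	coro_ctx.uc_stack.ss_sp = coro_stack;
	coro_ctx.uc_stack.ss_size = sizeof( coro_stack );
	coro_ctx.uc_link = &main_ctx;				// continue here if coro ever returns
	makecontext( &coro_ctx, coro, 0 );
	for ( int i = 0; i < 3; i += 1 )
		swapcontext( &main_ctx, &coro_ctx );	// resume: the same save/switch/restore steps
	printf( "done\n" );
}
\end{cfa}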
2093
2094 Finally, an important aspect for a complete threading system is preemption, which introduces extra non-determinism via transparent interleaving, rather than relying on cooperation among threads for proper scheduling and for processor fairness in the presence of long-running threads.
2095 Because the preemption interval is usually long, \eg 1 millisecond, the performance cost is negligible.
2096 Preemption is normally handled by setting a count-down timer on each virtual processor.
2097 When the timer expires, an interrupt is delivered, and the interrupt handler resets the count-down timer. If the virtual processor is executing in user code, the signal handler performs a user-level context-switch; if it is executing in the language runtime-kernel, the preemption is ignored or rolled forward to the point where the runtime kernel context switches back to user code.
2098 Multiple signal handlers may be pending.
2099 When control eventually switches back to the signal handler, it returns normally, and execution continues in the interrupted user thread, even though the return from the signal handler may be on a different kernel thread than the one where the signal was delivered.
2100 The only issue with this approach is that signal masks from one kernel thread may be restored on another as part of returning from the signal handler;
2101 therefore, all virtual processors in a cluster need to have the same signal mask.
2102
2103 However, on current UNIX systems:
2519 2104 \begin{quote}
2520 2105 A process-directed signal may be delivered to any one of the threads that does not currently have the signal blocked.
…
2522 2107 SIGNAL(7) - Linux Programmer's Manual
2523 2108 \end{quote}
2524 For the sake of simplicity, and in order to prevent the case of having two threads receiving alarms simultaneously, \CFA programs block the {\tt SIGALRM} signal on every kernel thread except one.
2525
2526 Now because of how involuntary context-switches are handled, the kernel thread handling {\tt SIGALRM} cannot also be a processor thread.
2527 Hence, involuntary context-switching is done by sending signal {\tt SIGUSR1} to the corresponding proces\-sor and having the thread yield from inside the signal handler.
2528 This approach effectively context-switches away from the signal handler back to the kernel and the signal handler frame is eventually unwound when the thread is scheduled again.
2529 As a result, a signal handler can start on one kernel thread and terminate on a second kernel thread (but the same user thread).
2530 It is important to note that signal handlers save and restore signal masks because user-thread migration can cause a signal mask to migrate from one kernel thread to another.
2531 This behaviour is only a problem if all kernel threads, among which a user thread can migrate, differ in terms of signal masks\footnote{Sadly, official POSIX documentation is silent on what distinguishes ``async-signal-safe'' routines from other routines}.
2532 However, since the kernel thread handling preemption requires a different signal mask, executing user threads on the kernel-alarm thread can cause deadlocks.
2533 For this reason, the alarm thread is in a tight loop around a system call to @sigwaitinfo@, requiring very little CPU time for preemption.
2534 One final detail about the alarm thread is how to wake it when additional communication is required (\eg on thread termination).
2535 This unblocking is also done using {\tt SIGALRM}, but sent through the @pthread_sigqueue@.
2536 Indeed, @sigwait@ can differentiate signals sent from @pthread_sigqueue@ from signals sent from alarms or the kernel. 2537 2538 \subsection{Scheduler} 2539 Finally, an aspect that was not mentioned yet is the scheduling algorithm. 2540 Currently, the \CFA scheduler uses a single ready queue for all processors, which is the simplest approach to scheduling. 2541 Further discussion on scheduling is present in section \ref{futur:sched}. 2542 2543 % ====================================================================== 2544 % ====================================================================== 2545 \section{Internal Scheduling} \label{impl:intsched} 2546 % ====================================================================== 2547 % ====================================================================== 2548 The following figure is the traditional illustration of a monitor (repeated from page~\pageref{fig:ClassicalMonitor} for convenience): 2549 2550 \begin{figure} 2551 \begin{center} 2552 {\resizebox{0.4\textwidth}{!}{\input{monitor}}} 2553 \end{center} 2554 \caption{Traditional illustration of a monitor} 2555 \end{figure} 2556 2557 This picture has several components, the two most important being the entry queue and the AS-stack. 2558 The entry queue is an (almost) FIFO list where threads waiting to enter are parked, while the acceptor/signaller (AS) stack is a FILO list used for threads that have been signalled or otherwise marked as running next. 2559 2560 For \CFA, this picture does not have support for blocking multiple monitors on a single condition. 2561 To support bulk acquire two changes to this picture are required. 2562 First, it is no longer helpful to attach the condition to \emph{a single} monitor. 2563 Secondly, the thread waiting on the condition has to be separated across multiple monitors, seen in figure \ref{fig:monitor_cfa}. 2564 2565 \begin{figure} 2566 \begin{center} 2567 {\resizebox{0.8\textwidth}{!}{\input{int_monitor}}} 2568 \end{center} 2569 \caption{Illustration of \CFA Monitor} 2570 \label{fig:monitor_cfa} 2571 \end{figure} 2572 2573 This picture and the proper entry and leave algorithms (see listing \ref{f:entry2}) is the fundamental implementation of internal scheduling. 2574 Note that when a thread is moved from the condition to the AS-stack, it is conceptually split into N pieces, where N is the number of monitors specified in the parameter list. 2575 The thread is woken up when all the pieces have popped from the AS-stacks and made active. 2576 In this picture, the threads are split into halves but this is only because there are two monitors. 2577 For a specific signalling operation every monitor needs a piece of thread on its AS-stack. 2578 2579 \begin{figure} 2580 \begin{multicols}{2} 2581 Entry 2582 \begin{cfa} 2583 if monitor is free 2584 enter 2585 elif already own the monitor 2586 continue 2587 else 2588 block 2589 increment recursion 2590 2591 \end{cfa} 2592 \columnbreak 2593 Exit 2594 \begin{cfa} 2595 decrement recursion 2596 if recursion == 0 2597 if signal_stack not empty 2598 set_owner to thread 2599 if all monitors ready 2600 wake-up thread 2601 2602 if entry queue not empty 2603 wake-up thread 2604 \end{cfa} 2605 \end{multicols} 2606 \begin{cfa}[caption={Entry and exit routine for monitors with internal scheduling},label={f:entry2}] 2607 \end{cfa} 2608 \end{figure} 2609 2610 The solution discussed in \ref{s:InternalScheduling} can be seen in the exit routine of listing \ref{f:entry2}. 
2611 Basically, the solution boils down to having a separate data structure for the condition queue and the AS-stack, and unconditionally transferring ownership of the monitors but only unblocking the thread when the last monitor has transferred ownership. 2612 This solution is deadlock safe as well as preventing any potential barging. 2613 The data structures used for the AS-stack are reused extensively for external scheduling, but in the case of internal scheduling, the data is allocated using variable-length arrays on the call stack of the @wait@ and @signal_block@ routines. 2614 2615 \begin{figure} 2616 \begin{center} 2617 {\resizebox{0.8\textwidth}{!}{\input{monitor_structs.pstex_t}}} 2618 \end{center} 2619 \caption{Data structures involved in internal/external scheduling} 2620 \label{fig:structs} 2621 \end{figure} 2622 2623 Figure \ref{fig:structs} shows a high-level representation of these data structures. 2624 The main idea behind them is that, a thread cannot contain an arbitrary number of intrusive ``next'' pointers for linking onto monitors. 2625 The @condition node@ is the data structure that is queued onto a condition variable and, when signalled, the condition queue is popped and each @condition criterion@ is moved to the AS-stack. 2626 Once all the criteria have been popped from their respective AS-stacks, the thread is woken up, which is what is shown in listing \ref{f:entry2}. 2627 2628 % ====================================================================== 2629 % ====================================================================== 2630 \section{External Scheduling} 2631 % ====================================================================== 2632 % ====================================================================== 2633 Similarly to internal scheduling, external scheduling for multiple monitors relies on the idea that waiting-thread queues are no longer specific to a single monitor, as mentioned in section \ref{extsched}. 2634 For internal scheduling, these queues are part of condition variables, which are still unique for a given scheduling operation (\ie no signal statement uses multiple conditions). 2635 However, in the case of external scheduling, there is no equivalent object which is associated with @waitfor@ statements. 2636 This absence means the queues holding the waiting threads must be stored inside at least one of the monitors that is acquired. 2637 These monitors being the only objects that have sufficient lifetime and are available on both sides of the @waitfor@ statement. 2638 This requires an algorithm to choose which monitor holds the relevant queue. 2639 It is also important that said algorithm be independent of the order in which users list parameters. 2640 The proposed algorithm is to fall back on monitor lock ordering (sorting by address) and specify that the monitor that is acquired first is the one with the relevant waiting queue. 2641 This assumes that the lock acquiring order is static for the lifetime of all concerned objects but that is a reasonable constraint. 2642 2643 This algorithm choice has two consequences: 2644 \begin{itemize} 2645 \item The queue of the monitor with the lowest address is no longer a true FIFO queue because threads can be moved to the front of the queue. 2646 These queues need to contain a set of monitors for each of the waiting threads. 
2647 Therefore, another thread whose set contains the same lowest address monitor but different lower priority monitors may arrive first but enter the critical section after a thread with the correct pairing. 2648 \item The queue of the lowest priority monitor is both required and potentially unused. 2649 Indeed, since it is not known at compile time which monitor is the monitor which has the lowest address, every monitor needs to have the correct queues even though it is possible that some queues go unused for the entire duration of the program, for example if a monitor is only used in a specific pair. 2650 \end{itemize} 2651 Therefore, the following modifications need to be made to support external scheduling: 2652 \begin{itemize} 2653 \item The threads waiting on the entry queue need to keep track of which routine they are trying to enter, and using which set of monitors. 2654 The @mutex@ routine already has all the required information on its stack, so the thread only needs to keep a pointer to that information. 2655 \item The monitors need to keep a mask of acceptable routines. 2656 This mask contains for each acceptable routine, a routine pointer and an array of monitors to go with it. 2657 It also needs storage to keep track of which routine was accepted. 2658 Since this information is not specific to any monitor, the monitors actually contain a pointer to an integer on the stack of the waiting thread. 2659 Note that if a thread has acquired two monitors but executes a @waitfor@ with only one monitor as a parameter, setting the mask of acceptable routines to both monitors will not cause any problems since the extra monitor will not change ownership regardless. 2660 This becomes relevant when @when@ clauses affect the number of monitors passed to a @waitfor@ statement. 2661 \item The entry/exit routines need to be updated as shown in listing \ref{f:entry3}. 2662 \end{itemize} 2663 2664 \subsection{External Scheduling - Destructors} 2665 Finally, to support the ordering inversion of destructors, the code generation needs to be modified to use a special entry routine. 2666 This routine is needed because of the storage requirements of the call order inversion. 2667 Indeed, when waiting for the destructors, storage is needed for the waiting context and the lifetime of said storage needs to outlive the waiting operation it is needed for. 2668 For regular @waitfor@ statements, the call stack of the routine itself matches this requirement but it is no longer the case when waiting for the destructor since it is pushed on to the AS-stack for later. 
2669 The @waitfor@ semantics can then be adjusted correspondingly, as seen in listing \ref{f:entry-dtor} 2670 2671 \begin{figure} 2672 \begin{multicols}{2} 2673 Entry 2674 \begin{cfa} 2675 if monitor is free 2676 enter 2677 elif already own the monitor 2678 continue 2679 elif matches waitfor mask 2680 push criteria to AS-stack 2681 continue 2682 else 2683 block 2684 increment recursion 2685 \end{cfa} 2686 \columnbreak 2687 Exit 2688 \begin{cfa} 2689 decrement recursion 2690 if recursion == 0 2691 if signal_stack not empty 2692 set_owner to thread 2693 if all monitors ready 2694 wake-up thread 2695 endif 2696 endif 2697 2698 if entry queue not empty 2699 wake-up thread 2700 endif 2701 \end{cfa} 2702 \end{multicols} 2703 \begin{cfa}[caption={Entry and exit routine for monitors with internal scheduling and external scheduling},label={f:entry3}] 2704 \end{cfa} 2705 \end{figure} 2706 2707 \begin{figure} 2708 \begin{multicols}{2} 2709 Destructor Entry 2710 \begin{cfa} 2711 if monitor is free 2712 enter 2713 elif already own the monitor 2714 increment recursion 2715 return 2716 create wait context 2717 if matches waitfor mask 2718 reset mask 2719 push self to AS-stack 2720 baton pass 2721 else 2722 wait 2723 increment recursion 2724 \end{cfa} 2725 \columnbreak 2726 Waitfor 2727 \begin{cfa} 2728 if matching thread is already there 2729 if found destructor 2730 push destructor to AS-stack 2731 unlock all monitors 2732 else 2733 push self to AS-stack 2734 baton pass 2735 endif 2736 return 2737 endif 2738 if non-blocking 2739 Unlock all monitors 2740 Return 2741 endif 2742 2743 push self to AS-stack 2744 set waitfor mask 2745 block 2746 return 2747 \end{cfa} 2748 \end{multicols} 2749 \begin{cfa}[caption={Pseudo code for the \protect\lstinline|waitfor| routine and the \protect\lstinline|mutex| entry routine for destructors},label={f:entry-dtor}] 2750 \end{cfa} 2751 \end{figure} 2752 2753 2754 % ====================================================================== 2755 % ====================================================================== 2756 \section{Putting It All Together} 2757 % ====================================================================== 2758 % ====================================================================== 2759 2760 2761 \section{Threads As Monitors} 2762 As it was subtly alluded in section \ref{threads}, @thread@s in \CFA are in fact monitors, which means that all monitor features are available when using threads. 
2763 For example, here is a very simple two thread pipeline that could be used for a simulator of a game engine: 2764 \begin{figure} 2765 \begin{cfa}[caption={Toy simulator using \protect\lstinline|thread|s and \protect\lstinline|monitor|s.},label={f:engine-v1}] 2766 // Visualization declaration 2767 thread Renderer {} renderer; 2768 Frame * simulate( Simulator & this ); 2769 2770 // Simulation declaration 2771 thread Simulator{} simulator; 2772 void render( Renderer & this ); 2773 2774 // Blocking call used as communication 2775 void draw( Renderer & mutex this, Frame * frame ); 2776 2777 // Simulation loop 2778 void main( Simulator & this ) { 2779 while( true ) { 2780 Frame * frame = simulate( this ); 2781 draw( renderer, frame ); 2782 } 2783 } 2784 2785 // Rendering loop 2786 void main( Renderer & this ) { 2787 while( true ) { 2788 waitfor( draw, this ); 2789 render( this ); 2790 } 2791 } 2792 \end{cfa} 2793 \end{figure} 2794 One of the obvious complaints of the previous code snippet (other than its toy-like simplicity) is that it does not handle exit conditions and just goes on forever. 2795 Luckily, the monitor semantics can also be used to clearly enforce a shutdown order in a concise manner: 2796 \begin{figure} 2797 \begin{cfa}[caption={Same toy simulator with proper termination condition.},label={f:engine-v2}] 2798 // Visualization declaration 2799 thread Renderer {} renderer; 2800 Frame * simulate( Simulator & this ); 2801 2802 // Simulation declaration 2803 thread Simulator{} simulator; 2804 void render( Renderer & this ); 2805 2806 // Blocking call used as communication 2807 void draw( Renderer & mutex this, Frame * frame ); 2808 2809 // Simulation loop 2810 void main( Simulator & this ) { 2811 while( true ) { 2812 Frame * frame = simulate( this ); 2813 draw( renderer, frame ); 2814 2815 // Exit main loop after the last frame 2816 if( frame->is_last ) break; 2817 } 2818 } 2819 2820 // Rendering loop 2821 void main( Renderer & this ) { 2822 while( true ) { 2823 waitfor( draw, this ); 2824 or waitfor( ^?{}, this ) { 2825 // Add an exit condition 2826 break; 2827 } 2828 2829 render( this ); 2830 } 2831 } 2832 2833 // Call destructor for simulator once simulator finishes 2834 // Call destructor for renderer to signify shutdown 2835 \end{cfa} 2836 \end{figure} 2837 2838 \section{Fibers \& Threads} 2839 As mentioned in section \ref{preemption}, \CFA uses preemptive threads by default but can use fibers on demand. 2840 Currently, using fibers is done by adding the following line of code to the program~: 2841 \begin{cfa} 2842 unsigned int default_preemption() { 2843 return 0; 2844 } 2845 \end{cfa} 2846 This routine is called by the kernel to fetch the default preemption rate, where 0 signifies an infinite time-slice, \ie no preemption. 
2847 However, once clusters are fully implemented, it will be possible to create fibers and \textbf{uthread} in the same system, as in listing \ref{f:fiber-uthread} 2848 \begin{figure} 2849 \lstset{language=CFA,deletedelim=**[is][]{`}{`}} 2850 \begin{cfa}[caption={Using fibers and \textbf{uthread} side-by-side in \CFA},label={f:fiber-uthread}] 2851 // Cluster forward declaration 2852 struct cluster; 2853 2854 // Processor forward declaration 2855 struct processor; 2856 2857 // Construct clusters with a preemption rate 2858 void ?{}(cluster& this, unsigned int rate); 2859 // Construct processor and add it to cluster 2860 void ?{}(processor& this, cluster& cluster); 2861 // Construct thread and schedule it on cluster 2862 void ?{}(thread& this, cluster& cluster); 2863 2864 // Declare two clusters 2865 cluster thread_cluster = { 10`ms }; // Preempt every 10 ms 2866 cluster fibers_cluster = { 0 }; // Never preempt 2867 2868 // Construct 4 processors 2869 processor processors[4] = { 2870 //2 for the thread cluster 2871 thread_cluster; 2872 thread_cluster; 2873 //2 for the fibers cluster 2874 fibers_cluster; 2875 fibers_cluster; 2876 }; 2877 2878 // Declares thread 2879 thread UThread {}; 2880 void ?{}(UThread& this) { 2881 // Construct underlying thread to automatically 2882 // be scheduled on the thread cluster 2883 (this){ thread_cluster } 2884 } 2885 2886 void main(UThread & this); 2887 2888 // Declares fibers 2889 thread Fiber {}; 2890 void ?{}(Fiber& this) { 2891 // Construct underlying thread to automatically 2892 // be scheduled on the fiber cluster 2893 (this.__thread){ fibers_cluster } 2894 } 2895 2896 void main(Fiber & this); 2897 \end{cfa} 2898 \end{figure} 2899 2900 2901 % ====================================================================== 2902 % ====================================================================== 2903 \section{Performance Results} \label{results} 2904 % ====================================================================== 2905 % ====================================================================== 2906 \section{Machine Setup} 2907 Table \ref{tab:machine} shows the characteristics of the machine used to run the benchmarks. 2908 All tests were made on this machine. 2109 Hence, the timer-expiry signal, which is generated \emph{externally} by the UNIX kernel to the UNIX process, is delivered to any of its UNIX subprocesses (kernel threads). 2110 To ensure each virtual processor receives its own preemption signals, a discrete-event simulation is run on a special virtual processor, and only it sets and receives timer events. 2111 Virtual processors register an expiration time with the discrete-event simulator, which is inserted in sorted order. 2112 The simulation sets the count-down timer to the value at the head of the event list, and when the timer expires, all events less than or equal to the current time are processed. 2113 Processing a preemption event sends an \emph{internal} @SIGUSR1@ signal to the registered virtual processor, which is always delivered to that processor. 2114 2115 2116 \section{Performance} 2117 \label{results} 2118 2119 To verify the implementation of the \CFA runtime, a series of microbenchmarks are performed comparing \CFA with other widely used programming languages with concurrency. 2120 Table~\ref{t:machine} shows the specifications of the computer used to run the benchmarks, and the versions of the software used in the comparison. 
2121 2909 2122 \begin{table} 2910 \begin{center} 2911 \begin{tabular}{| l | r | l | r |} 2123 \centering 2124 \caption{Experiment environment} 2125 \label{t:machine} 2126 2127 \begin{tabular}{|l|r||l|r|} 2912 2128 \hline 2913 Architecture & x86\_64 & NUMA node(s) & 8 \\2129 Architecture & x86\_64 & NUMA node(s) & 8 \\ 2914 2130 \hline 2915 CPU op-mode(s) & 32-bit, 64-bit & Model name & AMD Opteron\texttrademark 2131 CPU op-mode(s) & 32-bit, 64-bit & Model name & AMD Opteron\texttrademark\ Processor 6380 \\ 2916 2132 \hline 2917 Byte Order & Little Endian & CPU Freq & 2.5 \si{\giga\hertz}\\2133 Byte Order & Little Endian & CPU Freq & 2.5 GHz \\ 2918 2134 \hline 2919 CPU(s) & 64 & L1d cache & \SI{16}{\kibi\byte}\\2135 CPU(s) & 64 & L1d cache & 16 KiB \\ 2920 2136 \hline 2921 Thread(s) per core & 2 & L1i cache & \SI{64}{\kibi\byte}\\2137 Thread(s) per core & 2 & L1i cache & 64 KiB \\ 2922 2138 \hline 2923 Core(s) per socket & 8 & L2 cache & \SI{2048}{\kibi\byte}\\2139 Core(s) per socket & 8 & L2 cache & 2048 KiB \\ 2924 2140 \hline 2925 Socket(s) & 4 & L3 cache & \SI{6144}{\kibi\byte}\\2141 Socket(s) & 4 & L3 cache & 6144 KiB \\ 2926 2142 \hline 2927 2143 \hline 2928 Operating system 2144 Operating system & Ubuntu 16.04.3 LTS & Kernel & Linux 4.4-97-generic \\ 2929 2145 \hline 2930 Compiler & GCC 6.3 & Translator & CFA 1\\2146 gcc & 6.3 & \CFA & 1.0.0 \\ 2931 2147 \hline 2932 Java version & OpenJDK-9 & Go version& 1.9.2 \\2148 Java & OpenJDK-9 & Go & 1.9.2 \\ 2933 2149 \hline 2934 2150 \end{tabular} 2935 \end{center}2936 \caption{Machine setup used for the tests}2937 \label{tab:machine}2938 2151 \end{table} 2939 2152 2940 \section{Micro Benchmarks} 2941 All benchmarks are run using the same harness to produce the results, seen as the @BENCH()@ macro in the following examples. 2942 This macro uses the following logic to benchmark the code: 2943 \begin{cfa} 2944 #define BENCH(run, result) \ 2945 before = gettime(); \ 2946 run; \ 2947 after = gettime(); \ 2948 result = (after - before) / N; 2949 \end{cfa} 2950 The method used to get time is @clock_gettime(CLOCK_THREAD_CPUTIME_ID);@. 2951 Each benchmark is using many iterations of a simple call to measure the cost of the call. 2952 The specific number of iterations depends on the specific benchmark. 2953 2954 \subsection{Context-Switching} 2955 The first interesting benchmark is to measure how long context-switches take. 2956 The simplest approach to do this is to yield on a thread, which executes a 2-step context switch. 2957 Yielding causes the thread to context-switch to the scheduler and back, more precisely: from the \textbf{uthread} to the \textbf{kthread} then from the \textbf{kthread} back to the same \textbf{uthread} (or a different one in the general case). 2958 In order to make the comparison fair, coroutines also execute a 2-step context-switch by resuming another coroutine which does nothing but suspending in a tight loop, which is a resume/suspend cycle instead of a yield. 2959 Figure~\ref{f:ctx-switch} shows the code for coroutines and threads with the results in table \ref{tab:ctx-switch}. 2960 All omitted tests are functionally identical to one of these tests. 2961 The difference between coroutines and threads can be attributed to the cost of scheduling. 
2153 All benchmarks are run using the following harness: 2154 \begin{cfa} 2155 unsigned int N = 10_000_000; 2156 #define BENCH( run ) Time before = getTimeNsec(); run; Duration result = (getTimeNsec() - before) / N; 2157 \end{cfa} 2158 The method used to get time is @clock_gettime( CLOCK_REALTIME )@. 2159 Each benchmark is performed @N@ times, where @N@ varies depending on the benchmark, the total time is divided by @N@ to obtain the average time for a benchmark. 2160 All omitted tests for other languages are functionally identical to the shown \CFA test. 2161 2162 2163 \paragraph{Context-Switching} 2164 2165 In procedural programming, the cost of a routine call is important as modularization (refactoring) increases. 2166 (In many cases, a compiler inlines routine calls to eliminate this cost.) 2167 Similarly, when modularization extends to coroutines/tasks, the time for a context switch becomes a relevant factor. 2168 The coroutine context-switch is 2-step using resume/suspend, \ie from resumer to suspender and from suspender to resumer. 2169 The thread context switch is 2-step using yield, \ie enter and return from the runtime kernel. 2170 Figure~\ref{f:ctx-switch} shows the code for coroutines/threads with all results in Table~\ref{tab:ctx-switch}. 2171 The difference in performance between coroutine and thread context-switch is the cost of scheduling for threads, whereas coroutines are self-scheduling. 2172 2962 2173 \begin{figure} 2963 \ begin{multicols}{2}2964 \CFA Coroutines 2965 \ begin{cfa}2966 coroutine GreatSuspender {}; 2967 void main(GreatSuspender& this) { 2968 while(true) { suspend(); } 2969 }2174 \lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}} 2175 2176 \newbox\myboxA 2177 \begin{lrbox}{\myboxA} 2178 \begin{cfa}[aboveskip=0pt,belowskip=0pt] 2179 coroutine C {} c; 2180 void main( C & ) { for ( ;; ) { @suspend();@ } } 2970 2181 int main() { 2971 GreatSuspender s;2972 resume(s);2973 2182 BENCH( 2974 for(size_t i=0; i<n; i++) { 2975 resume(s); 2976 }, 2977 result 2978 ) 2979 printf("%llu\n", result); 2980 } 2981 \end{cfa} 2982 \columnbreak 2983 \CFA Threads 2984 \begin{cfa} 2985 2986 2183 for ( size_t i = 0; i < N; i += 1 ) { @resume( c );@ } ) 2184 sout | result`ns | endl; 2185 } 2186 \end{cfa} 2187 \end{lrbox} 2188 2189 \newbox\myboxB 2190 \begin{lrbox}{\myboxB} 2191 \begin{cfa}[aboveskip=0pt,belowskip=0pt] 2987 2192 2988 2193 2989 2194 int main() { 2990 2991 2992 2195 BENCH( 2993 for(size_t i=0; i<n; i++) { 2994 yield(); 2995 }, 2996 result 2997 ) 2998 printf("%llu\n", result); 2999 } 3000 \end{cfa} 3001 \end{multicols} 3002 \begin{cfa}[caption={\CFA benchmark code used to measure context-switches for coroutines and threads.},label={f:ctx-switch}] 3003 \end{cfa} 3004 \end{figure} 3005 3006 \begin{table} 3007 \begin{center} 3008 \begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |} 2196 for ( size_t i = 0; i < N; i += 1 ) { @yield();@ } ) 2197 sout | result`ns | endl; 2198 } 2199 \end{cfa} 2200 \end{lrbox} 2201 2202 \subfloat[Coroutine]{\usebox\myboxA} 2203 \quad 2204 \subfloat[Thread]{\usebox\myboxB} 2205 \captionof{figure}{\CFA context-switch benchmark} 2206 \label{f:ctx-switch} 2207 2208 \centering 2209 2210 \captionof{table}{Context switch comparison (nanoseconds)} 2211 \label{tab:ctx-switch} 2212 \bigskip 2213 \begin{tabular}{|r|*{3}{D{.}{.}{3.2}|}} 3009 2214 \cline{2-4} 3010 \multicolumn{1}{c |}{} & 
\multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\2215 \multicolumn{1}{c|}{} & \multicolumn{1}{c|}{Median} &\multicolumn{1}{c|}{Average} & \multicolumn{1}{c|}{Std Dev} \\ 3011 2216 \hline 3012 Kernel Thread & 241.5 & 243.86 & 5.08 \\2217 Kernel Thread & 241.5 & 243.86 & 5.08 \\ 3013 2218 \CFA Coroutine & 38 & 38 & 0 \\ 3014 2219 \CFA Thread & 103 & 102.96 & 2.96 \\ 3015 \uC Coroutine & 46 & 45.86 & 0.35 \\3016 \uC Thread & 98 & 99.11 & 1.42 \\2220 \uC Coroutine & 46 & 45.86 & 0.35 \\ 2221 \uC Thread & 98 & 99.11 & 1.42 \\ 3017 2222 Goroutine & 150 & 149.96 & 3.16 \\ 3018 2223 Java Thread & 289 & 290.68 & 8.72 \\ 3019 2224 \hline 3020 2225 \end{tabular} 3021 \end{center} 3022 \caption{Context Switch comparison. 3023 All numbers are in nanoseconds(\si{\nano\second})} 3024 \label{tab:ctx-switch} 3025 \end{table} 3026 3027 \subsection{Mutual-Exclusion} 3028 The next interesting benchmark is to measure the overhead to enter/leave a critical-section. 3029 For monitors, the simplest approach is to measure how long it takes to enter and leave a monitor routine. 3030 Figure~\ref{f:mutex} shows the code for \CFA. 3031 To put the results in context, the cost of entering a non-inline routine and the cost of acquiring and releasing a @pthread_mutex@ lock is also measured. 3032 The results can be shown in table \ref{tab:mutex}. 3033 3034 \begin{figure} 3035 \begin{cfa}[caption={\CFA benchmark code used to measure mutex routines.},label={f:mutex}] 3036 monitor M {}; 3037 void __attribute__((noinline)) call( M & mutex m /*, m2, m3, m4*/ ) {} 3038 2226 2227 \bigskip 2228 \bigskip 2229 2230 \lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}} 2231 \begin{cfa} 2232 monitor M {} m1/*, m2, m3, m4*/; 2233 void __attribute__((noinline)) do_call( M & mutex m/*, m2, m3, m4*/ ) {} 3039 2234 int main() { 3040 M m/*, m2, m3, m4*/; 3041 BENCH( 3042 for(size_t i=0; i<n; i++) { 3043 call(m/*, m2, m3, m4*/); 3044 }, 3045 result 3046 ) 3047 printf("%llu\n", result); 3048 } 3049 \end{cfa} 3050 \end{figure} 3051 3052 \begin{table} 3053 \begin{center} 3054 \begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |} 2235 BENCH( for( size_t i = 0; i < N; i += 1 ) { @do_call( m1/*, m2, m3, m4*/ );@ } ) 2236 sout | result`ns | endl; 2237 } 2238 \end{cfa} 2239 \captionof{figure}{\CFA acquire/release mutex benchmark} 2240 \label{f:mutex} 2241 2242 \centering 2243 2244 \captionof{table}{Mutex comparison (nanoseconds)} 2245 \label{tab:mutex} 2246 \bigskip 2247 2248 \begin{tabular}{|r|*{3}{D{.}{.}{3.2}|}} 3055 2249 \cline{2-4} 3056 \multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\2250 \multicolumn{1}{c|}{} & \multicolumn{1}{c|}{Median} &\multicolumn{1}{c|}{Average} & \multicolumn{1}{c|}{Std Dev} \\ 3057 2251 \hline 3058 C routine & 2 & 2& 0 \\3059 FetchAdd + FetchSub & 26 & 26 & 0 \\3060 Pthreads Mutex Lock & 31 & 31.86& 0.99 \\2252 C routine & 2 & 2 & 0 \\ 2253 FetchAdd + FetchSub & 26 & 26 & 0 \\ 2254 Pthreads Mutex Lock & 31 & 31.86 & 0.99 \\ 3061 2255 \uC @monitor@ member routine & 30 & 30 & 0 \\ 3062 \CFA @mutex@ routine, 1 argument & 41 & 41.57 & 0.9 \\3063 \CFA @mutex@ routine, 2 argument & 76 & 76.96 & 1.57 \\2256 \CFA @mutex@ routine, 1 argument & 41 & 41.57 & 0.9 \\ 2257 \CFA @mutex@ routine, 2 argument & 76 & 76.96 & 1.57 \\ 3064 2258 
\CFA @mutex@ routine, 4 argument & 145 & 146.68 & 3.85 \\ 3065 Java synchronized routine & 27 & 28.57 & 2.6 \\2259 Java synchronized routine & 27 & 28.57 & 2.6 \\ 3066 2260 \hline 3067 2261 \end{tabular} 3068 \end{center} 3069 \caption{Mutex routine comparison. 3070 All numbers are in nanoseconds(\si{\nano\second})} 3071 \label{tab:mutex} 3072 \end{table} 3073 3074 \subsection{Internal Scheduling} 3075 The internal-scheduling benchmark measures the cost of waiting on and signalling a condition variable. 3076 Figure~\ref{f:int-sched} shows the code for \CFA, with results table \ref{tab:int-sched}. 3077 As with all other benchmarks, all omitted tests are functionally identical to one of these tests. 2262 \end{figure} 2263 2264 2265 \paragraph{Mutual-Exclusion} 2266 2267 Mutual exclusion is measured by entering/leaving a critical section. 2268 For monitors, entering and leaving a monitor routine is measured. 2269 Figure~\ref{f:mutex} shows the code for \CFA with all results in Table~\ref{tab:mutex}. 2270 To put the results in context, the cost of entering a non-inline routine and the cost of acquiring and releasing a @pthread_mutex@ lock is also measured. 2271 Note, the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects. 2272 2273 2274 \paragraph{Internal Scheduling} 2275 2276 Internal scheduling is measured by waiting on and signalling a condition variable. 2277 Figure~\ref{f:int-sched} shows the code for \CFA, with results in Table~\ref{tab:int-sched}. 2278 Note, the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects. 3078 2279 3079 2280 \begin{figure} 3080 \begin{cfa}[caption={Benchmark code for internal scheduling},label={f:int-sched}] 2281 \lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}} 2282 \begin{cfa} 3081 2283 volatile int go = 0; 3082 2284 condition c; 3083 monitor M {}; 3084 M m1; 3085 3086 void __attribute__((noinline)) do_call( M & mutex a1 ) { signal(c); } 3087 2285 monitor M {} m; 2286 void __attribute__((noinline)) do_call( M & mutex a1 ) { signal( c ); } 3088 2287 thread T {}; 3089 void ^?{}( T & mutex this ) {}3090 2288 void main( T & this ) { 3091 while(go == 0) { yield(); } 3092 while(go == 1) { do_call(m1); } 3093 } 3094 int __attribute__((noinline)) do_wait( M & mutex a1 ) { 3095 go = 1; 3096 BENCH( 3097 for(size_t i=0; i<n; i++) { 3098 wait(c); 3099 }, 3100 result 3101 ) 3102 printf("%llu\n", result); 3103 go = 0; 3104 return 0; 2289 while ( go == 0 ) { yield(); } // wait for other thread to start 2290 while ( go == 1 ) { @do_call( m );@ } 2291 } 2292 int __attribute__((noinline)) do_wait( M & mutex m ) { 2293 go = 1; // continue other thread 2294 BENCH( for ( size_t i = 0; i < N; i += 1 ) { @wait( c );@ } ); 2295 go = 0; // stop other thread 2296 sout | result`ns | endl; 3105 2297 } 3106 2298 int main() { 3107 2299 T t; 3108 return do_wait(m1); 3109 } 3110 \end{cfa} 3111 \end{figure} 3112 3113 \begin{table} 3114 \begin{center} 3115 \begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |} 2300 do_wait( m ); 2301 } 2302 \end{cfa} 2303 \captionof{figure}{\CFA Internal scheduling benchmark} 2304 \label{f:int-sched} 2305 2306 \centering 2307 \captionof{table}{Internal scheduling comparison (nanoseconds)} 2308 \label{tab:int-sched} 2309 \bigskip 2310 2311 \begin{tabular}{|r|*{3}{D{.}{.}{5.2}|}} 3116 2312 
\cline{2-4} 3117 \multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\2313 \multicolumn{1}{c|}{} & \multicolumn{1}{c|}{Median} &\multicolumn{1}{c|}{Average} & \multicolumn{1}{c|}{Std Dev} \\ 3118 2314 \hline 3119 Pthreads Condition Variable 3120 \uC @signal@ & 322 & 323 & 3.36 \\3121 \CFA @signal@, 1 @monitor@ & 352.5& 353.11 & 3.66 \\3122 \CFA @signal@, 2 @monitor@ & 430 & 430.29 & 8.97 \\3123 \CFA @signal@, 4 @monitor@ & 594.5& 606.57 & 18.33 \\3124 Java @notify@ & 13831.5 & 15698.21 & 4782.3 \\2315 Pthreads Condition Variable & 5902.5 & 6093.29 & 714.78 \\ 2316 \uC @signal@ & 322 & 323 & 3.36 \\ 2317 \CFA @signal@, 1 @monitor@ & 352.5 & 353.11 & 3.66 \\ 2318 \CFA @signal@, 2 @monitor@ & 430 & 430.29 & 8.97 \\ 2319 \CFA @signal@, 4 @monitor@ & 594.5 & 606.57 & 18.33 \\ 2320 Java @notify@ & 13831.5 & 15698.21 & 4782.3 \\ 3125 2321 \hline 3126 2322 \end{tabular} 3127 \end{center} 3128 \caption{Internal scheduling comparison. 3129 All numbers are in nanoseconds(\si{\nano\second})} 3130 \label{tab:int-sched} 3131 \end{table} 3132 3133 \subsection{External Scheduling} 3134 The Internal scheduling benchmark measures the cost of the @waitfor@ statement (@_Accept@ in \uC). 3135 Figure~\ref{f:ext-sched} shows the code for \CFA, with results in table \ref{tab:ext-sched}. 3136 As with all other benchmarks, all omitted tests are functionally identical to one of these tests. 2323 \end{figure} 2324 2325 2326 \paragraph{External Scheduling} 2327 2328 External scheduling is measured by accepting a call using the @waitfor@ statement (@_Accept@ in \uC). 2329 Figure~\ref{f:ext-sched} shows the code for \CFA, with results in Table~\ref{tab:ext-sched}. 2330 Note, the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects. 
3137 2331 3138 2332 \begin{figure} 3139 \begin{cfa}[caption={Benchmark code for external scheduling},label={f:ext-sched}] 2333 \lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}} 2334 \begin{cfa} 3140 2335 volatile int go = 0; 3141 monitor M {}; 3142 M m1; 2336 monitor M {} m; 3143 2337 thread T {}; 3144 3145 void __attribute__((noinline)) do_call( M & mutex a1 ) {} 3146 3147 void ^?{}( T & mutex this ) {} 3148 void main( T & this ) { 3149 while(go == 0) { yield(); } 3150 while(go == 1) { do_call(m1); } 3151 } 3152 int __attribute__((noinline)) do_wait( M & mutex a1 ) { 3153 go = 1; 3154 BENCH( 3155 for(size_t i=0; i<n; i++) { 3156 waitfor(call, a1); 3157 }, 3158 result 3159 ) 3160 printf("%llu\n", result); 3161 go = 0; 3162 return 0; 2338 void __attribute__((noinline)) do_call( M & mutex ) {} 2339 void main( T & ) { 2340 while ( go == 0 ) { yield(); } // wait for other thread to start 2341 while ( go == 1 ) { @do_call( m );@ } 2342 } 2343 int __attribute__((noinline)) do_wait( M & mutex m ) { 2344 go = 1; // continue other thread 2345 BENCH( for ( size_t i = 0; i < N; i += 1 ) { @waitfor( do_call, m );@ } ) 2346 go = 0; // stop other thread 2347 sout | result`ns | endl; 3163 2348 } 3164 2349 int main() { 3165 2350 T t; 3166 return do_wait(m1); 3167 } 3168 \end{cfa} 3169 \end{figure} 3170 3171 \begin{table} 3172 \begin{center} 3173 \begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |} 2351 do_wait( m ); 2352 } 2353 \end{cfa} 2354 \captionof{figure}{\CFA external scheduling benchmark} 2355 \label{f:ext-sched} 2356 2357 \centering 2358 2359 \captionof{table}{External scheduling comparison (nanoseconds)} 2360 \label{tab:ext-sched} 2361 \bigskip 2362 \begin{tabular}{|r|*{3}{D{.}{.}{3.2}|}} 3174 2363 \cline{2-4} 3175 \multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\2364 \multicolumn{1}{c|}{} & \multicolumn{1}{c|}{Median} &\multicolumn{1}{c|}{Average} & \multicolumn{1}{c|}{Std Dev} \\ 3176 2365 \hline 3177 \uC @ Accept@& 350 & 350.61 & 3.11 \\3178 \CFA @waitfor@, 1 @monitor@ & 358.5 & 358.36 & 3.82 \\2366 \uC @_Accept@ & 350 & 350.61 & 3.11 \\ 2367 \CFA @waitfor@, 1 @monitor@ & 358.5 & 358.36 & 3.82 \\ 3179 2368 \CFA @waitfor@, 2 @monitor@ & 422 & 426.79 & 7.95 \\ 3180 \CFA @waitfor@, 4 @monitor@ & 579.5 & 585.46 & 11.25 \\2369 \CFA @waitfor@, 4 @monitor@ & 579.5 & 585.46 & 11.25 \\ 3181 2370 \hline 3182 2371 \end{tabular} 3183 \end{center} 3184 \caption{External scheduling comparison. 3185 All numbers are in nanoseconds(\si{\nano\second})} 3186 \label{tab:ext-sched} 3187 \end{table} 3188 3189 3190 \subsection{Object Creation} 3191 Finally, the last benchmark measures the cost of creation for concurrent objects. 3192 Figure~\ref{f:creation} shows the code for @pthread@s and \CFA threads, with results shown in table \ref{tab:creation}. 3193 As with all other benchmarks, all omitted tests are functionally identical to one of these tests. 3194 The only note here is that the call stacks of \CFA coroutines are lazily created, therefore without priming the coroutine, the creation cost is very low. 
3195 3196 \begin{figure} 3197 \begin{center} 3198 @pthread@ 3199 \begin{cfa} 2372 2373 \bigskip 2374 \medskip 2375 2376 \lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}} 2377 \begin{cfa} 2378 thread MyThread {}; 2379 void main( MyThread & ) {} 3200 2380 int main() { 3201 BENCH( 3202 for(size_t i=0; i<n; i++) { 3203 pthread_t thread; 3204 if(pthread_create(&thread,NULL,foo,NULL)<0) { 3205 perror( "failure" ); 3206 return 1; 3207 } 3208 3209 if(pthread_join(thread, NULL)<0) { 3210 perror( "failure" ); 3211 return 1; 3212 } 3213 }, 3214 result 3215 ) 3216 printf("%llu\n", result); 3217 } 3218 \end{cfa} 3219 3220 3221 3222 \CFA Threads 3223 \begin{cfa} 3224 int main() { 3225 BENCH( 3226 for(size_t i=0; i<n; i++) { 3227 MyThread m; 3228 }, 3229 result 3230 ) 3231 printf("%llu\n", result); 3232 } 3233 \end{cfa} 3234 \end{center} 3235 \caption{Benchmark code for \protect\lstinline|pthread|s and \CFA to measure object creation} 2381 BENCH( for ( size_t i = 0; i < N; i += 1 ) { @MyThread m;@ } ) 2382 sout | result`ns | endl; 2383 } 2384 \end{cfa} 2385 \captionof{figure}{\CFA object creation benchmark} 3236 2386 \label{f:creation} 3237 \end{figure} 3238 3239 \begin{table} 3240 \begin{center} 3241 \begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |} 2387 2388 \centering 2389 2390 \captionof{table}{Creation comparison (nanoseconds)} 2391 \label{tab:creation} 2392 \bigskip 2393 2394 \begin{tabular}{|r|*{3}{D{.}{.}{5.2}|}} 3242 2395 \cline{2-4} 3243 \multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\2396 \multicolumn{1}{c|}{} & \multicolumn{1}{c|}{Median} & \multicolumn{1}{c|}{Average} & \multicolumn{1}{c|}{Std Dev} \\ 3244 2397 \hline 3245 Pthreads & 26996& 26984.71 & 156.6 \\3246 \CFA Coroutine Lazy & 6 & 5.71& 0.45 \\2398 Pthreads & 26996 & 26984.71 & 156.6 \\ 2399 \CFA Coroutine Lazy & 6 & 5.71 & 0.45 \\ 3247 2400 \CFA Coroutine Eager & 708 & 706.68 & 4.82 \\ 3248 \CFA Thread & 1173.5 & 1176.18 & 15.18 \\3249 \uC Coroutine & 109 & 107.46 & 1.74 \\3250 \uC Thread & 526 & 530.89 & 9.73 \\3251 Goroutine & 2520.5 & 2530.93 & 61,56 \\3252 Java Thread & 91114.5 & 92272.79 & 961.58 \\2401 \CFA Thread & 1173.5 & 1176.18 & 15.18 \\ 2402 \uC Coroutine & 109 & 107.46 & 1.74 \\ 2403 \uC Thread & 526 & 530.89 & 9.73 \\ 2404 Goroutine & 2520.5 & 2530.93 & 61.56 \\ 2405 Java Thread & 91114.5 & 92272.79 & 961.58 \\ 3253 2406 \hline 3254 2407 \end{tabular} 3255 \end{center} 3256 \caption{Creation comparison. 3257 All numbers are in nanoseconds(\si{\nano\second}).} 3258 \label{tab:creation} 3259 \end{table} 3260 2408 \end{figure} 2409 2410 2411 \paragraph{Object Creation} 2412 2413 Object creation is measured by creating/deleting the specific kind of concurrent object. 2414 Figure~\ref{f:creation} shows the code for \CFA, with results in Table~\ref{tab:creation}. 2415 The only note here is that the call stacks of \CFA coroutines are lazily created, therefore without priming the coroutine to force stack creation, the creation cost is artificially low. 3261 2416 3262 2417 3263 2418 \section{Conclusion} 3264 This paper has achieved a minimal concurrency \textbf{api} that is simple, efficient and usable as the basis for higher-level features. 3265 The approach presented is based on a lightweight thread-system for parallelism, which sits on top of clusters of processors. 
3266 This M:N model is judged to be both more efficient and allow more flexibility for users.
3267 Furthermore, this document introduces monitors as the main concurrency tool for users.
3268 This paper also offers a novel approach allowing multiple monitors to be accessed simultaneously without running into the Nested Monitor Problem~\cite{Lister77}.
3269 It also offers a full implementation of the concurrency runtime written entirely in \CFA, effectively the largest \CFA code base to date.
3270
3271
3272 % ======================================================================
3273 % ======================================================================
2419
2420 This paper demonstrates a concurrency API that is simple, efficient, and able to build higher-level concurrency features.
2421 The approach provides concurrency based on a preemptive M:N user-level threading-system, executing in clusters, which encapsulate scheduling of work on multiple kernel threads providing parallelism.
2422 The M:N model is judged to be efficient and provide greater flexibility than a 1:1 threading model.
2423 High-level objects (monitor/task) are the core mechanism for mutual exclusion and synchronization.
2424 A novel aspect is allowing multiple mutex-objects to be accessed simultaneously, reducing the potential for deadlock in this complex scenario.
2425 These concepts and the entire \CFA runtime-system are written in the \CFA language, demonstrating its expressiveness.
2426 Performance comparisons with other concurrent systems/languages show the \CFA approach is competitive across all low-level operations, which translates directly into good performance in well-written concurrent applications.
2427 C programmers should feel comfortable using these mechanisms for developing concurrent applications, with the ability to obtain maximum available performance by choosing mechanisms at the appropriate level.
2428
2429
3274 2430 \section{Future Work}
3275 % ======================================================================
3276 % ======================================================================
3277
3278 \subsection{Performance} \label{futur:perf}
3279 This paper presents a first implementation of the \CFA concurrency runtime.
3280 Therefore, there is still significant work to improve performance.
3281 Many of the data structures and algorithms may change in the future to more efficient versions.
3282 For example, the number of monitors in a single bulk acquire is only bound by the stack size, this is probably unnecessarily generous.
3283 It may be possible that limiting the number helps increase performance.
3284 However, it is not obvious that the benefit would be significant.
3285
3286 \subsection{Flexible Scheduling} \label{futur:sched}
2431
2432 While concurrency in \CFA has a strong start, development is still underway and there are missing features.
2433
2434 \paragraph{Flexible Scheduling}
2435 \label{futur:sched}
2436
3287 2437 An important part of concurrency is scheduling.
3288 2438 Different scheduling algorithms can affect performance (both in terms of average and variation).
3289 2439 However, no single scheduler is optimal for all workloads and therefore there is value in being able to change the scheduler for given programs.
3290 One solution is to offer various tweaking options to users, allowing the scheduler to be adjusted to the requirements of the workload.
3291 However, in order to be truly flexible, it would be interesting to allow users to add arbitrary data and arbitrary scheduling algorithms.
3292 For example, a web server could attach Type-of-Service information to threads and have a ``ToS aware'' scheduling algorithm tailored to this specific web server.
3293 This path of flexible schedulers will be explored for \CFA.
3294
3295 \subsection{Non-Blocking I/O} \label{futur:nbio}
3296 While most of the parallelism tools are aimed at data parallelism and control-flow parallelism, many modern workloads are not bound on computation but on IO operations, a common case being web servers and XaaS (anything as a service).
3297 These types of workloads often require significant engineering around amortizing costs of blocking IO operations.
3298 At its core, non-blocking I/O is an operating system level feature that allows queuing IO operations (\eg network operations) and registering for notifications instead of waiting for requests to complete.
3299 In this context, the role of the language makes Non-Blocking IO easily available and with low overhead.
3300 The current trend is to use asynchronous programming using tools like callbacks and/or futures and promises, which can be seen in frameworks like Node.js~\cite{NodeJs} for JavaScript, Spring MVC~\cite{SpringMVC} for Java and Django~\cite{Django} for Python.
3301 However, while these are valid solutions, they lead to code that is harder to read and maintain because it is much less linear.
3302
3303 \subsection{Other Concurrency Tools} \label{futur:tools}
3304 While monitors offer a flexible and powerful concurrent core for \CFA, other concurrency tools are also necessary for a complete multi-paradigm concurrency package.
3305 Examples of such tools can include simple locks and condition variables, futures and promises~\cite{promises}, executors and actors.
2440 One solution is to offer various tweaking options, allowing the scheduler to be adjusted to the requirements of the workload.
2441 However, to be truly flexible, it is necessary to have a pluggable scheduler.
2442 Currently, the \CFA pluggable scheduler is too simple to handle complex scheduling, \eg quality of service and real-time, where the scheduler must interact with mutex objects to deal with issues like priority inversion.
2443
2444 \paragraph{Non-Blocking I/O}
2445 \label{futur:nbio}
2446
2447 Many modern workloads are not bound by computation but IO operations, a common case being web servers and XaaS~\cite{XaaS} (anything as a service).
2448 These types of workloads require significant engineering to amortize the costs of blocking IO-operations.
2449 At its core, non-blocking I/O is an operating-system level feature queuing IO operations, \eg network operations, and registering for notifications instead of waiting for requests to complete.
2450 Current trends use asynchronous programming like callbacks, futures, and/or promises, \eg Node.js~\cite{NodeJs} for JavaScript, Spring MVC~\cite{SpringMVC} for Java, and Django~\cite{Django} for Python.
2451 However, these solutions lead to code that is hard to create, read, and maintain.
2452 A better approach is to tie non-blocking I/O into the concurrency system to provide ease of use with low overhead, \eg thread-per-connection web-services.
2453 A non-blocking I/O library is currently under development for \CFA.
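
To make the notification pattern concrete, here is a minimal sketch (portable C++/POSIX, not the proposed \CFA library; the function name and the pre-connected non-blocking socket are assumptions of the sketch) of the readiness-based I/O such a library would wrap:

#include <cstddef>
#include <poll.h>
#include <unistd.h>

// Sketch only: queue interest in readability, wait for the kernel's
// notification, then issue the read -- no kernel thread blocks per
// connection while waiting.
// 'fd' is assumed to be a connected socket already set non-blocking.
ssize_t read_when_ready( int fd, char buf[], size_t len ) {
	struct pollfd pfd = { fd, POLLIN, 0 };
	if ( poll( &pfd, 1, -1 ) < 0 ) return -1;	// await readiness notification
	if ( pfd.revents & POLLIN ) return read( fd, buf, len );
	return -1;					// hang-up or error event
}

The idea in the paragraph above is that the runtime performs this wait internally, parking the user-level thread rather than a kernel thread, which is what gives the thread-per-connection style its low overhead.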
2454
2455 \paragraph{Other Concurrency Tools}
2456 \label{futur:tools}
2457
2458 While monitors offer a flexible and powerful concurrent core for \CFA, other concurrency tools are also necessary for a complete multi-paradigm concurrency package.
2459 Examples of such tools can include futures and promises~\cite{promises}, executors and actors.
3306 2460 These additional features are useful when monitors offer a level of abstraction that is inadequate for certain tasks.
3307 2461
3308 \subsection{Implicit Threading} \label{futur:implcit}
3309 Simpler applications can benefit greatly from having implicit parallelism.
3310 That is, parallelism that does not rely on the user to write concurrency.
3311 This type of parallelism can be achieved both at the language level and at the library level.
3312 The canonical example of implicit parallelism is parallel for loops, which are the simplest example of a divide and conquer algorithms~\cite{uC++book}.
3313 Table \ref{f:parfor} shows three different code examples that accomplish point-wise sums of large arrays.
3314 Note that none of these examples explicitly declare any concurrency or parallelism objects.
3315
3316 \begin{table}
3317 \begin{center}
3318 \begin{tabular}[t]{|c|c|c|}
3319 Sequential & Library Parallel & Language Parallel \\
3320 \begin{cfa}[tabsize=3]
3321 void big_sum(
3322 int* a, int* b,
3323 int* o,
3324 size_t len)
3325 {
3326 for(
3327 int i = 0;
3328 i < len;
3329 ++i )
3330 {
3331 o[i]=a[i]+b[i];
3332 }
3333 }
3334
3335
3336
3337
3338
3339 int* a[10000];
3340 int* b[10000];
3341 int* c[10000];
3342 //... fill in a & b
3343 big_sum(a,b,c,10000);
3344 \end{cfa} &\begin{cfa}[tabsize=3]
3345 void big_sum(
3346 int* a, int* b,
3347 int* o,
3348 size_t len)
3349 {
3350 range ar(a, a+len);
3351 range br(b, b+len);
3352 range or(o, o+len);
3353 parfor( ai, bi, oi,
3354 []( int* ai,
3355 int* bi,
3356 int* oi)
3357 {
3358 oi=ai+bi;
3359 });
3360 }
3361
3362
3363 int* a[10000];
3364 int* b[10000];
3365 int* c[10000];
3366 //... fill in a & b
3367 big_sum(a,b,c,10000);
3368 \end{cfa}&\begin{cfa}[tabsize=3]
3369 void big_sum(
3370 int* a, int* b,
3371 int* o,
3372 size_t len)
3373 {
3374 parfor (ai,bi,oi)
3375 in (a, b, o )
3376 {
3377 oi = ai + bi;
3378 }
3379 }
3380
3381
3382
3383
3384
3385
3386
3387 int* a[10000];
3388 int* b[10000];
3389 int* c[10000];
3390 //... fill in a & b
3391 big_sum(a,b,c,10000);
3392 \end{cfa}
3393 \end{tabular}
3394 \end{center}
3395 \caption{For loop to sum numbers: Sequential, using library parallelism and language parallelism.}
3396 \label{f:parfor}
3397 \end{table}
3398
3399 Implicit parallelism is a restrictive solution and therefore has its limitations.
3400 However, it is a quick and simple approach to parallelism, which may very well be sufficient for smaller applications and reduces the amount of boilerplate needed to start benefiting from parallelism in modern CPUs.
3401
3402
3403 % A C K N O W L E D G E M E N T S
3404 % -------------------------------
2462 \paragraph{Implicit Threading}
2463 \label{futur:implcit}
2464
2465 Basic concurrent (embarrassingly parallel) applications can benefit greatly from implicit concurrency, where sequential programs are converted to concurrent, possibly with some help from pragmas to guide the conversion.
2466 This type of concurrency can be achieved both at the language level and at the library level.
2467 The canonical example of implicit concurrency is concurrent nested @for@ loops, which are amenable to divide and conquer algorithms~\cite{uC++book}.
2468 The \CFA language features should make it possible to develop a reasonable number of implicit concurrency mechanisms to solve basic HPC data-concurrency problems.
2469 However, implicit concurrency is a restrictive solution and has its limitations, so it can never replace explicit concurrent programming.
2470
2471
3405 2472 \section{Acknowledgements}
3406 2473
3407 Thanks to Aaron Moss, Rob Schluntz and Andrew Beach for their work on the \CFA project as well as all the discussions which helped concretize the ideas in this paper.
3408 Partial funding was supplied by the Natural Sciences and Engineering Research Council of Canada and a corporate partnership with Huawei Ltd.
3409
3410
3411 % B I B L I O G R A P H Y
3412 % -----------------------------
3413 %\bibliographystyle{plain}
2474 The authors would like to recognize the design assistance of Aaron Moss, Rob Schluntz and Andrew Beach on the features described in this paper.
2475 Funding for this project has been provided by Huawei Ltd.\ (\url{http://www.huawei.com}), and Peter Buhr is partially funded by the Natural Sciences and Engineering Research Council of Canada.
2476
2477 {%
2478 \fontsize{9bp}{12bp}\selectfont%
3414 2479 \bibliography{pl,local}
3415 2480 }%
3416 2481
3417 2482 \end{document}
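
As an aside on the implicit-threading discussion above, the library-level approach can be sketched in plain C++ with std::thread; the chunking policy and all names here are assumptions of the sketch, not the \CFA parfor design:

#include <algorithm>
#include <cstddef>
#include <thread>
#include <vector>

// Sketch of a library-level "parfor": split the index range across the
// available hardware threads, run the loop body on each chunk, and use
// join as an implicit barrier.
void big_sum( const int a[], const int b[], int o[], size_t len ) {
	unsigned nthreads = std::max( 1u, std::thread::hardware_concurrency() );
	size_t chunk = ( len + nthreads - 1 ) / nthreads;
	std::vector<std::thread> workers;
	for ( unsigned t = 0; t < nthreads; t += 1 ) {
		size_t lo = t * chunk, hi = std::min( len, lo + chunk );
		workers.emplace_back( [=]{
			for ( size_t i = lo; i < hi; i += 1 ) o[i] = a[i] + b[i];
		} );
	}
	for ( std::thread & w : workers ) w.join();	// all chunks complete here
}

-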
doc/papers/concurrency/annex/local.bib
ra12c81f3 rc653b37 46 46 title = {Thread Building Blocks}, 47 47 howpublished= {Intel, \url{https://www.threadingbuildingblocks.org}}, 48 note = {Accessed: 2018-3},48 optnote = {Accessed: 2018-3}, 49 49 } 50 50 -
doc/papers/concurrency/figures/ext_monitor.fig
ra12c81f3 rc653b37 8 8 -2 9 9 1200 2 10 5 1 0 1 -1 -1 0 0 -1 0.000 0 1 0 0 3150.000 3450.000 3150 3150 2850 3450 3150375011 5 1 0 1 -1 -1 0 0 -1 0.000 0 1 0 0 3150.000 4350.000 3150 4050 2850 4350 3150465012 6 5850 1950 6150225013 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 6000 2100 105 105 6000 2100 6105220514 4 1 -1 0 0 0 10 0.0000 2 105 90 60002160 d\00110 5 1 0 1 -1 -1 0 0 -1 0.000 0 1 0 0 1575.000 3450.000 1575 3150 1275 3450 1575 3750 11 5 1 0 1 -1 -1 0 0 -1 0.000 0 1 0 0 1575.000 4350.000 1575 4050 1275 4350 1575 4650 12 6 4275 1950 4575 2250 13 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4425 2100 105 105 4425 2100 4530 2205 14 4 1 -1 0 0 0 10 0.0000 2 105 90 4425 2160 d\001 15 15 -6 16 6 5100 2100 5400 240017 1 3 0 1 -1 -1 1 0 4 0.000 1 0.0000 5250 2250 105 105 5250 2250 5355 225018 4 1 -1 0 0 0 10 0.0000 2 105 120 5250 2295 X\00116 6 4275 1650 4575 1950 17 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4425 1800 105 105 4425 1800 4530 1905 18 4 1 -1 0 0 0 10 0.0000 2 105 90 4425 1860 b\001 19 19 -6 20 6 5100 1800 5400 2100 21 1 3 0 1 -1 -1 1 0 4 0.000 1 0.0000 5250 1950 105 105 5250 1950 5355 1950 22 4 1 -1 0 0 0 10 0.0000 2 105 120 5250 2010 Y\001 20 6 1495 5445 5700 5655 21 1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 1575 5550 80 80 1575 5550 1655 5630 22 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 2925 5550 105 105 2925 5550 3030 5655 23 1 3 0 1 -1 -1 0 0 4 0.000 1 0.0000 4425 5550 105 105 4425 5550 4530 5655 24 4 0 -1 0 0 0 12 0.0000 2 135 1035 3150 5625 blocked task\001 25 4 0 -1 0 0 0 12 0.0000 2 135 870 1725 5625 active task\001 26 4 0 -1 0 0 0 12 0.0000 2 135 1050 4650 5625 routine mask\001 23 27 -6 24 6 5850 1650 6150 1950 25 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 6000 1800 105 105 6000 1800 6105 1905 26 4 1 -1 0 0 0 10 0.0000 2 105 90 6000 1860 b\001 28 6 3525 1800 3825 2400 29 1 3 0 1 -1 -1 1 0 4 0.000 1 0.0000 3675 1950 105 105 3675 1950 3780 1950 30 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 31 3525 1800 3825 1800 3825 2400 3525 2400 3525 1800 32 4 1 4 0 0 0 10 0.0000 2 105 120 3675 2010 Y\001 27 33 -6 28 6 3070 5445 7275 5655 29 1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 3150 5550 80 80 3150 5550 3230 5630 30 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4500 5550 105 105 4500 5550 4605 5655 31 1 3 0 1 -1 -1 0 0 4 0.000 1 0.0000 6000 5550 105 105 6000 5550 6105 5655 32 4 0 -1 0 0 0 12 0.0000 2 135 1035 4725 5625 blocked task\001 33 4 0 -1 0 0 0 12 0.0000 2 135 870 3300 5625 active task\001 34 4 0 -1 0 0 0 12 0.0000 2 135 1050 6225 5625 routine mask\001 34 6 3525 2100 3825 2400 35 1 3 0 1 -1 -1 1 0 4 0.000 1 0.0000 3675 2250 105 105 3675 2250 3780 2250 36 4 1 4 0 0 0 10 0.0000 2 105 120 3675 2295 X\001 35 37 -6 36 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3300 3600 105 105 3300 3600 3405370537 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3600 3600 105 105 3600 3600 3705370538 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 6600 3900 105 105 6600 3900 6705400539 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 6900 3900 105 105 6900 3900 7005400540 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 6000 2700 105 105 6000 2700 6105280541 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 6000 2400 105 105 6000 2400 6105250542 1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 5100 4575 80 80 5100 4575 5180465538 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 1725 3600 105 105 1725 3600 1830 3705 39 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 2025 3600 105 105 2025 3600 2130 3705 40 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 5025 3900 105 105 5025 3900 5130 4005 41 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 5325 3900 105 105 5325 3900 5430 4005 42 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4425 2700 105 105 4425 2700 4530 2805 43 1 3 0 1 -1 -1 0 0 -1 0.000 
1 0.0000 4425 2400 105 105 4425 2400 4530 2505 44 1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 3525 4575 80 80 3525 4575 3605 4655 43 45 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 44 4050 2925 5475 2925 5475 3225 4050 3225 4050292546 2475 2925 3900 2925 3900 3225 2475 3225 2475 2925 45 47 2 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 0 0 4 46 3150 3750 3750 3750 3750 4050 3150405048 1575 3750 2175 3750 2175 4050 1575 4050 47 49 2 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 0 0 3 48 3150 3450 3750 3450 3900367550 1575 3450 2175 3450 2325 3675 49 51 2 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 0 0 2 50 3750 3150 3600337552 2175 3150 2025 3375 51 53 2 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 0 0 3 52 3150 4350 3750 4350 3900457554 1575 4350 2175 4350 2325 4575 53 55 2 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 0 0 2 54 3750 4050 3600427556 2175 4050 2025 4275 55 57 2 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 0 0 4 56 3150 4650 3750 4650 3750 4950 4950495058 1575 4650 2175 4650 2175 4950 3375 4950 57 59 2 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 0 0 2 58 6450 3750 6300397560 4875 3750 4725 3975 59 61 2 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 0 0 2 60 4950 4950 5175510062 3375 4950 3600 5100 61 63 2 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 0 0 9 62 5250 4950 6450 4950 6450 4050 7050 4050 7050 3750 6450375063 6450 2850 6150 2850 6150165064 3675 4950 4875 4950 4875 4050 5475 4050 5475 3750 4875 3750 65 4875 2850 4575 2850 4575 1650 64 66 2 2 1 1 -1 -1 0 0 -1 4.000 0 0 0 0 0 5 65 5850 4200 5850 3300 4350 3300 4350 4200 5850420066 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1267 4275 4200 4275 3300 2775 3300 2775 4200 4275 4200 68 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 2 67 69 1 1 1.00 60.00 120.00 68 7 1 1.00 60.00 120.00 69 5250 3150 5250 2400 70 3675 3075 3675 2400 71 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 72 4125 2850 4575 3000 70 73 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 71 3150 3150 3750 3150 3750 2850 5700 2850 5700 1650 72 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 73 5700 2850 6150 3000 74 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 75 5100 1800 5400 1800 5400 2400 5100 2400 5100 1800 76 4 1 -1 0 0 0 10 0.0000 2 75 75 6000 2745 a\001 77 4 1 -1 0 0 0 10 0.0000 2 75 75 6000 2445 c\001 78 4 1 -1 0 0 0 12 0.0000 2 135 315 5100 5325 exit\001 79 4 1 -1 0 0 0 12 0.0000 2 135 135 3300 3075 A\001 80 4 1 -1 0 0 0 12 0.0000 2 135 795 3300 4875 condition\001 81 4 1 -1 0 0 0 12 0.0000 2 135 135 3300 5100 B\001 82 4 0 -1 0 0 0 12 0.0000 2 135 420 6600 3675 stack\001 83 4 0 -1 0 0 0 12 0.0000 2 180 750 6600 3225 acceptor/\001 84 4 0 -1 0 0 0 12 0.0000 2 180 750 6600 3450 signalled\001 85 4 1 -1 0 0 0 12 0.0000 2 135 795 3300 2850 condition\001 86 4 1 -1 0 0 0 12 0.0000 2 165 420 6000 1350 entry\001 87 4 1 -1 0 0 0 12 0.0000 2 135 495 6000 1575 queue\001 88 4 0 -1 0 0 0 12 0.0000 2 135 525 6300 2400 arrival\001 89 4 0 -1 0 0 0 12 0.0000 2 135 630 6300 2175 order of\001 90 4 1 -1 0 0 0 12 0.0000 2 135 525 5100 3675 shared\001 91 4 1 -1 0 0 0 12 0.0000 2 135 735 5100 3975 variables\001 92 4 0 0 50 -1 0 11 0.0000 2 165 855 4275 3150 Acceptables\001 93 4 0 0 50 -1 0 11 0.0000 2 120 165 5775 2700 W\001 94 4 0 0 50 -1 0 11 0.0000 2 120 135 5775 2400 X\001 95 4 0 0 50 -1 0 11 0.0000 2 120 105 5775 2100 Z\001 96 4 0 0 50 -1 0 11 0.0000 2 120 135 5775 1800 Y\001 74 1575 3150 2175 3150 2175 2850 4125 2850 4125 1650 75 4 1 -1 0 0 0 10 0.0000 2 75 75 4425 2745 a\001 76 4 1 -1 0 0 0 10 0.0000 2 75 75 4425 2445 c\001 77 4 1 -1 0 0 0 12 0.0000 2 135 315 3525 5325 exit\001 78 4 1 -1 0 0 0 12 0.0000 2 135 135 1725 3075 A\001 79 4 1 -1 0 0 0 12 0.0000 2 135 795 1725 4875 condition\001 80 4 1 -1 0 0 0 12 0.0000 2 135 135 1725 5100 B\001 
81 4 0 -1 0 0 0 12 0.0000 2 135 420 5025 3675 stack\001 82 4 0 -1 0 0 0 12 0.0000 2 180 750 5025 3225 acceptor/\001 83 4 0 -1 0 0 0 12 0.0000 2 180 750 5025 3450 signalled\001 84 4 1 -1 0 0 0 12 0.0000 2 135 795 1725 2850 condition\001 85 4 1 -1 0 0 0 12 0.0000 2 165 420 4425 1350 entry\001 86 4 1 -1 0 0 0 12 0.0000 2 135 495 4425 1575 queue\001 87 4 0 -1 0 0 0 12 0.0000 2 135 525 4725 2400 arrival\001 88 4 0 -1 0 0 0 12 0.0000 2 135 630 4725 2175 order of\001 89 4 1 -1 0 0 0 12 0.0000 2 135 525 3525 3675 shared\001 90 4 1 -1 0 0 0 12 0.0000 2 135 735 3525 3975 variables\001 91 4 0 4 50 -1 0 11 0.0000 2 120 135 4150 1875 Y\001 92 4 0 4 50 -1 0 11 0.0000 2 120 105 4150 2175 Z\001 93 4 0 4 50 -1 0 11 0.0000 2 120 135 4150 2475 X\001 94 4 0 4 50 -1 0 11 0.0000 2 120 165 4150 2775 W\001 95 4 0 -1 0 0 3 12 0.0000 2 150 540 5025 4275 urgent\001 96 4 1 0 50 -1 0 11 0.0000 2 165 600 3150 3150 accepted\001 -
doc/papers/concurrency/figures/monitor.fig
ra12c81f3 rc653b37 72 72 2 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 0 0 4 73 73 3600 1500 3600 2100 4200 2100 4200 900 74 2 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 0 0 475 2700 1500 2700 2100 3300 2100 3300 150076 74 2 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 0 0 9 77 75 3600 4200 4800 4200 4800 3300 5400 3300 5400 3000 4800 3000 … … 79 77 2 2 1 1 -1 -1 0 0 -1 4.000 0 0 0 0 0 5 80 78 4200 3450 4200 2550 2700 2550 2700 3450 4200 3450 79 2 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 0 0 4 80 2700 1500 2700 2100 3300 2100 3300 1500 81 81 4 1 -1 0 0 0 10 0.0000 2 75 75 4350 1995 a\001 82 82 4 1 -1 0 0 0 10 0.0000 2 75 75 4350 1695 c\001 … … 89 89 4 0 -1 0 0 0 12 0.0000 2 180 750 4950 2700 signalled\001 90 90 4 1 -1 0 0 0 12 0.0000 2 135 795 1650 2100 condition\001 91 4 1 -10 0 0 12 0.0000 2 135 135 2550 1425 X\00192 4 1 -10 0 0 12 0.0000 2 135 135 3450 1425 Y\00191 4 1 4 0 0 0 12 0.0000 2 135 135 2550 1425 X\001 92 4 1 4 0 0 0 12 0.0000 2 135 135 3450 1425 Y\001 93 93 4 1 -1 0 0 0 12 0.0000 2 165 420 4350 600 entry\001 94 94 4 1 -1 0 0 0 12 0.0000 2 135 495 4350 825 queue\001 … … 100 100 4 1 -1 0 0 0 10 0.0000 2 75 75 3450 1995 c\001 101 101 4 1 -1 0 0 0 12 0.0000 2 135 570 3000 1200 queues\001 102 4 0 -1 0 0 3 12 0.0000 2 150 540 4950 3525 urgent\001 -
doc/papers/general/Makefile
ra12c81f3 rc653b37 59 59 dvips ${Build}/$< -o $@ 60 60 61 ${BASE}.dvi : Makefile ${B uild} ${BASE}.out.ps WileyNJD-AMA.bst ${GRAPHS} ${PROGRAMS} ${PICTURES} ${FIGURES} ${SOURCES} \62 ../../bibliography/pl.bib 61 ${BASE}.dvi : Makefile ${BASE}.out.ps WileyNJD-AMA.bst ${GRAPHS} ${PROGRAMS} ${PICTURES} ${FIGURES} ${SOURCES} \ 62 ../../bibliography/pl.bib | ${Build} 63 63 # Must have *.aux file containing citations for bibtex 64 64 if [ ! -r ${basename $@}.aux ] ; then ${LaTeX} ${basename $@}.tex ; fi … … 75 75 mkdir -p ${Build} 76 76 77 ${BASE}.out.ps : ${Build}77 ${BASE}.out.ps : | ${Build} 78 78 ln -fs ${Build}/Paper.out.ps . 79 79 … … 84 84 gnuplot -e Build="'${Build}/'" evaluation/timing.gp 85 85 86 %.tex : %.fig ${Build}86 %.tex : %.fig | ${Build} 87 87 fig2dev -L eepic $< > ${Build}/$@ 88 88 89 %.ps : %.fig ${Build}89 %.ps : %.fig | ${Build} 90 90 fig2dev -L ps $< > ${Build}/$@ 91 91 92 %.pstex : %.fig ${Build}92 %.pstex : %.fig | ${Build} 93 93 fig2dev -L pstex $< > ${Build}/$@ 94 94 fig2dev -L pstex_t -p ${Build}/$@ $< > ${Build}/$@_t -
doc/proposals/ctordtor/Makefile
ra12c81f3 rc653b37 1 ## Define the appropriateconfiguration variables.1 ## Define the configuration variables. 2 2 3 MACROS = ../../LaTeXmacros 4 BIB = ../../bibliography 3 Build = build 4 Figures = figures 5 Macros = ../../LaTeXmacros 6 Bib = ../../bibliography 5 7 6 TeXLIB = .:$ (MACROS):$(MACROS)/listings:$(MACROS)/enumitem:$(BIB)/:7 LaTeX = TEXINPUTS=${TeXLIB} && export TEXINPUTS && latex -halt-on-error 8 TeXLIB = .:${Macros}:${MACROS}/listings:${MACROS}/enumitem:${Bib}/: 9 LaTeX = TEXINPUTS=${TeXLIB} && export TEXINPUTS && latex -halt-on-error -output-directory=${Build} 8 10 BibTeX = BIBINPUTS=${TeXLIB} && export BIBINPUTS && bibtex 11 12 MAKEFLAGS = --no-print-directory # --silent 13 VPATH = ${Build} ${Figures} 9 14 10 15 ## Define the text source files. … … 29 34 30 35 DOCUMENT = ctor.pdf 36 BASE = ${basename ${DOCUMENT}} 31 37 32 38 # Directives # 39 40 .PHONY : all clean # not file names 33 41 34 42 all : ${DOCUMENT} 35 43 36 44 clean : 37 rm -f *.bbl *.aux *.dvi *.idx *.ilg *.ind *.brf *.out *.log *.toc *.blg *.pstex_t *.cf \ 38 ${FIGURES} ${PICTURES} ${PROGRAMS} ${GRAPHS} ${basename ${DOCUMENT}}.ps ${DOCUMENT} 45 @rm -frv ${DOCUMENT} ${BASE}.ps ${Build} 39 46 40 47 # File Dependencies # 41 48 42 ${DOCUMENT} : ${ basename ${DOCUMENT}}.ps49 ${DOCUMENT} : ${BASE}.ps 43 50 ps2pdf $< 44 51 45 ${ basename ${DOCUMENT}}.ps : ${basename ${DOCUMENT}}.dvi46 dvips $ < -o $@52 ${BASE}.ps : ${BASE}.dvi 53 dvips ${Build}/$< -o $@ 47 54 48 ${ basename ${DOCUMENT}}.dvi : Makefile ${GRAPHS} ${PROGRAMS} ${PICTURES} ${FIGURES} ${SOURCES} ${basename ${DOCUMENT}}.tex\49 $ (MACROS)/common.tex $(MACROS)/indexstyle $(BIB)/cfa.bib55 ${BASE}.dvi : Makefile ${GRAPHS} ${PROGRAMS} ${PICTURES} ${FIGURES} ${SOURCES} \ 56 ${Macros}/common.tex ${Macros}/indexstyle ${Bib}/pl.bib | ${Build} 50 57 # Conditionally create an empty *.ind (index) file for inclusion until makeindex is run. 51 if [ ! -r ${basename $@}.ind ] ; then touch${basename $@}.ind ; fi58 #if [ ! -r ${basename $@}.ind ] ; then touch ${Build}/${basename $@}.ind ; fi 52 59 # Must have *.aux file containing citations for bibtex 53 60 if [ ! -r ${basename $@}.aux ] ; then ${LaTeX} ${basename $@}.tex ; fi 54 -${BibTeX} ${ basename $@}55 # Some citations reference others so run stepsagain to resolve these citations61 -${BibTeX} ${Build}/${basename $@} 62 # Some citations reference others so run again to resolve these citations 56 63 ${LaTeX} ${basename $@}.tex 57 -${BibTeX} ${ basename $@}64 -${BibTeX} ${Build}/${basename $@} 58 65 # Make index from *.aux entries and input index at end of document 59 makeindex -s $(MACROS)/indexstyle ${basename $@}.idx 66 #makeindex -s ${Macros}/indexstyle ${Build}/${basename $@}.idx 67 # Run again to finish citations 60 68 ${LaTeX} ${basename $@}.tex 61 69 # Run again to get index title into table of contents … … 67 75 ## Define the default recipes. 68 76 69 %.tex : %.fig 70 fig2dev -L eepic $< > $@77 ${Build}: 78 mkdir -p ${Build} 71 79 72 %. ps : %.fig73 fig2dev -L ps $< >$@80 %.tex : %.fig | ${Build} 81 fig2dev -L eepic $< > ${Build}/$@ 74 82 75 %.pstex : %.fig 76 fig2dev -L pstex $< > $@ 77 fig2dev -L pstex_t -p $@ $< > $@_t 83 %.ps : %.fig | ${Build} 84 fig2dev -L ps $< > ${Build}/$@ 85 86 %.pstex : %.fig | ${Build} 87 fig2dev -L pstex $< > ${Build}/$@ 88 fig2dev -L pstex_t -p ${Build}/$@ $< > ${Build}/$@_t 78 89 79 90 # Local Variables: # -
doc/proposals/ctordtor/ctor.tex
ra12c81f3 rc653b37 1 % inline code ©...© (copyright symbol) emacs: C-q M-)2 % red highlighting ®...® (registered trademark symbol) emacs: C-q M-.3 % blue highlighting ß...ß (sharp s symbol) emacs: C-q M-_4 % green highlighting ¢...¢ (cent symbol) emacs: C-q M-"5 % LaTex escape §...§ (section symbol) emacs: C-q M-'6 % keyword escape ¶...¶ (pilcrow symbol) emacs: C-q M-^7 % math escape $...$ (dollar symbol)8 9 1 \documentclass[twoside,11pt]{article} 10 2 … … 15 7 \usepackage{textcomp} 16 8 \usepackage[latin1]{inputenc} 9 17 10 \usepackage{fullpage,times,comment} 18 11 \usepackage{epic,eepic} 19 \usepackage{upquote} 12 \usepackage{upquote} % switch curled `'" to straight 20 13 \usepackage{calc} 21 14 \usepackage{xspace} 22 15 \usepackage{graphicx} 23 \usepackage{varioref} 24 \usepackage{listings} 25 \usepackage[flushmargin]{footmisc} 16 \usepackage{varioref} % extended references 17 \usepackage{listings} % format program code 18 \usepackage[flushmargin]{footmisc} % support label/reference in footnote 26 19 \usepackage{latexsym} % \Box glyph 27 20 \usepackage{mathptmx} % better math font with "times" … … 34 27 \renewcommand{\UrlFont}{\small\sf} 35 28 36 \setlength{\topmargin}{-0.45in} 29 \setlength{\topmargin}{-0.45in} % move running title into header 37 30 \setlength{\headsep}{0.25in} 38 31 … … 43 36 44 37 \interfootnotelinepenalty=10000 38 39 \CFAStyle % use default CFA format-style 40 % inline code ©...© (copyright symbol) emacs: C-q M-) 41 % red highlighting ®...® (registered trademark symbol) emacs: C-q M-. 42 % blue highlighting ß...ß (sharp s symbol) emacs: C-q M-_ 43 % green highlighting ¢...¢ (cent symbol) emacs: C-q M-" 44 % LaTex escape §...§ (section symbol) emacs: C-q M-' 45 % keyword escape ¶...¶ (pilcrow symbol) emacs: C-q M-^ 46 % math escape $...$ (dollar symbol) 47 45 48 46 49 \title{ … … 83 86 \thispagestyle{plain} 84 87 \pagenumbering{arabic} 85 86 88 87 89 -
doc/proposals/tuples/Makefile
ra12c81f3 rc653b37 1 ## Define the appropriateconfiguration variables.1 ## Define the configuration variables. 2 2 3 MACROS = ../../LaTeXmacros 4 BIB = ../../bibliography 3 Build = build 4 Figures = figures 5 Macros = ../../LaTeXmacros 6 Bib = ../../bibliography 5 7 6 TeXLIB = .:$ (MACROS):$(MACROS)/listings:$(MACROS)/enumitem:$(BIB)/:7 LaTeX = TEXINPUTS=${TeXLIB} && export TEXINPUTS && latex -halt-on-error 8 TeXLIB = .:${Macros}:${MACROS}/listings:${MACROS}/enumitem:${Bib}/: 9 LaTeX = TEXINPUTS=${TeXLIB} && export TEXINPUTS && latex -halt-on-error -output-directory=${Build} 8 10 BibTeX = BIBINPUTS=${TeXLIB} && export BIBINPUTS && bibtex 11 12 MAKEFLAGS = --no-print-directory --silent # 13 VPATH = ${Build} ${Figures} 9 14 10 15 ## Define the text source files. … … 29 34 30 35 DOCUMENT = tuples.pdf 36 BASE = ${basename ${DOCUMENT}} 31 37 32 38 # Directives # 39 40 .PHONY : all clean # not file names 33 41 34 42 all : ${DOCUMENT} 35 43 36 44 clean : 37 rm -f *.bbl *.aux *.dvi *.idx *.ilg *.ind *.brf *.out *.log *.toc *.blg *.pstex_t *.cf \ 38 ${FIGURES} ${PICTURES} ${PROGRAMS} ${GRAPHS} ${basename ${DOCUMENT}}.ps ${DOCUMENT} 45 @rm -frv ${DOCUMENT} ${BASE}.ps ${Build} 39 46 40 47 # File Dependencies # 41 48 42 ${DOCUMENT} : ${ basename ${DOCUMENT}}.ps49 ${DOCUMENT} : ${BASE}.ps 43 50 ps2pdf $< 44 51 45 ${ basename ${DOCUMENT}}.ps : ${basename ${DOCUMENT}}.dvi46 dvips $ < -o $@52 ${BASE}.ps : ${BASE}.dvi 53 dvips ${Build}/$< -o $@ 47 54 48 ${ basename ${DOCUMENT}}.dvi : Makefile ${GRAPHS} ${PROGRAMS} ${PICTURES} ${FIGURES} ${SOURCES} ${basename ${DOCUMENT}}.tex\49 $ (MACROS)/common.tex $(MACROS)/indexstyle $(BIB)/cfa.bib55 ${BASE}.dvi : Makefile ${GRAPHS} ${PROGRAMS} ${PICTURES} ${FIGURES} ${SOURCES} \ 56 ${Macros}/common.tex ${Macros}/indexstyle ${Bib}/pl.bib | ${Build} 50 57 # Conditionally create an empty *.ind (index) file for inclusion until makeindex is run. 51 if [ ! -r ${basename $@}.ind ] ; then touch${basename $@}.ind ; fi58 #if [ ! -r ${basename $@}.ind ] ; then touch ${Build}/${basename $@}.ind ; fi 52 59 # Must have *.aux file containing citations for bibtex 53 60 if [ ! -r ${basename $@}.aux ] ; then ${LaTeX} ${basename $@}.tex ; fi 54 -${BibTeX} ${ basename $@}55 # Some citations reference others so run stepsagain to resolve these citations61 -${BibTeX} ${Build}/${basename $@} 62 # Some citations reference others so run again to resolve these citations 56 63 ${LaTeX} ${basename $@}.tex 57 -${BibTeX} ${ basename $@}64 -${BibTeX} ${Build}/${basename $@} 58 65 # Make index from *.aux entries and input index at end of document 59 makeindex -s $(MACROS)/indexstyle ${basename $@}.idx 66 #makeindex -s ${Macros}/indexstyle ${Build}/${basename $@}.idx 67 # Run again to finish citations 60 68 ${LaTeX} ${basename $@}.tex 61 69 # Run again to get index title into table of contents … … 67 75 ## Define the default recipes. 68 76 69 %.tex : %.fig 70 fig2dev -L eepic $< > $@77 ${Build}: 78 mkdir -p ${Build} 71 79 72 %. ps : %.fig73 fig2dev -L ps $< >$@80 %.tex : %.fig | ${Build} 81 fig2dev -L eepic $< > ${Build}/$@ 74 82 75 %.pstex : %.fig 76 fig2dev -L pstex $< > $@ 77 fig2dev -L pstex_t -p $@ $< > $@_t 83 %.ps : %.fig | ${Build} 84 fig2dev -L ps $< > ${Build}/$@ 85 86 %.pstex : %.fig | ${Build} 87 fig2dev -L pstex $< > ${Build}/$@ 88 fig2dev -L pstex_t -p ${Build}/$@ $< > ${Build}/$@_t 78 89 79 90 # Local Variables: # -
doc/proposals/tuples/tuples.tex
ra12c81f3 rc653b37 1 % inline code ©...© (copyright symbol) emacs: C-q M-)2 % red highlighting ®...® (registered trademark symbol) emacs: C-q M-.3 % blue highlighting ß...ß (sharp s symbol) emacs: C-q M-_4 % green highlighting ¢...¢ (cent symbol) emacs: C-q M-"5 % LaTex escape §...§ (section symbol) emacs: C-q M-'6 % keyword escape ¶...¶ (pilcrow symbol) emacs: C-q M-^7 % math escape $...$ (dollar symbol)8 9 1 \documentclass[twoside,11pt]{article} 10 2 … … 15 7 \usepackage{textcomp} 16 8 \usepackage[latin1]{inputenc} 9 17 10 \usepackage{fullpage,times,comment} 18 11 \usepackage{epic,eepic} 19 \usepackage{upquote} 12 \usepackage{upquote} % switch curled `'" to straight 20 13 \usepackage{calc} 21 14 \usepackage{xspace} 22 15 \usepackage{graphicx} 23 \usepackage{varioref} 24 \usepackage{listings} 25 \usepackage[flushmargin]{footmisc} 16 \usepackage{varioref} % extended references 17 \usepackage{listings} % format program code 18 \usepackage[flushmargin]{footmisc} % support label/reference in footnote 26 19 \usepackage{latexsym} % \Box glyph 27 20 \usepackage{mathptmx} % better math font with "times" … … 34 27 \renewcommand{\UrlFont}{\small\sf} 35 28 36 \setlength{\topmargin}{-0.45in} 29 \setlength{\topmargin}{-0.45in} % move running title into header 37 30 \setlength{\headsep}{0.25in} 38 31 … … 42 35 43 36 \interfootnotelinepenalty=10000 37 38 \CFAStyle % use default CFA format-style 39 % inline code ©...© (copyright symbol) emacs: C-q M-) 40 % red highlighting ®...® (registered trademark symbol) emacs: C-q M-. 41 % blue highlighting ß...ß (sharp s symbol) emacs: C-q M-_ 42 % green highlighting ¢...¢ (cent symbol) emacs: C-q M-" 43 % LaTex escape §...§ (section symbol) emacs: C-q M-' 44 % keyword escape ¶...¶ (pilcrow symbol) emacs: C-q M-^ 45 % math escape $...$ (dollar symbol) 46 44 47 45 48 \title{ -
doc/refrat/Makefile
ra12c81f3 rc653b37 53 53 dvips ${Build}/$< -o $@ 54 54 55 ${BASE}.dvi : Makefile ${ Build} ${GRAPHS} ${PROGRAMS} ${PICTURES} ${FIGURES} ${SOURCES} \56 ${Macros}/common.tex ${Macros}/lstlang.sty ${Macros}/indexstyle ../bibliography/pl.bib 55 ${BASE}.dvi : Makefile ${GRAPHS} ${PROGRAMS} ${PICTURES} ${FIGURES} ${SOURCES} \ 56 ${Macros}/common.tex ${Macros}/lstlang.sty ${Macros}/indexstyle ../bibliography/pl.bib | ${Build} 57 57 # Conditionally create an empty *.ind (index) file for inclusion until makeindex is run. 58 58 if [ ! -r ${basename $@}.ind ] ; then touch ${Build}/${basename $@}.ind ; fi … … 78 78 mkdir -p ${Build} 79 79 80 %.tex : %.fig ${Build}80 %.tex : %.fig | ${Build} 81 81 fig2dev -L eepic $< > ${Build}/$@ 82 82 83 %.ps : %.fig ${Build}83 %.ps : %.fig | ${Build} 84 84 fig2dev -L ps $< > ${Build}/$@ 85 85 86 %.pstex : %.fig ${Build}86 %.pstex : %.fig | ${Build} 87 87 fig2dev -L pstex $< > ${Build}/$@ 88 88 fig2dev -L pstex_t -p ${Build}/$@ $< > ${Build}/$@_t -
doc/theses/aaron_moss/comp_II/Makefile
ra12c81f3 rc653b37 32 32 33 33 DOCUMENT = comp_II.pdf 34 BASE = ${basename ${DOCUMENT}} 34 35 35 36 # Directives # … … 40 41 41 42 clean : 42 @rm -frv ${DOCUMENT} ${ basename ${DOCUMENT}}.ps ${Build}43 @rm -frv ${DOCUMENT} ${BASE}.ps ${Build} 43 44 44 45 # File Dependencies # 45 46 46 ${DOCUMENT} : ${ basename ${DOCUMENT}}.ps47 ${DOCUMENT} : ${BASE}.ps 47 48 ps2pdf $< 48 49 49 ${ basename ${DOCUMENT}}.ps : ${basename ${DOCUMENT}}.dvi50 ${BASE}.ps : ${BASE}.dvi 50 51 dvips ${Build}/$< -o $@ 51 52 52 ${ basename ${DOCUMENT}}.dvi : Makefile ${Build}${GRAPHS} ${PROGRAMS} ${PICTURES} ${FIGURES} ${SOURCES} \53 ${Macros}/common.tex ${Macros}/indexstyle ../../../bibliography/pl.bib 53 ${BASE}.dvi : Makefile ${GRAPHS} ${PROGRAMS} ${PICTURES} ${FIGURES} ${SOURCES} \ 54 ${Macros}/common.tex ${Macros}/indexstyle ../../../bibliography/pl.bib | ${Build} 54 55 # Must have *.aux file containing citations for bibtex 55 56 if [ ! -r ${basename $@}.aux ] ; then ${LaTeX} ${basename $@}.tex ; fi … … 66 67 mkdir -p ${Build} 67 68 68 %.tex : %.fig 69 %.tex : %.fig ${Build} 69 70 fig2dev -L eepic $< > ${Build}/$@ 70 71 71 %.ps : %.fig 72 %.ps : %.fig | ${Build} 72 73 fig2dev -L ps $< > ${Build}/$@ 73 74 74 %.pstex : %.fig 75 %.pstex : %.fig | ${Build} 75 76 fig2dev -L pstex $< > ${Build}/$@ 76 77 fig2dev -L pstex_t -p ${Build}/$@ $< > ${Build}/$@_t -
doc/theses/thierry_delisle/Makefile
ra12c81f3 rc653b37 51 51 52 52 DOCUMENT = thesis.pdf 53 BASE = ${basename ${DOCUMENT}} 53 54 54 55 # Directives # … … 59 60 60 61 clean : 61 @rm -frv ${DOCUMENT} ${ basename ${DOCUMENT}}.ps ${Build}62 @rm -frv ${DOCUMENT} ${BASE}.ps ${Build} 62 63 63 64 # File Dependencies # 64 65 65 ${DOCUMENT} : ${ basename ${DOCUMENT}}.ps66 ${DOCUMENT} : ${BASE}.ps 66 67 ps2pdf $< 67 68 68 ${ basename ${DOCUMENT}}.ps : ${basename ${DOCUMENT}}.dvi69 ${BASE}.ps : ${BASE}.dvi 69 70 dvips ${Build}/$< -o $@ 70 71 71 ${ basename ${DOCUMENT}}.dvi : Makefile ${Build}${GRAPHS} ${PROGRAMS} ${PICTURES} ${FIGURES} ${SOURCES} \72 ${Macros}/common.tex ${Macros}/indexstyle annex/local.bib ../../bibliography/pl.bib 72 ${BASE}.dvi : Makefile ${GRAPHS} ${PROGRAMS} ${PICTURES} ${FIGURES} ${SOURCES} \ 73 ${Macros}/common.tex ${Macros}/indexstyle annex/local.bib ../../bibliography/pl.bib | ${Build} 73 74 # Must have *.aux file containing citations for bibtex 74 75 if [ ! -r ${basename $@}.aux ] ; then ${LaTeX} ${basename $@}.tex ; fi … … 91 92 fig2dev -L eepic $< > ${Build}/$@ 92 93 93 %.ps : %.fig ${Build}94 %.ps : %.fig | ${Build} 94 95 fig2dev -L ps $< > ${Build}/$@ 95 96 96 %.pstex : %.fig ${Build}97 %.pstex : %.fig | ${Build} 97 98 fig2dev -L pstex $< > ${Build}/$@ 98 99 fig2dev -L pstex_t -p ${Build}/$@ $< > ${Build}/$@_t -
doc/user/Makefile
ra12c81f3 rc653b37 57 57 dvips ${Build}/$< -o $@ 58 58 59 ${BASE}.dvi : Makefile ${ Build} ${GRAPHS} ${PROGRAMS} ${PICTURES} ${FIGURES} ${SOURCES} \60 ${Macros}/common.tex ${Macros}/lstlang.sty ${Macros}/indexstyle ../bibliography/pl.bib 59 ${BASE}.dvi : Makefile ${GRAPHS} ${PROGRAMS} ${PICTURES} ${FIGURES} ${SOURCES} \ 60 ${Macros}/common.tex ${Macros}/lstlang.sty ${Macros}/indexstyle ../bibliography/pl.bib | ${Build} 61 61 # Conditionally create an empty *.ind (index) file for inclusion until makeindex is run. 62 62 if [ ! -r ${basename $@}.ind ] ; then touch ${Build}/${basename $@}.ind ; fi … … 79 79 mkdir -p ${Build} 80 80 81 %.tex : %.fig ${Build}81 %.tex : %.fig | ${Build} 82 82 fig2dev -L eepic $< > ${Build}/$@ 83 83 84 %.ps : %.fig ${Build}84 %.ps : %.fig | ${Build} 85 85 fig2dev -L ps $< > ${Build}/$@ 86 86 87 %.pstex : %.fig ${Build}87 %.pstex : %.fig | ${Build} 88 88 fig2dev -L pstex $< > ${Build}/$@ 89 89 fig2dev -L pstex_t -p ${Build}/$@ $< > ${Build}/$@_t -
src/Parser/TypedefTable.cc
ra12c81f3 rc653b37 10 10 // Created On : Sat May 16 15:20:13 2015 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Thu Jun 7 13:17:56201813 // Update Count : 19212 // Last Modified On : Fri Jun 22 06:14:39 2018 13 // Update Count : 206 14 14 // 15 15 … … 78 78 debugPrint( cerr << "Adding current at " << locn << " " << identifier << " as " << kindName( kind ) << " scope " << scope << endl ); 79 79 auto ret = kindTable.insertAt( scope, identifier, kind ); 80 if ( ! ret.second ) ret.first->second = kind; // exists => update 80 //if ( ! ret.second ) ret.first->second = kind; // exists => update 81 assert( ret.first->second == kind ); // exists 81 82 } // TypedefTable::addToScope 82 83 83 84 void TypedefTable::addToEnclosingScope( const string & identifier, int kind, const char * locn __attribute__((unused)) ) { 84 assert( kindTable.currentScope() >= 1 );85 auto scope = kindTable.currentScope() - 1 ;86 debugPrint( cerr << "Adding enclosing at " << locn << " " << identifier << " as " << kindName( kind ) << " scope " << scope << endl );85 assert( kindTable.currentScope() >= 1 + level ); 86 auto scope = kindTable.currentScope() - 1 - level; 87 debugPrint( cerr << "Adding enclosing at " << locn << " " << identifier << " as " << kindName( kind ) << " scope " << scope << " level " << level << endl ); 87 88 auto ret = kindTable.insertAt( scope, identifier, kind ); 88 89 if ( ! ret.second ) ret.first->second = kind; // exists => update … … 91 92 void TypedefTable::enterScope() { 92 93 kindTable.beginScope(); 93 debugPrint( cerr << "Entering scope " << kindTable.currentScope() << endl ); 94 debugPrint( print() ); 94 debugPrint( cerr << "Entering scope " << kindTable.currentScope() << endl; print() ); 95 95 } // TypedefTable::enterScope 96 96 97 97 void TypedefTable::leaveScope() { 98 debugPrint( cerr << "Leaving scope " << kindTable.currentScope() << endl ); 99 debugPrint( print() ); 98 debugPrint( cerr << "Leaving scope " << kindTable.currentScope() << endl; print() ); 100 99 kindTable.endScope(); 101 100 } // TypedefTable::leaveScope … … 114 113 --scope; 115 114 debugPrint( cerr << endl << "[" << scope << "]" ); 116 } 115 } // while 117 116 debugPrint( cerr << endl ); 118 } 117 } // TypedefTable::print 119 118 120 119 // Local Variables: // -
src/Parser/TypedefTable.h
ra12c81f3 rc653b37 10 10 // Created On : Sat May 16 15:24:36 2015 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Thu Jun 7 12:10:17201813 // Update Count : 8 512 // Last Modified On : Fri Jun 22 05:29:58 2018 13 // Update Count : 86 14 14 // 15 15 … … 25 25 typedef ScopedMap< std::string, int > KindTable; 26 26 KindTable kindTable; 27 unsigned int level = 0; 27 28 public: 28 29 ~TypedefTable(); … … 37 38 void leaveScope(); 38 39 40 void up() { level += 1; } 41 void down() { level -= 1; } 42 39 43 void print( void ) const; 40 44 }; // TypedefTable -
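As an illustration of the new up()/down() pair added above, a toy C++ model (not the real ScopedMap-backed TypedefTable) shows how the level counter makes addToEnclosingScope skip the scopes opened for an extern "X" or distribution block, so names declared there land in the surrounding lexical scope:

#include <map>
#include <string>
#include <vector>

// Toy model: one map per scope, plus a bias that skips scopes which are
// transparent for enclosing-scope insertions.
struct LeveledTable {
	std::vector<std::map<std::string, int>> scopes;
	unsigned level = 0;
	LeveledTable() { scopes.emplace_back(); }	// scope 0 = file scope
	void enterScope() { scopes.emplace_back(); }
	void leaveScope() { scopes.pop_back(); }
	void up() { level += 1; }	// entering an extern "X"/distribution block
	void down() { level -= 1; }	// leaving it
	void addToEnclosingScope( const std::string & id, int kind ) {
		// parent of the current scope, skipping 'level' transparent blocks;
		// assumes enough scopes are open, as the parser guarantees
		scopes[ scopes.size() - 2 - level ][ id ] = kind;
	}
};

This mirrors the currentScope() - 1 - level computation in the changed TypedefTable::addToEnclosingScope.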
src/Parser/lex.ll
ra12c81f3 rc653b37 10 10 * Created On : Sat Sep 22 08:58:10 2001 11 11 * Last Modified By : Peter A. Buhr 12 * Last Modified On : Thu Jun 7 08:27:40201813 * Update Count : 6 7912 * Last Modified On : Wed Jun 20 09:08:28 2018 13 * Update Count : 682 14 14 */ 15 15 … … 25 25 //**************************** Includes and Defines **************************** 26 26 27 // trigger before each matching rule's action 28 #define YY_USER_ACTION \ 29 yylloc.first_line = yylineno; \ 30 yylloc.first_column = column; \ 31 column += yyleng; \ 32 yylloc.last_column = column; \ 33 yylloc.last_line = yylineno; \ 34 yylloc.filename = yyfilename ? yyfilename : ""; 27 35 unsigned int column = 0; // position of the end of the last token parsed 28 #define YY_USER_ACTION yylloc.first_line = yylineno; yylloc.first_column = column; column += yyleng; yylloc.last_column = column; yylloc.last_line = yylineno; yylloc.filename = yyfilename ? yyfilename : ""; // trigger before each matching rule's action29 36 30 37 #include <string> … … 49 56 #define NUMERIC_RETURN(x) rm_underscore(); RETURN_VAL( x ) // numeric constant 50 57 #define KEYWORD_RETURN(x) RETURN_CHAR( x ) // keyword 51 #define QKEYWORD_RETURN(x) typedefTable.isKind( yytext ); RETURN_VAL(x);// quasi-keyword58 #define QKEYWORD_RETURN(x) RETURN_VAL(x); // quasi-keyword 52 59 #define IDENTIFIER_RETURN() RETURN_VAL( typedefTable.isKind( yytext ) ) 53 60 #define ATTRIBUTE_RETURN() RETURN_VAL( ATTR_IDENTIFIER ) -
src/Parser/parser.yy
ra12c81f3 rc653b37 10 10 // Created On : Sat Sep 1 20:22:55 2001 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Thu Jun 7 10:07:12201813 // Update Count : 35 2712 // Last Modified On : Sun Jun 24 10:41:10 2018 13 // Update Count : 3587 14 14 // 15 15 … … 136 136 } // build_postfix_name 137 137 138 bool forall = false, xxx = false ;// aggregate have one or more forall qualifiers ?138 bool forall = false, xxx = false, yyy = false; // aggregate have one or more forall qualifiers ? 139 139 140 140 // https://www.gnu.org/software/bison/manual/bison.html#Location-Type … … 304 304 %type<en> enumerator_value_opt 305 305 306 %type<decl> exception_declaration external_definition external_definition_list external_definition_list_no_pop_push external_definition_list_opt 306 %type<decl> external_definition external_definition_list external_definition_list_opt 307 308 %type<decl> exception_declaration 307 309 308 310 %type<decl> field_declaration field_declaration_list_opt field_declarator_opt field_declaring_list … … 503 505 { $$ = new ExpressionNode( build_func( new ExpressionNode( build_postfix_name( $5 ) ), $2 ) ); } 504 506 | type_name '.' no_attr_identifier // CFA, nested type 505 // { SemanticError( yylloc, "Qualified name is currently unimplemented." ); $$ = nullptr; } 506 { $$ = nullptr; } 507 { SemanticError( yylloc, "Qualified name is currently unimplemented." ); $$ = nullptr; } 507 508 | type_name '.' '[' field_list ']' // CFA, nested type / tuple field selector 508 // { SemanticError( yylloc, "Qualified name is currently unimplemented." ); $$ = nullptr; } 509 { $$ = nullptr; } 509 { SemanticError( yylloc, "Qualified name is currently unimplemented." ); $$ = nullptr; } 510 510 | GENERIC '(' assignment_expression ',' generic_assoc_list ')' // C11 511 511 { … … 1821 1821 ; 1822 1822 1823 fred: 1824 // empty 1825 { yyy = false; } 1826 ; 1827 1823 1828 aggregate_type: // struct, union 1824 1829 aggregate_key attribute_list_opt '{' field_declaration_list_opt '}' 1825 1830 { $$ = DeclarationNode::newAggregate( $1, new string( DeclarationNode::anonymous.newName() ), nullptr, $4, true )->addQualifiers( $2 ); } 1826 | aggregate_key attribute_list_opt no_attr_identifier 1831 | aggregate_key attribute_list_opt no_attr_identifier fred 1827 1832 { 1828 1833 typedefTable.makeTypedef( *$3, forall ? TYPEGENname : TYPEDEFname ); // create typedef 1829 //if ( forall ) typedefTable.changeKind( *$3, TYPEGENname ); // possibly update1830 1834 forall = false; // reset 1831 1835 } 1832 1836 '{' field_declaration_list_opt '}' 1833 { $$ = DeclarationNode::newAggregate( $1, $3, nullptr, $ 6, true )->addQualifiers( $2 ); }1834 | aggregate_key attribute_list_opt type_name 1837 { $$ = DeclarationNode::newAggregate( $1, $3, nullptr, $7, true )->addQualifiers( $2 ); } 1838 | aggregate_key attribute_list_opt type_name fred 1835 1839 { 1836 1840 typedefTable.makeTypedef( *$3->type->symbolic.name, forall ? 
TYPEGENname : TYPEDEFname ); // create typedef 1837 //if ( forall ) typedefTable.changeKind( *$3->type->symbolic.name, TYPEGENname ); // possibly update1838 1841 forall = false; // reset 1839 1842 } 1840 1843 '{' field_declaration_list_opt '}' 1841 { $$ = DeclarationNode::newAggregate( $1, $3->type->symbolic.name, nullptr, $ 6, true )->addQualifiers( $2 ); }1844 { $$ = DeclarationNode::newAggregate( $1, $3->type->symbolic.name, nullptr, $7, true )->addQualifiers( $2 ); } 1842 1845 | aggregate_key attribute_list_opt '(' type_list ')' '{' field_declaration_list_opt '}' // CFA 1843 1846 { $$ = DeclarationNode::newAggregate( $1, new string( DeclarationNode::anonymous.newName() ), $4, $7, false )->addQualifiers( $2 ); } … … 1846 1849 1847 1850 aggregate_type_nobody: // struct, union - {...} 1848 aggregate_key attribute_list_opt no_attr_identifier 1851 aggregate_key attribute_list_opt no_attr_identifier fred 1849 1852 { 1850 1853 typedefTable.makeTypedef( *$3, forall ? TYPEGENname : TYPEDEFname ); 1851 //if ( forall ) typedefTable.changeKind( *$3, TYPEGENname ); // possibly update1852 1854 forall = false; // reset 1853 1855 $$ = DeclarationNode::newAggregate( $1, $3, nullptr, nullptr, false )->addQualifiers( $2 ); 1854 1856 } 1855 | aggregate_key attribute_list_opt type_name 1857 | aggregate_key attribute_list_opt type_name fred 1856 1858 { 1857 1859 // Create new generic declaration with same name as previous forward declaration, where the IDENTIFIER is … … 1867 1869 aggregate_key: 1868 1870 STRUCT 1869 { $$ = DeclarationNode::Struct; }1871 { yyy = true; $$ = DeclarationNode::Struct; } 1870 1872 | UNION 1871 { $$ = DeclarationNode::Union; }1873 { yyy = true; $$ = DeclarationNode::Union; } 1872 1874 | EXCEPTION 1873 { $$ = DeclarationNode::Exception; }1875 { yyy = true; $$ = DeclarationNode::Exception; } 1874 1876 | COROUTINE 1875 { $$ = DeclarationNode::Coroutine; }1877 { yyy = true; $$ = DeclarationNode::Coroutine; } 1876 1878 | MONITOR 1877 { $$ = DeclarationNode::Monitor; }1879 { yyy = true; $$ = DeclarationNode::Monitor; } 1878 1880 | THREAD 1879 { $$ = DeclarationNode::Thread; }1881 { yyy = true; $$ = DeclarationNode::Thread; } 1880 1882 ; 1881 1883 … … 2320 2322 2321 2323 translation_unit: 2322 // empty 2323 {} // empty input file 2324 // empty, input file 2324 2325 | external_definition_list 2325 2326 { parseTree = parseTree ? parseTree->appendList( $1 ) : $1; } … … 2335 2336 ; 2336 2337 2337 // SKULLDUGGERY: Declarations in extern "X" and distribution need to be added to the current lexical scope.2338 // However, external_definition_list creates a new scope around each external_definition, but the pop loses all the2339 // types in the extern "X" and distribution at the end of the block. This version of external_definition_list does2340 2341 // not do push/pop for declarations at the level of the extern "X" and distribution block. Any recursive uses of2342 // external_definition_list within the extern "X" and distribution block correctly pushes/pops for that scope level.2343 external_definition_list_no_pop_push:2344 external_definition2345 | external_definition_list_no_pop_push2346 { forall = xxx; }2347 external_definition2348 { $$ = $1 ? 
$1->appendList( $3 ) : $3; }2349 ;2350 2351 2338 external_definition_list_opt: 2352 2339 // empty 2353 2340 { $$ = nullptr; } 2354 | external_definition_list_no_pop_push 2341 | external_definition_list 2342 ; 2343 2344 up: 2345 { typedefTable.up(); } 2346 ; 2347 2348 down: 2349 { typedefTable.down(); } 2355 2350 ; 2356 2351 … … 2372 2367 linkage = LinkageSpec::linkageUpdate( yylloc, linkage, $2 ); 2373 2368 } 2374 '{' external_definition_list_opt'}'2369 '{' up external_definition_list_opt down '}' 2375 2370 { 2376 2371 linkage = linkageStack.top(); 2377 2372 linkageStack.pop(); 2378 $$ = $ 5;2373 $$ = $6; 2379 2374 } 2380 2375 | type_qualifier_list 2381 { if ( $1->type->forall ) xxx = forall = true; } // remember generic type 2382 '{' external_definition_list_opt '}' // CFA, namespace 2383 { 2384 for ( DeclarationNode * iter = $4; iter != nullptr; iter = (DeclarationNode *)iter->get_next() ) { 2376 { 2377 if ( $1->type->qualifiers.val ) { SemanticError( yylloc, "CV qualifiers cannot be distributed; only storage-class and forall qualifiers." ); } 2378 if ( $1->type->forall ) xxx = forall = true; // remember generic type 2379 } 2380 '{' up external_definition_list_opt down '}' // CFA, namespace 2381 { 2382 for ( DeclarationNode * iter = $5; iter != nullptr; iter = (DeclarationNode *)iter->get_next() ) { 2385 2383 if ( isMangled( iter->linkage ) ) { // ignore extern "C" 2386 2384 iter->addQualifiers( $1->clone() ); … … 2389 2387 xxx = false; 2390 2388 delete $1; 2391 $$ = $ 4;2389 $$ = $5; 2392 2390 } 2393 2391 | declaration_qualifier_list 2394 { if ( $1->type->forall ) xxx = forall = true; } // remember generic type 2395 '{' external_definition_list_opt '}' // CFA, namespace 2396 { 2397 for ( DeclarationNode * iter = $4; iter != nullptr; iter = (DeclarationNode *)iter->get_next() ) { 2392 { 2393 if ( $1->type->qualifiers.val ) { SemanticError( yylloc, "CV qualifiers cannot be distributed; only storage-class and forall qualifiers." ); } 2394 if ( $1->type->forall ) xxx = forall = true; // remember generic type 2395 } 2396 '{' up external_definition_list_opt down '}' // CFA, namespace 2397 { 2398 for ( DeclarationNode * iter = $5; iter != nullptr; iter = (DeclarationNode *)iter->get_next() ) { 2398 2399 if ( isMangled( iter->linkage ) ) { // ignore extern "C" 2399 2400 iter->addQualifiers( $1->clone() ); … … 2402 2403 xxx = false; 2403 2404 delete $1; 2404 $$ = $ 4;2405 $$ = $5; 2405 2406 } 2406 2407 | declaration_qualifier_list type_qualifier_list 2407 2408 { 2408 // forall must be in the type_qualifier_list2409 if ( $2->type->forall ) xxx = forall = true; // remember generic type2410 } 2411 '{' external_definition_list_opt '}'// CFA, namespace2412 { 2413 for ( DeclarationNode * iter = $ 5; iter != nullptr; iter = (DeclarationNode *)iter->get_next() ) {2409 if ( ($1->type && $1->type->qualifiers.val) || $2->type->qualifiers.val ) { SemanticError( yylloc, "CV qualifiers cannot be distributed; only storage-class and forall qualifiers." ); } 2410 if ( ($1->type && $1->type->forall) || $2->type->forall ) xxx = forall = true; // remember generic type 2411 } 2412 '{' up external_definition_list_opt down '}' // CFA, namespace 2413 { 2414 for ( DeclarationNode * iter = $6; iter != nullptr; iter = (DeclarationNode *)iter->get_next() ) { 2414 2415 if ( isMangled( iter->linkage ) && isMangled( $2->linkage ) ) { // ignore extern "C" 2415 2416 iter->addQualifiers( $1->clone() ); … … 2420 2421 delete $1; 2421 2422 delete $2; 2422 $$ = $ 5;2423 $$ = $6; 2423 2424 } 2424 2425 ; -
src/ResolvExpr/AlternativeFinder.cc
ra12c81f3 rc653b37 1099 1099 argExpansions.emplace_back(); 1100 1100 auto& argE = argExpansions.back(); 1101 argE.reserve( arg.alternatives.size() );1101 // argE.reserve( arg.alternatives.size() ); 1102 1102 1103 1103 for ( const Alternative& actual : arg ) { -
src/ResolvExpr/CommonType.cc
ra12c81f3 rc653b37 24 24 #include "SynTree/Type.h" // for BasicType, BasicType::Kind::... 25 25 #include "SynTree/Visitor.h" // for Visitor 26 #include "Unify.h" // for unifyExact, bindVar,WidenMode26 #include "Unify.h" // for unifyExact, WidenMode 27 27 #include "typeops.h" // for isFtype 28 28 … … 238 238 AssertionSet need, have; 239 239 WidenMode widen( widenFirst, widenSecond ); 240 if ( entry != openVars.end() && ! bindVar(var, voidPointer->get_base(), entry->second, env, need, have, openVars, widen, indexer ) ) return;240 if ( entry != openVars.end() && ! env.bindVar(var, voidPointer->get_base(), entry->second, need, have, openVars, widen, indexer ) ) return; 241 241 } 242 242 } -
src/ResolvExpr/ExplodedActual.h
ra12c81f3 rc653b37 32 32 33 33 ExplodedActual() : env(), cost(Cost::zero), exprs() {} 34 35 34 ExplodedActual( const Alternative& actual, const SymTab::Indexer& indexer ); 35 ExplodedActual(ExplodedActual&&) = default; 36 ExplodedActual& operator= (ExplodedActual&&) = default; 36 37 }; 37 38 } -
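A note on the defaulted move operations added above: ExplodedActual owns an environment and a vector of expressions, and the containers built in AlternativeFinder regrow by moving such elements instead of deep-copying them. A toy C++ analogue of the effect (names are illustrative):

#include <utility>
#include <vector>

// A move-enabled aggregate lets push_back and vector regrowth transfer
// buffers in O(1) instead of copying element-wise.
struct Exploded {
	std::vector<int> exprs;	// stands in for the expression list
	Exploded() = default;
	Exploded( Exploded && ) = default;
	Exploded & operator=( Exploded && ) = default;
};

int main() {
	std::vector<Exploded> all;
	Exploded e;
	e.exprs.assign( 100000, 0 );
	all.push_back( std::move( e ) );	// steals the buffer, no copy
}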
src/ResolvExpr/Resolver.cc
ra12c81f3 rc653b37 582 582 583 583 // Make sure we don't widen any existing bindings 584 for ( auto & i : resultEnv ) { 585 i.allowWidening = false; 586 } 587 584 resultEnv.forbidWidening(); 585 588 586 // Find any unbound type variables 589 587 resultEnv.extractOpenVars( openVars ); -
src/ResolvExpr/TypeEnvironment.cc
ra12c81f3 rc653b37 9 9 // Author : Richard C. Bilson 10 10 // Created On : Sun May 17 12:19:47 2015 11 // Last Modified By : Peter A. Buhr12 // Last Modified On : Sun May 17 12:23:36 201513 // Update Count : 311 // Last Modified By : Aaron B. Moss 12 // Last Modified On : Mon Jun 18 11:58:00 2018 13 // Update Count : 4 14 14 // 15 15 … … 17 17 #include <algorithm> // for copy, set_intersection 18 18 #include <iterator> // for ostream_iterator, insert_iterator 19 #include <memory> // for unique_ptr 19 20 #include <utility> // for pair, move 20 21 … … 22 23 #include "SynTree/Type.h" // for Type, FunctionType, Type::Fora... 23 24 #include "SynTree/TypeSubstitution.h" // for TypeSubstitution 25 #include "Tuples/Tuples.h" // for isTtype 24 26 #include "TypeEnvironment.h" 27 #include "typeops.h" // for occurs 28 #include "Unify.h" // for unifyInexact 25 29 26 30 namespace ResolvExpr { … … 65 69 } 66 70 71 EqvClass::EqvClass( EqvClass &&other ) 72 : vars{std::move(other.vars)}, type{other.type}, 73 allowWidening{std::move(other.allowWidening)}, data{std::move(other.data)} { 74 other.type = nullptr; 75 } 76 67 77 EqvClass &EqvClass::operator=( const EqvClass &other ) { 68 78 if ( this == &other ) return *this; … … 72 82 } 73 83 84 EqvClass &EqvClass::operator=( EqvClass &&other ) { 85 if ( this == &other ) return *this; 86 delete type; 87 88 vars = std::move(other.vars); 89 type = other.type; 90 other.type = nullptr; 91 allowWidening = std::move(other.allowWidening); 92 data = std::move(other.data); 93 94 return *this; 95 } 96 74 97 EqvClass::~EqvClass() { 75 98 delete type; 99 } 100 101 void EqvClass::set_type( Type* ty ) { 102 if ( ty == type ) return; 103 delete type; 104 type = ty; 76 105 } 77 106 … … 92 121 const EqvClass* TypeEnvironment::lookup( const std::string &var ) const { 93 122 for ( std::list< EqvClass >::const_iterator i = env.begin(); i != env.end(); ++i ) { 94 if ( i->vars.find( var ) != i->vars.end() ) { 95 /// std::cout << var << " is in class "; 96 /// i->print( std::cout ); 97 return &*i; 98 } 99 /// std::cout << var << " is not in class "; 100 /// i->print( std::cout ); 123 if ( i->vars.find( var ) != i->vars.end() ) return &*i; 101 124 } // for 102 125 return nullptr; … … 116 139 } 117 140 118 void TypeEnvironment::add( const EqvClass &eqvClass ) {119 filterOverlappingClasses( env, eqvClass );120 env.push_back( eqvClass );121 }122 123 141 void TypeEnvironment::add( EqvClass &&eqvClass ) { 124 142 filterOverlappingClasses( env, eqvClass ); … … 131 149 newClass.vars.insert( (*i)->get_name() ); 132 150 newClass.data = TypeDecl::Data{ (*i) }; 133 env.push_back( newClass);151 env.push_back( std::move(newClass) ); 134 152 } // for 135 153 } … … 145 163 // transition to TypeSubstitution 146 164 newClass.data = TypeDecl::Data{ TypeDecl::Dtype, false }; 147 add( newClass);165 add( std::move(newClass) ); 148 166 } 149 167 } … … 152 170 for ( std::list< EqvClass >::const_iterator theClass = env.begin(); theClass != env.end(); ++theClass ) { 153 171 for ( std::set< std::string >::const_iterator theVar = theClass->vars.begin(); theVar != theClass->vars.end(); ++theVar ) { 154 /// std::cerr << "adding " << *theVar;155 172 if ( theClass->type ) { 156 /// std::cerr << " bound to ";157 /// theClass->type->print( std::cerr );158 /// std::cerr << std::endl;159 173 sub.add( *theVar, theClass->type ); 160 174 } else if ( theVar != theClass->vars.begin() ) { 161 175 TypeInstType *newTypeInst = new TypeInstType( Type::Qualifiers(), *theClass->vars.begin(), theClass->data.kind == TypeDecl::Ftype 
); 162 /// std::cerr << " bound to variable " << *theClass->vars.begin() << std::endl;163 176 sub.add( *theVar, newTypeInst ); 164 177 delete newTypeInst; … … 166 179 } // for 167 180 } // for 168 /// std::cerr << "input env is:" << std::endl;169 /// print( std::cerr, 8 );170 /// std::cerr << "sub is:" << std::endl;171 /// sub.print( std::cerr, 8 );172 181 sub.normalize(); 173 182 } … … 181 190 std::list< EqvClass >::iterator TypeEnvironment::internal_lookup( const std::string &var ) { 182 191 for ( std::list< EqvClass >::iterator i = env.begin(); i != env.end(); ++i ) { 183 if ( i->vars.find( var ) == i->vars.end() ) { 184 return i; 185 } // if 192 if ( i->vars.count( var ) ) return i; 186 193 } // for 187 194 return env.end(); … … 190 197 void TypeEnvironment::simpleCombine( const TypeEnvironment &second ) { 191 198 env.insert( env.end(), second.env.begin(), second.env.end() ); 192 }193 194 void TypeEnvironment::combine( const TypeEnvironment &second, Type *(*combineFunc)( Type*, Type* ) ) {195 TypeEnvironment secondCopy( second );196 for ( std::list< EqvClass >::iterator firstClass = env.begin(); firstClass != env.end(); ++firstClass ) {197 EqvClass &newClass = *firstClass;198 std::set< std::string > newVars;199 for ( std::set< std::string >::const_iterator var = firstClass->vars.begin(); var != firstClass->vars.end(); ++var ) {200 std::list< EqvClass >::iterator secondClass = secondCopy.internal_lookup( *var );201 if ( secondClass != secondCopy.env.end() ) {202 newVars.insert( secondClass->vars.begin(), secondClass->vars.end() );203 if ( secondClass->type ) {204 if ( newClass.type ) {205 Type *newType = combineFunc( newClass.type, secondClass->type );206 delete newClass.type;207 newClass.type = newType;208 newClass.allowWidening = newClass.allowWidening && secondClass->allowWidening;209 } else {210 newClass.type = secondClass->type->clone();211 newClass.allowWidening = secondClass->allowWidening;212 } // if213 } // if214 secondCopy.env.erase( secondClass );215 } // if216 } // for217 newClass.vars.insert( newVars.begin(), newVars.end() );218 } // for219 for ( std::list< EqvClass >::iterator secondClass = secondCopy.env.begin(); secondClass != secondCopy.env.end(); ++secondClass ) {220 env.push_back( *secondClass );221 } // for222 199 } 223 200 … … 241 218 } 242 219 220 bool isFtype( Type *type ) { 221 if ( dynamic_cast< FunctionType* >( type ) ) { 222 return true; 223 } else if ( TypeInstType *typeInst = dynamic_cast< TypeInstType* >( type ) ) { 224 return typeInst->get_isFtype(); 225 } // if 226 return false; 227 } 228 229 bool tyVarCompatible( const TypeDecl::Data & data, Type *type ) { 230 switch ( data.kind ) { 231 case TypeDecl::Dtype: 232 // to bind to an object type variable, the type must not be a function type. 233 // if the type variable is specified to be a complete type then the incoming 234 // type must also be complete 235 // xxx - should this also check that type is not a tuple type and that it's not a ttype? 236 return ! isFtype( type ) && (! 
+			data.isComplete || type->isComplete() );
+		  case TypeDecl::Ftype:
+			return isFtype( type );
+		  case TypeDecl::Ttype:
+			// ttype unifies with any tuple type
+			return dynamic_cast< TupleType * >( type ) || Tuples::isTtype( type );
+		} // switch
+		return false;
+	}
+
+	bool TypeEnvironment::bindVar( TypeInstType *typeInst, Type *bindTo, const TypeDecl::Data & data, AssertionSet &need, AssertionSet &have, const OpenVarSet &openVars, WidenMode widenMode, const SymTab::Indexer &indexer ) {
+		// remove references from bindTo, so that type variables can only bind to value types
+		bindTo = bindTo->stripReferences();
+		OpenVarSet::const_iterator tyvar = openVars.find( typeInst->get_name() );
+		assert( tyvar != openVars.end() );
+		if ( ! tyVarCompatible( tyvar->second, bindTo ) ) {
+			return false;
+		} // if
+		if ( occurs( bindTo, typeInst->get_name(), *this ) ) {
+			return false;
+		} // if
+		auto curClass = internal_lookup( typeInst->get_name() );
+		if ( curClass != env.end() ) {
+			if ( curClass->type ) {
+				Type *common = 0;
+				// attempt to unify equivalence class type (which has qualifiers stripped, so they must be restored) with the type to bind to
+				std::unique_ptr< Type > newType( curClass->type->clone() );
+				newType->get_qualifiers() = typeInst->get_qualifiers();
+				if ( unifyInexact( newType.get(), bindTo, *this, need, have, openVars, widenMode & WidenMode( curClass->allowWidening, true ), indexer, common ) ) {
+					if ( common ) {
+						common->get_qualifiers() = Type::Qualifiers{};
+						curClass->set_type( common );
+					} // if
+				} else return false;
+			} else {
+				Type* newType = bindTo->clone();
+				newType->get_qualifiers() = Type::Qualifiers{};
+				curClass->set_type( newType );
+				curClass->allowWidening = widenMode.widenFirst && widenMode.widenSecond;
+			} // if
+		} else {
+			EqvClass newClass;
+			newClass.vars.insert( typeInst->get_name() );
+			newClass.type = bindTo->clone();
+			newClass.type->get_qualifiers() = Type::Qualifiers();
+			newClass.allowWidening = widenMode.widenFirst && widenMode.widenSecond;
+			newClass.data = data;
+			env.push_back( std::move(newClass) );
+		} // if
+		return true;
+	}
+
+	bool TypeEnvironment::bindVarToVar( TypeInstType *var1, TypeInstType *var2, const TypeDecl::Data & data, AssertionSet &need, AssertionSet &have, const OpenVarSet &openVars, WidenMode widenMode, const SymTab::Indexer &indexer ) {
+		auto class1 = internal_lookup( var1->get_name() );
+		auto class2 = internal_lookup( var2->get_name() );
+
+		// exit early if variables already bound together
+		if ( class1 != env.end() && class1 == class2 ) {
+			class1->allowWidening &= widenMode;
+			return true;
+		}
+
+		bool widen1 = false, widen2 = false;
+		const Type *type1 = nullptr, *type2 = nullptr;
+
+		// check for existing bindings, perform occurs check
+		if ( class1 != env.end() ) {
+			if ( class1->type ) {
+				if ( occurs( class1->type, var2->get_name(), *this ) ) return false;
+				type1 = class1->type;
+			} // if
+			widen1 = widenMode.widenFirst && class1->allowWidening;
+		} // if
+		if ( class2 != env.end() ) {
+			if ( class2->type ) {
+				if ( occurs( class2->type, var1->get_name(), *this ) ) return false;
+				type2 = class2->type;
+			} // if
+			widen2 = widenMode.widenSecond && class2->allowWidening;
+		} // if
+
+		if ( type1 && type2 ) {
+			// both classes bound, merge if bound types can be unified
+			std::unique_ptr<Type> newType1{ type1->clone() }, newType2{ type2->clone() };
+			WidenMode newWidenMode{ widen1, widen2 };
+			Type *common = 0;
+			if ( unifyInexact( newType1.get(), newType2.get(), *this, need, have, openVars, newWidenMode, indexer, common ) ) {
+				class1->vars.insert( class2->vars.begin(), class2->vars.end() );
+				class1->allowWidening = widen1 && widen2;
+				if ( common ) {
+					common->get_qualifiers() = Type::Qualifiers{};
+					class1->set_type( common );
+				}
+				env.erase( class2 );
+			} else return false;
+		} else if ( class1 != env.end() && class2 != env.end() ) {
+			// both classes exist, at least one unbound, merge unconditionally
+			if ( type1 ) {
+				class1->vars.insert( class2->vars.begin(), class2->vars.end() );
+				class1->allowWidening = widen1;
+				env.erase( class2 );
+			} else {
+				class2->vars.insert( class1->vars.begin(), class1->vars.end() );
+				class2->allowWidening = widen2;
+				env.erase( class1 );
+			} // if
+		} else if ( class1 != env.end() ) {
+			// var2 unbound, add to class1
+			class1->vars.insert( var2->get_name() );
+			class1->allowWidening = widen1;
+		} else if ( class2 != env.end() ) {
+			// var1 unbound, add to class2
+			class2->vars.insert( var1->get_name() );
+			class2->allowWidening = widen2;
+		} else {
+			// neither var bound, create new class
+			EqvClass newClass;
+			newClass.vars.insert( var1->get_name() );
+			newClass.vars.insert( var2->get_name() );
+			newClass.allowWidening = widen1 && widen2;
+			newClass.data = data;
+			env.push_back( std::move(newClass) );
+		} // if
+		return true;
+	}
+
+	void TypeEnvironment::forbidWidening() {
+		for ( EqvClass& c : env ) c.allowWidening = false;
+	}
+
 	std::ostream & operator<<( std::ostream & out, const TypeEnvironment & env ) {
 		env.print( out );
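The merge logic in bindVarToVar above has four outcomes: both classes bound (merge only if the bound types unify), both classes present but at least one unbound (merge unconditionally), exactly one class present (absorb the lone variable), or neither present (create a fresh class). A minimal standalone sketch of the unconditional-merge bookkeeping follows; the names Class, Env, and unite are illustrative only and not part of the Cforall resolver:

	#include <list>
	#include <set>
	#include <string>

	// Toy model of the equivalence-class bookkeeping in bindVarToVar above.
	struct Class {
		std::set<std::string> vars;   // type variables known to be equal
		std::string type;             // bound type name, empty if unbound
		bool allowWidening = true;
	};

	struct Env {
		std::list<Class> classes;

		std::list<Class>::iterator find( const std::string & var ) {
			for ( auto it = classes.begin(); it != classes.end(); ++it )
				if ( it->vars.count( var ) ) return it;
			return classes.end();
		}

		// Mirrors the unconditional-merge branch: fold c2's variables into c1,
		// conjoin the widening flags, then erase c2 so each variable has one class.
		void unite( std::list<Class>::iterator c1, std::list<Class>::iterator c2 ) {
			c1->vars.insert( c2->vars.begin(), c2->vars.end() );
			c1->allowWidening = c1->allowWidening && c2->allowWidening;
			classes.erase( c2 );
		}
	};

Keeping the variables of the erased class inside the surviving one is what makes later lookups of either variable land on the same binding.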
src/ResolvExpr/TypeEnvironment.h
ra12c81f3 rc653b37
 // Author           : Richard C. Bilson
 // Created On       : Sun May 17 12:24:58 2015
-// Last Modified By : Peter A. Buhr
-// Last Modified On : Sat Jul 22 09:35:45 2017
-// Update Count     : 3
+// Last Modified By : Aaron B. Moss
+// Last Modified On : Mon Jun 18 11:58:00 2018
+// Update Count     : 4
 //
…
 #include <set>                          // for set
 #include <string>                       // for string
+#include <utility>                      // for move, swap
+
+#include "WidenMode.h"                  // for WidenMode
 
 #include "SynTree/Declaration.h"        // for TypeDecl::Data, DeclarationWit...
…
 		EqvClass( const EqvClass &other );
 		EqvClass( const EqvClass &other, const Type *ty );
+		EqvClass( EqvClass &&other );
 		EqvClass &operator=( const EqvClass &other );
+		EqvClass &operator=( EqvClass &&other );
 		~EqvClass();
 		void print( std::ostream &os, Indenter indent = {} ) const;
+
+		/// Takes ownership of `ty`, freeing old `type`
+		void set_type( Type *ty );
 	};
…
 	public:
 		const EqvClass* lookup( const std::string &var ) const;
-		void add( const EqvClass &eqvClass );
+	private:
 		void add( EqvClass &&eqvClass );
+	public:
 		void add( const Type::ForallList &tyDecls );
 		void add( const TypeSubstitution & sub );
…
 		bool isEmpty() const { return env.empty(); }
 		void print( std::ostream &os, Indenter indent = {} ) const;
-		void combine( const TypeEnvironment &second, Type *(*combineFunc)( Type*, Type* ) );
+		// void combine( const TypeEnvironment &second, Type *(*combineFunc)( Type*, Type* ) );
 		void simpleCombine( const TypeEnvironment &second );
 		void extractOpenVars( OpenVarSet &openVars ) const;
…
 		void addActual( const TypeEnvironment& actualEnv, OpenVarSet& openVars );
 
-		typedef std::list< EqvClass >::iterator iterator;
-		iterator begin() { return env.begin(); }
-		iterator end() { return env.end(); }
-		typedef std::list< EqvClass >::const_iterator const_iterator;
-		const_iterator begin() const { return env.begin(); }
-		const_iterator end() const { return env.end(); }
+		/// Binds the type class represented by `typeInst` to the type `bindTo`; will add
+		/// the class if needed. Returns false on failure.
+		bool bindVar( TypeInstType *typeInst, Type *bindTo, const TypeDecl::Data & data, AssertionSet &need, AssertionSet &have, const OpenVarSet &openVars, WidenMode widenMode, const SymTab::Indexer &indexer );
+
+		/// Binds the type classes represented by `var1` and `var2` together; will add
+		/// one or both classes if needed. Returns false on failure.
+		bool bindVarToVar( TypeInstType *var1, TypeInstType *var2, const TypeDecl::Data & data, AssertionSet &need, AssertionSet &have, const OpenVarSet &openVars, WidenMode widenMode, const SymTab::Indexer &indexer );
+
+		/// Disallows widening for all bindings in the environment
+		void forbidWidening();
+
+		using iterator = std::list< EqvClass >::const_iterator;
+		iterator begin() const { return env.begin(); }
+		iterator end() const { return env.end(); }
 
 	private:
 		std::list< EqvClass > env;
+
 		std::list< EqvClass >::iterator internal_lookup( const std::string &var );
 	};
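The new set_type member replaces the earlier pattern of cloning an entire EqvClass just to swap its type. A sketch of the ownership contract the doc comment describes, using a placeholder Type and a hypothetical EqvClassSketch; the self-assignment guard is an assumption, not taken from the Cforall source:

	// Placeholder for the real syntax-tree Type class.
	struct Type { virtual ~Type() {} };

	// Hypothetical sketch of the ownership contract documented for EqvClass::set_type.
	struct EqvClassSketch {
		Type * type = nullptr;

		// Takes ownership of ty, freeing the old type; the self-assignment
		// check is an assumption for safety, not copied from the real code.
		void set_type( Type * ty ) {
			if ( ty != type ) {
				delete type;
				type = ty;
			}
		}

		~EqvClassSketch() { delete type; }
	};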
src/ResolvExpr/Unify.cc
ra12c81f3 rc653b37
 // Author           : Richard C. Bilson
 // Created On       : Sun May 17 12:27:10 2015
-// Last Modified By : Peter A. Buhr
-// Last Modified On : Thu Mar 16 16:22:54 2017
-// Update Count     : 42
+// Last Modified By : Aaron B. Moss
+// Last Modified On : Mon Jun 18 11:58:00 2018
+// Update Count     : 43
 //
…
 	}
 
-	bool isFtype( Type *type ) {
-		if ( dynamic_cast< FunctionType* >( type ) ) {
-			return true;
-		} else if ( TypeInstType *typeInst = dynamic_cast< TypeInstType* >( type ) ) {
-			return typeInst->get_isFtype();
-		} // if
-		return false;
-	}
-
-	bool tyVarCompatible( const TypeDecl::Data & data, Type *type ) {
-		switch ( data.kind ) {
-		  case TypeDecl::Dtype:
-			// to bind to an object type variable, the type must not be a function type.
-			// if the type variable is specified to be a complete type then the incoming
-			// type must also be complete
-			// xxx - should this also check that type is not a tuple type and that it's not a ttype?
-			return ! isFtype( type ) && ( ! data.isComplete || type->isComplete() );
-		  case TypeDecl::Ftype:
-			return isFtype( type );
-		  case TypeDecl::Ttype:
-			// ttype unifies with any tuple type
-			return dynamic_cast< TupleType * >( type ) || Tuples::isTtype( type );
-		} // switch
-		return false;
-	}
-
-	bool bindVar( TypeInstType *typeInst, Type *other, const TypeDecl::Data & data, TypeEnvironment &env, AssertionSet &needAssertions, AssertionSet &haveAssertions, const OpenVarSet &openVars, WidenMode widenMode, const SymTab::Indexer &indexer ) {
-		// remove references from other, so that type variables can only bind to value types
-		other = other->stripReferences();
-		OpenVarSet::const_iterator tyvar = openVars.find( typeInst->get_name() );
-		assert( tyvar != openVars.end() );
-		if ( ! tyVarCompatible( tyvar->second, other ) ) {
-			return false;
-		} // if
-		if ( occurs( other, typeInst->get_name(), env ) ) {
-			return false;
-		} // if
-		if ( const EqvClass *curClass = env.lookup( typeInst->get_name() ) ) {
-			if ( curClass->type ) {
-				Type *common = 0;
-				// attempt to unify equivalence class type (which has qualifiers stripped, so they must be restored) with the type to bind to
-				std::unique_ptr< Type > newType( curClass->type->clone() );
-				newType->get_qualifiers() = typeInst->get_qualifiers();
-				if ( unifyInexact( newType.get(), other, env, needAssertions, haveAssertions, openVars, widenMode & WidenMode( curClass->allowWidening, true ), indexer, common ) ) {
-					if ( common ) {
-						common->get_qualifiers() = Type::Qualifiers();
-						env.add( EqvClass{ *curClass, common } );
-					} // if
-					return true;
-				} else {
-					return false;
-				} // if
-			} else {
-				EqvClass newClass { *curClass, other };
-				newClass.type->get_qualifiers() = Type::Qualifiers();
-				newClass.allowWidening = widenMode.widenFirst && widenMode.widenSecond;
-				env.add( std::move(newClass) );
-			} // if
-		} else {
-			EqvClass newClass;
-			newClass.vars.insert( typeInst->get_name() );
-			newClass.type = other->clone();
-			newClass.type->get_qualifiers() = Type::Qualifiers();
-			newClass.allowWidening = widenMode.widenFirst && widenMode.widenSecond;
-			newClass.data = data;
-			env.add( newClass );
-		} // if
-		return true;
-	}
-
-	bool bindVarToVar( TypeInstType *var1, TypeInstType *var2, const TypeDecl::Data & data, TypeEnvironment &env, AssertionSet &needAssertions, AssertionSet &haveAssertions, const OpenVarSet &openVars, WidenMode widenMode, const SymTab::Indexer &indexer ) {
-		bool result = true;
-		const EqvClass *class1 = env.lookup( var1->get_name() );
-		const EqvClass *class2 = env.lookup( var2->get_name() );
-		bool widen1 = false, widen2 = false;
-		Type *type1 = nullptr, *type2 = nullptr;
-
-		if ( class1 ) {
-			if ( class1->type ) {
-				if ( occurs( class1->type, var2->get_name(), env ) ) {
-					return false;
-				} // if
-				type1 = class1->type->clone();
-			} // if
-			widen1 = widenMode.widenFirst && class1->allowWidening;
-		} // if
-		if ( class2 ) {
-			if ( class2->type ) {
-				if ( occurs( class2->type, var1->get_name(), env ) ) {
-					return false;
-				} // if
-				type2 = class2->type->clone();
-			} // if
-			widen2 = widenMode.widenSecond && class2->allowWidening;
-		} // if
-
-		if ( type1 && type2 ) {
-			// std::cerr << "has type1 && type2" << std::endl;
-			WidenMode newWidenMode ( widen1, widen2 );
-			Type *common = 0;
-			if ( unifyInexact( type1, type2, env, needAssertions, haveAssertions, openVars, newWidenMode, indexer, common ) ) {
-				EqvClass newClass1 = *class1;
-				newClass1.vars.insert( class2->vars.begin(), class2->vars.end() );
-				newClass1.allowWidening = widen1 && widen2;
-				if ( common ) {
-					common->get_qualifiers() = Type::Qualifiers();
-					delete newClass1.type;
-					newClass1.type = common;
-				} // if
-				env.add( std::move(newClass1) );
-			} else {
-				result = false;
-			} // if
-		} else if ( class1 && class2 ) {
-			if ( type1 ) {
-				EqvClass newClass1 = *class1;
-				newClass1.vars.insert( class2->vars.begin(), class2->vars.end() );
-				newClass1.allowWidening = widen1;
-				env.add( std::move(newClass1) );
-			} else {
-				EqvClass newClass2 = *class2;
-				newClass2.vars.insert( class1->vars.begin(), class1->vars.end() );
-				newClass2.allowWidening = widen2;
-				env.add( std::move(newClass2) );
-			} // if
-		} else if ( class1 ) {
-			EqvClass newClass1 = *class1;
-			newClass1.vars.insert( var2->get_name() );
-			newClass1.allowWidening = widen1;
-			env.add( std::move(newClass1) );
-		} else if ( class2 ) {
-			EqvClass newClass2 = *class2;
-			newClass2.vars.insert( var1->get_name() );
-			newClass2.allowWidening = widen2;
-			env.add( std::move(newClass2) );
-		} else {
-			EqvClass newClass;
-			newClass.vars.insert( var1->get_name() );
-			newClass.vars.insert( var2->get_name() );
-			newClass.allowWidening = widen1 && widen2;
-			newClass.data = data;
-			env.add( newClass );
-		} // if
-		delete type1;
-		delete type2;
-		return result;
-	}
-
 	bool unify( Type *type1, Type *type2, TypeEnvironment &env, AssertionSet &needAssertions, AssertionSet &haveAssertions, OpenVarSet &openVars, const SymTab::Indexer &indexer ) {
 		OpenVarSet closedVars;
…
 
 	if ( isopen1 && isopen2 && entry1->second == entry2->second ) {
-		result = bindVarToVar( var1, var2, entry1->second, env, needAssertions, haveAssertions, openVars, widenMode, indexer );
+		result = env.bindVarToVar( var1, var2, entry1->second, needAssertions, haveAssertions, openVars, widenMode, indexer );
 	} else if ( isopen1 ) {
-		result = bindVar( var1, type2, entry1->second, env, needAssertions, haveAssertions, openVars, widenMode, indexer );
+		result = env.bindVar( var1, type2, entry1->second, needAssertions, haveAssertions, openVars, widenMode, indexer );
 	} else if ( isopen2 ) { // TODO: swap widenMode values in call, since type positions are flipped?
-		result = bindVar( var2, type1, entry2->second, env, needAssertions, haveAssertions, openVars, widenMode, indexer );
+		result = env.bindVar( var2, type1, entry2->second, needAssertions, haveAssertions, openVars, widenMode, indexer );
 	} else {
 		PassVisitor<Unify> comparator( type2, env, needAssertions, haveAssertions, openVars, widenMode, indexer );
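Both the removed free functions and their TypeEnvironment replacements run an occurs check before binding: a variable must not be bound to a type that mentions the variable itself, since that would build an infinite type. A simplified standalone form of the idea follows; the Ty and occursIn names are illustrative, and the real occurs() also follows bindings already recorded in the TypeEnvironment:

	#include <string>
	#include <vector>

	// Simplified model: a type is a constructor name plus type arguments.
	struct Ty {
		std::string name;
		std::vector<Ty> params;
	};

	// Returns true if the type variable var appears anywhere inside ty.
	bool occursIn( const Ty & ty, const std::string & var ) {
		if ( ty.name == var ) return true;
		for ( const Ty & p : ty.params )
			if ( occursIn( p, var ) ) return true;
		return false;
	}

Binding T to, say, pair(T, int) would otherwise never terminate under substitution, so unification must fail instead.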
src/ResolvExpr/Unify.h
ra12c81f3 rc653b37
 // Author           : Richard C. Bilson
 // Created On       : Sun May 17 13:09:04 2015
-// Last Modified By : Peter A. Buhr
-// Last Modified On : Fri Jul 21 23:09:34 2017
-// Update Count     : 3
+// Last Modified By : Aaron B. Moss
+// Last Modified On : Mon Jun 18 11:58:00 2018
+// Update Count     : 4
 //
…
 #include "SynTree/Declaration.h"  // for TypeDecl, TypeDecl::Data
 #include "TypeEnvironment.h"      // for AssertionSet, OpenVarSet
+#include "WidenMode.h"            // for WidenMode
 
 class Type;
…
 namespace ResolvExpr {
-	struct WidenMode {
-		WidenMode( bool widenFirst, bool widenSecond ): widenFirst( widenFirst ), widenSecond( widenSecond ) {}
-		WidenMode &operator|=( const WidenMode &other ) { widenFirst |= other.widenFirst; widenSecond |= other.widenSecond; return *this; }
-		WidenMode &operator&=( const WidenMode &other ) { widenFirst &= other.widenFirst; widenSecond &= other.widenSecond; return *this; }
-		WidenMode operator|( const WidenMode &other ) { WidenMode newWM( *this ); newWM |= other; return newWM; }
-		WidenMode operator&( const WidenMode &other ) { WidenMode newWM( *this ); newWM &= other; return newWM; }
-		operator bool() { return widenFirst && widenSecond; }
-
-		bool widenFirst : 1, widenSecond : 1;
-	};
-
-	bool bindVar( TypeInstType *typeInst, Type *other, const TypeDecl::Data & data, TypeEnvironment &env, AssertionSet &needAssertions, AssertionSet &haveAssertions, const OpenVarSet &openVars, WidenMode widenMode, const SymTab::Indexer &indexer );
 	bool unify( Type *type1, Type *type2, TypeEnvironment &env, AssertionSet &needAssertions, AssertionSet &haveAssertions, OpenVarSet &openVars, const SymTab::Indexer &indexer );
 	bool unify( Type *type1, Type *type2, TypeEnvironment &env, AssertionSet &needAssertions, AssertionSet &haveAssertions, OpenVarSet &openVars, const SymTab::Indexer &indexer, Type *&commonType );
 	bool unifyExact( Type *type1, Type *type2, TypeEnvironment &env, AssertionSet &needAssertions, AssertionSet &haveAssertions, OpenVarSet &openVars, const SymTab::Indexer &indexer );
+	bool unifyInexact( Type *type1, Type *type2, TypeEnvironment &env, AssertionSet &needAssertions, AssertionSet &haveAssertions, const OpenVarSet &openVars, WidenMode widenMode, const SymTab::Indexer &indexer, Type *&common );
 
 	template< typename Iterator1, typename Iterator2 >
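With WidenMode moved to its own header, call sites such as bindVar depend only on its conjunction semantics: a & b allows widening on a side only when both operands allow it. A small check of that behaviour, reusing the struct as removed above (trimmed to the constructor and the two & operators exercised here):

	#include <cassert>

	// WidenMode as removed above (now in WidenMode.h), trimmed for the example.
	struct WidenMode {
		WidenMode( bool widenFirst, bool widenSecond ): widenFirst( widenFirst ), widenSecond( widenSecond ) {}
		WidenMode & operator&=( const WidenMode & other ) { widenFirst &= other.widenFirst; widenSecond &= other.widenSecond; return *this; }
		WidenMode operator&( const WidenMode & other ) { WidenMode newWM( *this ); newWM &= other; return newWM; }
		operator bool() { return widenFirst && widenSecond; }
		bool widenFirst : 1, widenSecond : 1;
	};

	int main() {
		WidenMode callSite( true, true );
		bool classAllowsWidening = false;   // as recorded in an EqvClass
		// mirrors `widenMode & WidenMode( curClass->allowWidening, true )` in bindVar
		WidenMode effective = callSite & WidenMode( classAllowsWidening, true );
		assert( ! effective.widenFirst && effective.widenSecond );
	}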
src/SynTree/Type.cc
ra12c81f3 rc653b37
 // Created On       : Mon May 18 07:44:20 2015
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Mon Sep 25 15:16:32 2017
-// Update Count     : 38
+// Last Modified On : Fri Jun 22 10:17:19 2018
+// Update Count     : 39
 //
 #include "Type.h"
…
 
 // These must remain in the same order as the corresponding bit fields.
-const char * Type::FuncSpecifiersNames[] = { "inline", "fortran", "_Noreturn" };
+const char * Type::FuncSpecifiersNames[] = { "inline", "_Noreturn", "fortran" };
 const char * Type::StorageClassesNames[] = { "extern", "static", "auto", "register", "_Thread_local" };
 const char * Type::QualifiersNames[] = { "const", "restrict", "volatile", "lvalue", "mutex", "_Atomic" };
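The reordering matters because printing code indexes the names array by bit position, so the array order must track the bit-field declaration order. A hedged sketch of that pattern (the mask values below are assumptions for the example, not Cforall's actual encoding):

	#include <iostream>

	// Illustrative encoding: inline = bit 0, _Noreturn = bit 1, fortran = bit 2.
	enum FuncSpec { Inline = 1 << 0, Noreturn = 1 << 1, Fortran = 1 << 2 };
	static const char * FuncSpecifiersNames[] = { "inline", "_Noreturn", "fortran" };

	// Prints the name of every specifier bit set in specs, in bit order.
	static void printFuncSpecs( unsigned specs ) {
		for ( unsigned i = 0; i < 3; ++i )
			if ( specs & ( 1u << i ) ) std::cout << FuncSpecifiersNames[i] << ' ';
	}

	int main() {
		printFuncSpecs( Inline | Fortran );   // prints "inline fortran"
		std::cout << '\n';
	}

If the array and the bit fields drift out of order, a function would be printed with the wrong specifier, which is exactly the bug this hunk fixes.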
src/tests/.gitignore
ra12c81f3 rc653b37
 .out/
 .err/
+.type
src/tests/concurrent/coroutineYield.c
ra12c81f3