- Timestamp:
- Jun 23, 2019, 11:41:59 PM (6 years ago)
- Branches:
- ADT, arm-eh, ast-experimental, enum, forall-pointer-decay, jacob/cs343-translation, jenkins-sandbox, master, new-ast, new-ast-unique-expr, pthread-emulation, qualifiedEnum
- Children:
- b60ed54
- Parents:
- f2f22e3
- Location:
- doc
- Files:
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
doc/bibliography/pl.bib
rf2f22e3 r8f079f0 948 948 author = {{\textsf{C}{$\mathbf{\forall}$} Features}}, 949 949 howpublished= {\href{https://plg.uwaterloo.ca/~cforall/features}{https://\-plg.uwaterloo.ca/\-$\sim$cforall/\-features}}, 950 } 951 952 @misc{CforallBenchMarks, 953 contributer = {pabuhr@plg}, 954 key = {Cforall Benchmarks}, 955 author = {{\textsf{C}{$\mathbf{\forall}$} Benchmarks}}, 956 howpublished= {\href{https://plg.uwaterloo.ca/~cforall/benchmarks}{https://\-plg.uwaterloo.ca/\-$\sim$cforall/\-benchmarks}}, 950 957 } 951 958 … … 1919 1926 year = 1965, 1920 1927 note = {Reprinted in \cite{Genuys68} pp. 43--112.} 1928 } 1929 1930 @inproceedings{Adya02, 1931 contributer = {pabuhr@plg}, 1932 author = {Adya, Atul and Howell, Jon and Theimer, Marvin and Bolosky, William J. and Douceur, John R.}, 1933 title = {Cooperative Task Management Without Manual Stack Management}, 1934 booktitle = {Proceedings of the General Track of the Annual Conference on USENIX Annual Technical Conference}, 1935 series = {ATEC '02}, 1936 year = {2002}, 1937 pages = {289-302}, 1938 publisher = {USENIX Association}, 1939 address = {Berkeley, CA, USA}, 1940 } 1941 1942 @misc{CoroutineTS, 1943 keywords = {Coroutines TS, C++20, working draft}, 1944 contributer = {pabuhr@plg}, 1945 author = {Gor Nishanov}, 1946 title = {Merge Coroutines TS into C++20 Working Draft}, 1947 year = 2019, 1948 month = feb, 1949 howpublished= {\href{http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2019/p0912r5.html} 1950 {http://\-www.open-std.org/\-jtc1/\-sc22/\-wg21/\-docs/\-papers/\-2019/p0912r5.html}}, 1921 1951 } 1922 1952 … … 4359 4389 } 4360 4390 4391 4361 4392 @article{Liskov86, 4362 4393 keywords = {synchronous communication, concurrency}, … … 4371 4402 year = {}, 4372 4403 pages = {}, 4404 } 4405 4406 @misc{libdill, 4407 keywords = {libdill/libmill Thread Library}, 4408 contributer = {pabuhr@plg}, 4409 author = {Alex Cornejo, et al}, 4410 title = {libdill Thread Library}, 4411 year = 2019, 4412 howpublished= 
{\href{http://libdill.org/libdill-2.14.tar.gz} 4413 {http://\-libdill.org/\-libdill-2.14.tar.gz}}, 4373 4414 } 4374 4415 … … 4493 4534 year = 2016, 4494 4535 howpublished= {\href{http://blog.reverberate.org/2016/01/making-arbitrarily-large-binaries-from.html} 4495 { 4496 {http://blog.reverberate.org/\-2016/\-01/\-making-arbitrarily-large-binaries-from.html} 4497 }}, 4536 {http://blog.reverberate.org/\-2016/\-01/\-making-arbitrarily-large-binaries-from.html}}, 4498 4537 optnote = {Accessed: 2016-09}, 4499 4538 } … … 4517 4556 trivial changes to the source code of the library. 4518 4557 } 4558 } 4559 4560 @misc{Marcel, 4561 keywords = {Marcel Thread Library}, 4562 contributer = {pabuhr@plg}, 4563 author = {Gabriel Antoniu, et al}, 4564 title = {Marcel Thread Library}, 4565 year = 2011, 4566 howpublished= {\href{https://gforge.inria.fr/frs/download.php/file/28643/marcel-2.99.3.tar.gz} 4567 {https://\-gforge.inria.fr/\-frs/\-download.php/\-file/\-28643/\-marcel-2.99.3.tar.gz}}, 4519 4568 } 4520 4569 -
doc/papers/AMA/AMA-stix/ama/WileyNJD-v2.cls
rf2f22e3 r8f079f0 2444 2444 \@afterheading} 2445 2445 2446 \renewcommand\section{\@startsection{section}{1}{\z@}{-2 7pt \@plus -2pt \@minus -2pt}{12\p@}{\sectionfont}}%2447 \renewcommand\subsection{\@startsection{subsection}{2}{\z@}{-2 3pt \@plus -2pt \@minus -2pt}{5\p@}{\subsectionfont}}%2446 \renewcommand\section{\@startsection{section}{1}{\z@}{-25pt \@plus -2pt \@minus -2pt}{12\p@}{\sectionfont}}% 2447 \renewcommand\subsection{\@startsection{subsection}{2}{\z@}{-22pt \@plus -2pt \@minus -2pt}{5\p@}{\subsectionfont}}% 2448 2448 \renewcommand\subsubsection{\@startsection{subsubsection}{3}{\z@}{-20pt \@plus -2pt \@minus -2pt}{2\p@}{\subsubsectionfont}}% 2449 2449 % -
doc/papers/concurrency/Paper.tex
rf2f22e3 r8f079f0 157 157 __float80, float80, __float128, float128, forall, ftype, generator, _Generic, _Imaginary, __imag, __imag__, 158 158 inline, __inline, __inline__, __int128, int128, __label__, monitor, mutex, _Noreturn, one_t, or, 159 otype, restrict, __restrict, __restrict__, __signed, __signed__, _Static_assert, suspend,thread,159 otype, restrict, __restrict, __restrict__, __signed, __signed__, _Static_assert, thread, 160 160 _Thread_local, throw, throwResume, timeout, trait, try, ttype, typeof, __typeof, __typeof__, 161 161 virtual, __volatile, __volatile__, waitfor, when, with, zero_t}, … … 303 303 However, \Celeven concurrency is largely wrappers for a subset of the pthreads library~\cite{Butenhof97,Pthreads}, and \Celeven and pthreads concurrency is simple, based on thread fork/join in a function and a few locks, which is low-level and error prone; 304 304 no high-level language concurrency features are defined. 305 Interestingly, almost a decade after publication of the \Celeven standard, neither gcc-8, clang- 8nor msvc-19 (most recent versions) support the \Celeven include @threads.h@, indicating little interest in the C11 concurrency approach.305 Interestingly, almost a decade after publication of the \Celeven standard, neither gcc-8, clang-9 nor msvc-19 (most recent versions) support the \Celeven include @threads.h@, indicating little interest in the C11 concurrency approach. 306 306 Finally, while the \Celeven standard does not state a threading model, the historical association with pthreads suggests implementations would adopt kernel-level threading (1:1)~\cite{ThreadModel}. 307 307 … … 313 313 From 2000 onwards, languages like Go~\cite{Go}, Erlang~\cite{Erlang}, Haskell~\cite{Haskell}, D~\cite{D}, and \uC~\cite{uC++,uC++book} have championed the M:N user-threading model, and many user-threading libraries have appeared~\cite{Qthreads,MPC,Marcel}, including putting green threads back into Java~\cite{Quasar}. 
314 314 The main argument for user-level threading is that they are lighter weight than kernel threads (locking and context switching do not cross the kernel boundary), so there is less restriction on programming styles that encourage large numbers of threads performing medium work-units to facilitate load balancing by the runtime~\cite{Verch12}. 315 As well, user-threading facilitates a simpler concurrency approach using thread objects that leverage sequential patterns versus events with call-backs~\cite{ vonBehren03}.315 As well, user-threading facilitates a simpler concurrency approach using thread objects that leverage sequential patterns versus events with call-backs~\cite{Adya02,vonBehren03}. 316 316 Finally, performant user-threading implementations (both time and space) meet or exceed direct kernel-threading implementations, while achieving the programming advantages of high concurrency levels and safety. 317 317 … … 613 613 Finally, an explicit generator type provides both design and performance benefits, such as multiple type-safe interface functions taking and returning arbitrary types. 
614 614 \begin{cfa} 615 int ?()( Fib & fib ) with( fib ) { return `resume( fib )`.fn; } // function-call interface616 int ?()( Fib & fib, int N ) with( fib ) { for ( N - 1 ) `fib()`; return `fib()`; } // use simple interface617 double ?()( Fib & fib ) with( fib ) { return (int)`fib()` / 3.14159; } // cast prevents recursive call618 sout | (int)f1() | (double)f1() | f2( 2 ); // simple interface, cast selects call based on return type, step 2 values615 int ?()( Fib & fib ) { return `resume( fib )`.fn; } $\C[3.9in]{// function-call interface}$ 616 int ?()( Fib & fib, int N ) { for ( N - 1 ) `fib()`; return `fib()`; } $\C{// use function-call interface to skip N values}$ 617 double ?()( Fib & fib ) { return (int)`fib()` / 3.14159; } $\C{// different return type, cast prevents recursive call}\CRT$ 618 sout | (int)f1() | (double)f1() | f2( 2 ); // alternative interface, cast selects call based on return type, step 2 values 619 619 \end{cfa} 620 620 Now, the generator can be a separately-compiled opaque-type only accessed through its interface functions. … … 628 628 With respect to safety, we believe static analysis can discriminate local state from temporary variables in a generator, \ie variable usage spanning @suspend@, and generate a compile-time error. 629 629 Finally, our current experience is that most generator problems have simple data state, including local state, but complex execution state, so the burden of creating the generator type is small. 630 As well, C programmers are not afraid withthis kind of semantic programming requirement, if it results in very small, fast generators.630 As well, C programmers are not afraid of this kind of semantic programming requirement, if it results in very small, fast generators. 
631 631 632 632 Figure~\ref{f:CFAFormatGen} shows an asymmetric \newterm{input generator}, @Fmt@, for restructuring text into groups of characters of fixed-size blocks, \ie the input on the left is reformatted into the output on the right, where newlines are ignored. … … 796 796 This semantics is basically a tail-call optimization, which compilers already perform. 797 797 The example shows the assembly code to undo the generator's entry code before the direct jump. 798 This assembly code depends on what entry code is generated, specifically if there are local variables and the level of optimization.798 This assembly code depends on what entry code is generated, specifically if there are local variables, and the level of optimization. 799 799 To provide this new calling convention requires a mechanism built into the compiler, which is beyond the scope of \CFA at this time. 800 800 Nevertheless, it is possible to hand generate any symmetric generators for proof of concept and performance testing. 801 A compiler could also eliminate other artifacts in the generator simulation to further increase performance .801 A compiler could also eliminate other artifacts in the generator simulation to further increase performance, \eg LLVM has various coroutine support~\cite{CoroutineTS}, and \CFA can leverage this support should it fork @clang@. 802 802 803 803 \begin{figure} … … 1213 1213 Hence, a compromise solution is necessary that works for asymmetric (acyclic) and symmetric (cyclic) coroutines. 1214 1214 1215 Our solution for coroutine termination works well for the most common asymmetric and symmetric coroutine usage-patterns. 1215 Our solution is to context switch back to the first resumer (starter) once the coroutine ends. 1216 This semantics works well for the most common asymmetric and symmetric coroutine usage-patterns. 1216 1217 For asymmetric coroutines, it is common for the first resumer (starter) coroutine to be the only resumer. 
1217 1218 All previous generators converted to coroutines have this property. … … 1245 1246 1246 1247 1247 \subsection{ (Generator)Coroutine Implementation}1248 \subsection{Generator / Coroutine Implementation} 1248 1249 1249 1250 A significant implementation challenge for generators/coroutines (and threads in Section~\ref{s:threads}) is adding extra fields to the custom types and related functions, \eg inserting code after/before the coroutine constructor/destructor and @main@ to create/initialize/de-initialize/destroy any extra fields, \eg stack. … … 1254 1255 class myCoroutine inherits baseCoroutine { ... } 1255 1256 \end{cfa} 1256 The problem is that the programming language and its tool chain, \eg debugger, @valgrind@, need to understand @baseCoroutine@ because it infers special property, so type @baseCoroutine@ becomes a de facto keyword and all types inheriting from it are implicitly custom types.1257 As well,some special properties are not handled by existing language semantics, \eg the execution of constructors/destructors is in the wrong order to implicitly start threads because the thread must start \emph{after} all constructors as it relies on a completely initialized object, but the inherited constructor runs \emph{before} the derived.1257 % The problem is that the programming language and its tool chain, \eg debugger, @valgrind@, need to understand @baseCoroutine@ because it infers special property, so type @baseCoroutine@ becomes a de facto keyword and all types inheriting from it are implicitly custom types. 1258 The problem is that some special properties are not handled by existing language semantics, \eg the execution of constructors/destructors is in the wrong order to implicitly start threads because the thread must start \emph{after} all constructors as it relies on a completely initialized object, but the inherited constructor runs \emph{before} the derived. 
1258 1259 Alternatives, such as explicitly starting threads as in Java, are repetitive and forgetting to call start is a common source of errors. 1259 1260 An alternative is composition: … … 1267 1268 However, there is nothing preventing wrong placement or multiple declarations. 1268 1269 1269 \CFA custom types make any special properties explicit to the language and its tool chain, \eg the language code-generator knows where to inject code and when it is unsafe to perform certain optimizations, and IDEs using simple parsing can find and manipulate types with special properties. 1270 \CFA custom types make any special properties explicit to the language and its tool chain, \eg the language code-generator knows where to inject code 1271 % and when it is unsafe to perform certain optimizations, 1272 and IDEs using simple parsing can find and manipulate types with special properties. 1270 1273 The downside of this approach is that it makes custom types a special case in the language. 1271 1274 Users wanting to extend custom types or build their own can only do so in ways offered by the language. … … 1282 1285 \end{cfa} 1283 1286 Note, copying generators/coroutines/threads is not meaningful. 1284 For example, a coroutine retains its last resumer and suspends back to it;1285 having a copy also suspend back to the same resumer is undefined semantics.1287 For example, both the resumer and suspender descriptors can have bi-directional pointers; 1288 copying these coroutines does not update the internal pointers so behaviour of both copies would be difficult to understand. 1286 1289 Furthermore, two coroutines cannot logically execute on the same stack. 1287 1290 A deep coroutine copy, which copies the stack, is also meaningless in an unmanaged language (no garbage collection), like C, because the stack may contain pointers to object within it that require updating for the copy. 
1288 1291 The \CFA @dtype@ property provides no \emph{implicit} copying operations and the @is_coroutine@ trait provides no \emph{explicit} copying operations, so all coroutines must be passed by reference (pointer). 1289 The function definitions ensures there is a statically-typed @main@ function that is the starting point (first stack frame) of a coroutine, and a mechanism to get (read) the c urrently executing coroutinehandle.1292 The function definitions ensures there is a statically-typed @main@ function that is the starting point (first stack frame) of a coroutine, and a mechanism to get (read) the coroutine descriptor from its handle. 1290 1293 The @main@ function has no return value or additional parameters because the coroutine type allows an arbitrary number of interface functions with corresponding arbitrary typed input/output values versus fixed ones. 1291 1294 The advantage of this approach is that users can easily create different types of coroutines, \eg changing the memory layout of a coroutine is trivial when implementing the @get_coroutine@ function, and possibly redefining \textsf{suspend} and @resume@. … … 1493 1496 \end{tabular} 1494 1497 \end{cquote} 1495 Like coroutines, the @dtype@ property prevents \emph{implicit} copy operations and the @is_ coroutine@ trait provides no \emph{explicit} copy operations, so threads must be passed by reference (pointer).1496 Similarly, the function definitions ensures there is a statically-typed @main@ function that is the thread starting point (first stack frame), a mechanism to get (read) the currently executing threadhandle, and a special destructor to prevent deallocation while the thread is executing.1498 Like coroutines, the @dtype@ property prevents \emph{implicit} copy operations and the @is_thread@ trait provides no \emph{explicit} copy operations, so threads must be passed by reference (pointer). 
1499 Similarly, the function definitions ensures there is a statically-typed @main@ function that is the thread starting point (first stack frame), a mechanism to get (read) the thread descriptor from its handle, and a special destructor to prevent deallocation while the thread is executing. 1497 1500 (The qualifier @mutex@ for the destructor parameter is discussed in Section~\ref{s:Monitor}.) 1498 1501 The difference between the coroutine and thread is that a coroutine borrows a thread from its caller, so the first thread resuming a coroutine creates the coroutine's stack and starts running the coroutine main on the stack; … … 1617 1620 % Copying a lock is insecure because it is possible to copy an open lock and then use the open copy when the original lock is closed to simultaneously access the shared data. 1618 1621 % Copying a monitor is secure because both the lock and shared data are copies, but copying the shared data is meaningless because it no longer represents a unique entity. 1619 Similarly, the function definitions ensures there is a mechanism to get (read) the currently executing monitorhandle, and a special destructor to prevent deallocation if a thread using the shared data.1622 Similarly, the function definitions ensures there is a mechanism to get (read) the monitor descriptor from its handle, and a special destructor to prevent deallocation if a thread using the shared data. 1620 1623 The custom monitor type also inserts any locks needed to implement the mutual exclusion semantics. 1621 1624 … … 1656 1659 called \newterm{bulk acquire}. 1657 1660 \CFA guarantees acquisition order is consistent across calls to @mutex@ functions using the same monitors as arguments, so acquiring multiple monitors is safe from deadlock. 
1658 Figure~\ref{f:BankTransfer} shows a trivial solution to the bank transfer problem , where two resources must be locked simultaneously, using \CFA monitors with implicit locking and \CC with explicit locking.1661 Figure~\ref{f:BankTransfer} shows a trivial solution to the bank transfer problem~\cite{BankTransfer}, where two resources must be locked simultaneously, using \CFA monitors with implicit locking and \CC with explicit locking. 1659 1662 A \CFA programmer only has to manage when to acquire mutual exclusion; 1660 1663 a \CC programmer must select the correct lock and acquisition mechanism from a panoply of locking options. … … 1800 1803 Figure~\ref{f:MonitorScheduling} shows general internal/external scheduling (for the bounded-buffer example in Figure~\ref{f:InternalExternalScheduling}). 1801 1804 External calling threads block on the calling queue, if the monitor is occupied, otherwise they enter in FIFO order. 1802 Internal threads block on condition queues via @wait@ and they reenter from the condition in FIFO order, or they block on urgent via @signal_block@ or @waitfor@ and reenter implicit when the monitor becomes empty, \ie, the thread in the monitor exits or waits. 1805 Internal threads block on condition queues via @wait@ and reenter from the condition in FIFO order. 1806 Alternatively, internal threads block on urgent from the @signal_block@ or @waitfor@, and reenter implicitly when the monitor becomes empty, \ie, the thread in the monitor exits or waits. 1803 1807 1804 1808 There are three signalling mechanisms to unblock waiting threads to enter the monitor. 1805 Note, signalling cannot have the signaller and signalled thread in the monitor simultaneously because of the mutual exclusion so only one can proceed.1809 Note, signalling cannot have the signaller and signalled thread in the monitor simultaneously because of the mutual exclusion, so either the signaller or signallee can proceed. 
1806 1810 For internal scheduling, threads are unblocked from condition queues using @signal@, where the signallee is moved to urgent and the signaller continues (solid line). 1807 1811 Multiple signals move multiple signallees to urgent, until the condition is empty. … … 1843 1847 It is common to declare condition variables as monitor fields to prevent shared access, hence no locking is required for access as the conditions are protected by the monitor lock. 1844 1848 In \CFA, a condition variable can be created/stored independently. 1845 To still prevent expensive locking on access, a condition variable is tied to a \emph{group} of monitors on first use, called \newterm{branding}, resulting in a low-cost boolen test to detect sharing from other monitors.1849 % To still prevent expensive locking on access, a condition variable is tied to a \emph{group} of monitors on first use, called \newterm{branding}, resulting in a low-cost boolean test to detect sharing from other monitors. 1846 1850 1847 1851 % Signalling semantics cannot have the signaller and signalled thread in the monitor simultaneously, which means: … … 1852 1856 % The signalling thread continues and the signalled thread is marked for urgent unblocking at the next scheduling point (exit/wait). 1853 1857 % \item 1854 % The signalling thread blocks but is marked for urg rent unblocking at the next scheduling point and the signalled thread continues.1858 % The signalling thread blocks but is marked for urgent unblocking at the next scheduling point and the signalled thread continues. 1855 1859 % \end{enumerate} 1856 1860 % The first approach is too restrictive, as it precludes solving a reasonable class of problems, \eg dating service (see Figure~\ref{f:DatingService}). … … 1961 1965 External scheduling is controlled by the @waitfor@ statement, which atomically blocks the calling thread, releases the monitor lock, and restricts the function calls that can next acquire mutual exclusion. 
1962 1966 If the buffer is full, only calls to @remove@ can acquire the buffer, and if the buffer is empty, only calls to @insert@ can acquire the buffer. 1963 Calls threads to functions that are currently excludedblock outside of (external to) the monitor on the calling queue, versus blocking on condition queues inside of (internal to) the monitor.1967 Threads calling excluded functions block outside of (external to) the monitor on the calling queue, versus blocking on condition queues inside of (internal to) the monitor. 1964 1968 Figure~\ref{f:RWExt} shows a readers/writer lock written using external scheduling, where a waiting reader detects a writer using the resource and restricts further calls until the writer exits by calling @EndWrite@. 1965 1969 The writer does a similar action for each reader or writer using the resource. … … 2076 2080 For @wait( e )@, the default semantics is to atomically block the signaller and release all acquired mutex parameters, \ie @wait( e, m1, m2 )@. 2077 2081 To override the implicit multi-monitor wait, specific mutex parameter(s) can be specified, \eg @wait( e, m1 )@. 2078 Wait statically verifies the released monitors are the acquired mutex-parameters so unconditional release is safe.2082 Wait cannot statically verifies the released monitors are the acquired mutex-parameters without disallowing separately compiled helper functions calling @wait@. 2079 2083 While \CC supports bulk locking, @wait@ only accepts a single lock for a condition variable, so bulk locking with condition variables is asymmetric. 2080 2084 Finally, a signaller, … … 2088 2092 Similarly, for @waitfor( rtn )@, the default semantics is to atomically block the acceptor and release all acquired mutex parameters, \ie @waitfor( rtn, m1, m2 )@. 2089 2093 To override the implicit multi-monitor wait, specific mutex parameter(s) can be specified, \eg @waitfor( rtn, m1 )@. 
2090 @waitfor@ statically verifies the released monitors are the same as the acquired mutex-parameters of the given function or function pointer. 2091 To statically verify the released monitors match with the accepted function's mutex parameters, the function (pointer) prototype must be accessible. 2094 @waitfor@ does statically verify the monitor types passed are the same as the acquired mutex-parameters of the given function or function pointer, hence the function (pointer) prototype must be accessible. 2092 2095 % When an overloaded function appears in an @waitfor@ statement, calls to any function with that name are accepted. 2093 2096 % The rationale is that members with the same name should perform a similar function, and therefore, all should be eligible to accept a call. … … 2148 2151 The right example accepts either @mem1@ or @mem2@ if @C1@ and @C2@ are true. 2149 2152 2150 An interesting use of @waitfor@ is accepting the @mutex@ destructor to know when an object is deallocated. 2151 \begin{cfa} 2152 void insert( Buffer(T) & mutex buffer, T elem ) with( buffer ) { 2153 if ( count == 10 ) 2154 waitfor( remove, buffer ) { 2155 // insert elem into buffer 2156 } or `waitfor( ^?{}, buffer )` throw insertFail; 2157 } 2158 \end{cfa} 2159 When the buffer is deallocated, the current waiter is unblocked and informed, so it can perform an appropriate action. 2160 However, the basic @waitfor@ semantics do not support this functionality, since using an object after its destructor is called is undefined. 2161 Therefore, to make this useful capability work, the semantics for accepting the destructor is the same as @signal@, \ie the call to the destructor is placed on the urgent queue and the acceptor continues execution, which throws an exception to the acceptor and then the caller is unblocked from the urgent queue to deallocate the object. 2162 Accepting the destructor is the idiomatic way to terminate a thread in \CFA. 
2153 An interesting use of @waitfor@ is accepting the @mutex@ destructor to know when an object is deallocated, \eg assume the bounded buffer is restructured from a monitor to a thread with the following @main@. 2154 \begin{cfa} 2155 void main( Buffer(T) & buffer ) with(buffer) { 2156 for () { 2157 `waitfor( ^?{}, buffer )` break; 2158 or when ( count != 20 ) waitfor( insert, buffer ) { ... } 2159 or when ( count != 0 ) waitfor( remove, buffer ) { ... } 2160 } 2161 // clean up 2162 } 2163 \end{cfa} 2164 When the program main deallocates the buffer, it first calls the buffer's destructor, which is accepted, the destructor runs, and the buffer is deallocated. 2165 However, the buffer thread cannot continue after the destructor call because the object is gone; 2166 hence, clean up in @main@ cannot occur, which means destructors for local objects are not run. 2167 To make this useful capability work, the semantics for accepting the destructor is the same as @signal@, \ie the destructor call is placed on urgent and the acceptor continues execution, which ends the loop, cleans up, and the thread terminates. 2168 Then, the destructor caller unblocks from urgent to deallocate the object. 2169 Accepting the destructor is the idiomatic way in \CFA to terminate a thread performing direct communication. 
2163 2170 2164 2171 … … 2357 2364 2358 2365 struct Msg { int i, j; }; 2359 thread Go rtn { int i; float f; Msg m; };2360 void mem1( Go rtn & mutex gortn, int i ) { gortn.i = i; }2361 void mem2( Go rtn & mutex gortn, float f ) { gortn.f = f; }2362 void mem3( Go rtn & mutex gortn, Msg m ) { gortn.m = m; }2363 void ^?{}( Go rtn & mutex ) {}2364 2365 void main( Go rtn & gortn ) with( gortn ) { // thread starts2366 thread GoRtn { int i; float f; Msg m; }; 2367 void mem1( GoRtn & mutex gortn, int i ) { gortn.i = i; } 2368 void mem2( GoRtn & mutex gortn, float f ) { gortn.f = f; } 2369 void mem3( GoRtn & mutex gortn, Msg m ) { gortn.m = m; } 2370 void ^?{}( GoRtn & mutex ) {} 2371 2372 void main( GoRtn & gortn ) with( gortn ) { // thread starts 2366 2373 2367 2374 for () { … … 2376 2383 } 2377 2384 int main() { 2378 Go rtn gortn; $\C[2.0in]{// start thread}$2385 GoRtn gortn; $\C[2.0in]{// start thread}$ 2379 2386 `mem1( gortn, 0 );` $\C{// different calls}\CRT$ 2380 2387 `mem2( gortn, 2.5 );` … … 2534 2541 % However, preemption is necessary for fairness and to reduce tail-latency. 2535 2542 % For concurrency that relies on spinning, if all cores spin the system is livelocked, whereas preemption breaks the livelock. 2536 % 2537 % 2538 % \subsection{Thread Pools} 2539 % 2540 % In contrast to direct threading is indirect \newterm{thread pools}, \eg Java @executor@, where small jobs (work units) are inserted into a work pool for execution. 2541 % If the jobs are dependent, \ie interact, there is an implicit/explicit dependency graph that ties them together. 2542 % While removing direct concurrency, and hence the amount of context switching, thread pools significantly limit the interaction that can occur among jobs. 2543 % Indeed, jobs should not block because that also blocks the underlying thread, which effectively means the CPU utilization, and therefore throughput, suffers. 
2544 % While it is possible to tune the thread pool with sufficient threads, it becomes difficult to obtain high throughput and good core utilization as job interaction increases. 2545 % As well, concurrency errors return, which threads pools are suppose to mitigate. 2543 2544 2545 \begin{comment} 2546 \subsection{Thread Pools} 2547 2548 In contrast to direct threading is indirect \newterm{thread pools}, \eg Java @executor@, where small jobs (work units) are inserted into a work pool for execution. 2549 If the jobs are dependent, \ie interact, there is an implicit/explicit dependency graph that ties them together. 2550 While removing direct concurrency, and hence the amount of context switching, thread pools significantly limit the interaction that can occur among jobs. 2551 Indeed, jobs should not block because that also blocks the underlying thread, which effectively means the CPU utilization, and therefore throughput, suffers. 2552 While it is possible to tune the thread pool with sufficient threads, it becomes difficult to obtain high throughput and good core utilization as job interaction increases. 2553 As well, concurrency errors return, which threads pools are suppose to mitigate. 
2554 2555 \begin{figure} 2556 \centering 2557 \begin{tabular}{@{}l|l@{}} 2558 \begin{cfa} 2559 struct Adder { 2560 int * row, cols; 2561 }; 2562 int operator()() { 2563 subtotal = 0; 2564 for ( int c = 0; c < cols; c += 1 ) 2565 subtotal += row[c]; 2566 return subtotal; 2567 } 2568 void ?{}( Adder * adder, int row[$\,$], int cols, int & subtotal ) { 2569 adder.[rows, cols, subtotal] = [rows, cols, subtotal]; 2570 } 2571 2572 2573 2574 2575 \end{cfa} 2576 & 2577 \begin{cfa} 2578 int main() { 2579 const int rows = 10, cols = 10; 2580 int matrix[rows][cols], subtotals[rows], total = 0; 2581 // read matrix 2582 Executor executor( 4 ); // kernel threads 2583 Adder * adders[rows]; 2584 for ( r; rows ) { // send off work for executor 2585 adders[r] = new( matrix[r], cols, &subtotal[r] ); 2586 executor.send( *adders[r] ); 2587 } 2588 for ( r; rows ) { // wait for results 2589 delete( adders[r] ); 2590 total += subtotals[r]; 2591 } 2592 sout | total; 2593 } 2594 \end{cfa} 2595 \end{tabular} 2596 \caption{Executor} 2597 \end{figure} 2598 \end{comment} 2546 2599 2547 2600 … … 2567 2620 The purpose of a cluster is to control the amount of parallelism that is possible among threads, plus scheduling and other execution defaults. 2568 2621 The default cluster-scheduler is single-queue multi-server, which provides automatic load-balancing of threads on processors. 2569 However, the scheduler is pluggable, supporting alternative schedulers, such as multi-queue multi-server,with work-stealing/sharing across the virtual processors.2622 However, the design allows changing the scheduler, \eg multi-queue multi-server with work-stealing/sharing across the virtual processors. 2570 2623 If several clusters exist, both threads and virtual processors, can be explicitly migrated from one cluster to another. 2571 2624 No automatic load balancing among clusters is performed by \CFA. … … 2574 2627 The user cluster is created to contain the application user-threads. 
2575 2628 Having all threads execute on the one cluster often maximizes utilization of processors, which minimizes runtime. 2576 However, because of limitations of the underlying operating system, heterogeneous hardware, or scheduling requirements (real-time), multiple clusters are sometimes necessary.2629 However, because of limitations of scheduling requirements (real-time), NUMA architecture, heterogeneous hardware, or issues with the underlying operating system, multiple clusters are sometimes necessary. 2577 2630 2578 2631 … … 2618 2671 \subsection{Preemption} 2619 2672 2620 Nondeterministic preemption provides fairness from long running threads, and forces concurrent programmers to write more robust programs, rather than relying on section of code between cooperative scheduling to be atomic. 2621 A separate reason for not supporting preemption is that it significantly complicates the runtime system. 2673 Nondeterministic preemption provides fairness from long running threads, and forces concurrent programmers to write more robust programs, rather than relying on code between cooperative scheduling to be atomic. 2674 This atomic reliance can fail on multi-core machines, because execution across cores is nondeterministic. 2675 A different reason for not supporting preemption is that it significantly complicates the runtime system, \eg Microsoft runtime does not support interrupts and on Linux systems, interrupts are complex (see below). 2622 2676 Preemption is normally handled by setting a count-down timer on each virtual processor. 2623 2677 When the timer expires, an interrupt is delivered, and the interrupt handler resets the count-down timer, and if the virtual processor is executing in user code, the signal handler performs a user-level context-switch, or if executing in the language runtime-kernel, the preemption is ignored or rolled forward to the point where the runtime kernel context switches back to user code. 
… … 2628 2682 Because preemption frequency is usually long (1 millisecond) performance cost is negligible. 2629 2683 2630 However, on current Linux systems: 2684 Linux switched a decade ago from specific to arbitrary process signal-delivery for applications with multiple kernel threads. 2631 2685 \begin{cquote} 2632 2686 A process-directed signal may be delivered to any one of the threads that does not currently have the signal blocked. … … 2634 2688 SIGNAL(7) - Linux Programmer's Manual 2635 2689 \end{cquote} 2636 Hence, the timer-expiry signal, which is generated \emph{externally} by the Linux kernel to the Linux process, is delivered to any of its Linux subprocesses (kernel threads).2637 To ensure each virtual processor receives its own preemption signals, a discrete-event simulation is run on a special virtual processor, and only it sets and receives timer events.2690 Hence, the timer-expiry signal, which is generated \emph{externally} by the Linux kernel to an application, is delivered to any of its Linux subprocesses (kernel threads). 2691 To ensure each virtual processor receives a preemption signal, a discrete-event simulation is run on a special virtual processor, and only it sets and receives timer events. 2638 2692 Virtual processors register an expiration time with the discrete-event simulator, which is inserted in sorted order. 2639 2693 The simulation sets the count-down timer to the value at the head of the event list, and when the timer expires, all events less than or equal to the current time are processed. … … 2652 2706 2653 2707 To verify the implementation of the \CFA runtime, a series of microbenchmarks are performed comparing \CFA with Java OpenJDK-9, Go 1.9.2 and \uC 7.0.0. 2654 The benchmark computer is an AMD Opteron\texttrademark\ 6380 NUMA 64-core, 8 socket, 2.5 GHz processor, running Ubuntu 16.04. 
3 LTS and \uC and \CFA are compiled with gcc 6.3.2708 The benchmark computer is an AMD Opteron\texttrademark\ 6380 NUMA 64-core, 8 socket, 2.5 GHz processor, running Ubuntu 16.04.6 LTS, and \uC/\CFA are compiled with gcc 6.5. 2655 2709 2656 2710 \begin{comment} … … 2707 2761 \lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}} 2708 2762 \begin{cfa} 2709 threadMyThread {};2710 void main( MyThread & ) {}2763 @thread@ MyThread {}; 2764 void @main@( MyThread & ) {} 2711 2765 int main() { 2712 2766 BENCH( for ( N ) { @MyThread m;@ } ) … … 2750 2804 \lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}} 2751 2805 \begin{cfa}[aboveskip=0pt,belowskip=0pt] 2752 coroutineC {} c;2806 @coroutine@ C {} c; 2753 2807 void main( C & ) { for ( ;; ) { @suspend;@ } } 2754 2808 int main() { // coroutine test … … 2771 2825 \begin{tabular}{@{}r*{3}{D{.}{.}{3.2}}@{}} 2772 2826 \multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\ 2773 Kernel Thread & 333.5 & 332.96 & 4.1 \\ 2827 C function & 2 & 2 & 0 \\ 2828 \CFA generator & 2 & 2 & 0 \\ 2774 2829 \CFA Coroutine & 49 & 48.68 & 0.47 \\ 2775 2830 \CFA Thread & 105 & 105.57 & 1.37 \\ … … 2777 2832 \uC Thread & 100 & 99.29 & 0.96 \\ 2778 2833 Goroutine & 145 & 147.25 & 4.15 \\ 2779 Java Thread & 373.5 & 375.14 & 8.72 2834 Java Thread & 373.5 & 375.14 & 8.72 \\ 2835 Pthreads Thread & 333.5 & 332.96 & 4.1 2780 2836 \end{tabular} 2781 2837 \end{multicols} … … 2793 2849 \lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}} 2794 2850 \begin{cfa} 2795 monitorM {} m1/*, m2, m3, m4*/;2851 @monitor@ M {} m1/*, m2, m3, m4*/; 2796 2852 void __attribute__((noinline)) 2797 do_call( M & mutex m/*, m2, m3, m4*/) {}2853 do_call( M & @mutex m/*, m2, m3, m4*/@ ) {} 2798 2854 int main() { 2799 2855 BENCH( 2800 for( N ) @do_call( m1/*, m2, m3, m4*/ );@2856 for( N ) do_call( m1/*, m2, m3, m4*/ ); 2801 2857 
) 2802 2858 sout | result`ns; … … 2813 2869 \begin{tabular}{@{}r*{3}{D{.}{.}{3.2}}@{}} 2814 2870 \multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\ 2815 C function & 2 & 2 & 0 \\ 2816 FetchAdd + FetchSub & 26 & 26 & 0 \\ 2871 test and test-and-test lock & 26 & 26 & 0 \\ 2817 2872 Pthreads Mutex Lock & 31 & 31.71 & 0.97 \\ 2818 2873 \uC @monitor@ member rtn. & 31 & 31 & 0 \\ … … 2820 2875 \CFA @mutex@ function, 2 arg. & 84 & 85.36 & 1.99 \\ 2821 2876 \CFA @mutex@ function, 4 arg. & 158 & 161 & 4.22 \\ 2822 Java synchronized function& 27.5 & 29.79 & 2.932877 Java synchronized method & 27.5 & 29.79 & 2.93 2823 2878 \end{tabular} 2824 2879 \end{multicols} … … 2836 2891 \begin{cfa} 2837 2892 volatile int go = 0; 2838 monitor M { condition c;} m;2893 @monitor@ M { @condition c;@ } m; 2839 2894 void __attribute__((noinline)) 2840 do_call( M & mutexa1 ) { @signal( c );@ }2895 do_call( M & @mutex@ a1 ) { @signal( c );@ } 2841 2896 thread T {}; 2842 2897 void main( T & this ) { … … 2869 2924 \multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} & \multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\ 2870 2925 Pthreads Cond. 
Variable & 6005 & 5681.43 & 835.45 \\ 2871 \uC @signal@ & 324 & 325.54 & 3 ,02 \\2926 \uC @signal@ & 324 & 325.54 & 3.02 \\ 2872 2927 \CFA @signal@, 1 @monitor@ & 368.5 & 370.61 & 4.77 \\ 2873 2928 \CFA @signal@, 2 @monitor@ & 467 & 470.5 & 6.79 \\ … … 2889 2944 \begin{cfa} 2890 2945 volatile int go = 0; 2891 monitorM {} m;2946 @monitor@ M {} m; 2892 2947 thread T {}; 2893 2948 void __attribute__((noinline)) 2894 do_call( M & mutex) {}2949 do_call( M & @mutex@ ) {} 2895 2950 void main( T & ) { 2896 2951 while ( go == 0 ) { yield(); } 2897 while ( go == 1 ) { @do_call( m );@}2952 while ( go == 1 ) { do_call( m ); } 2898 2953 } 2899 2954 int __attribute__((noinline)) 2900 do_wait( M & mutexm ) {2955 do_wait( M & @mutex@ m ) { 2901 2956 go = 1; // continue other thread 2902 2957 BENCH( for ( N ) { @waitfor( do_call, m );@ } ) -
doc/papers/concurrency/annex/local.bib
rf2f22e3 r8f079f0 66 66 } 67 67 68 @ article{BankTransfer,68 @misc{BankTransfer, 69 69 key = {Bank Transfer}, 70 70 keywords = {Bank Transfer}, 71 71 title = {Bank Account Transfer Problem}, 72 publisher = {Wiki Wiki Web}, 73 address = {http://wiki.c2.com}, 72 howpublished = {Wiki Wiki Web, \url{http://wiki.c2.com/?BankAccountTransferProblem}}, 74 73 year = 2010 75 74 }
Note: See TracChangeset
for help on using the changeset viewer.