Changeset 210c737
- Timestamp:
- Aug 1, 2023, 10:26:10 PM (21 months ago)
- Branches:
- master
- Children:
- 2cb15b0, d5f5eb7
- Parents:
- 4852232 (diff), afb3d68 (diff)
Note: this is a merge changeset, the changes displayed below correspond to the merge itself.
Use the(diff)
links above to see all the changes relative to each parent. - Files:
-
- 11 added
- 8 edited
Legend:
- Unmodified
- Added
- Removed
-
TabularUnified doc/theses/colby_parsons_MMAth/local.bib ¶
r4852232 r210c737 108 108 } 109 109 110 @misc{java:allof:anyof, 111 author = "Java Util Concurrent", 112 title = "Class CompletableFuture", 113 howpublished = {\url{https://docs.oracle.com/javase/8/docs/api/java/util/concurrent/CompletableFuture.html}}, 114 note = "[Online; accessed 23-May-2023]" 115 } 116 110 117 @misc{ocaml:channel, 111 118 author = "The OCaml Manual", … … 202 209 version = {Version 080}, 203 210 organization= {Intel}, 204 month = mar ch,211 month = mar, 205 212 year = 2023, 206 213 } -
TabularUnified doc/theses/colby_parsons_MMAth/text/actors.tex ¶
r4852232 r210c737 1365 1365 \end{figure} 1366 1366 1367 \begin{figure} 1368 \centering 1369 \subfloat[AMD \CFA Matrix Benchmark]{ 1370 \resizebox{0.5\textwidth}{!}{\input{figures/nasusCFAMatrix.pgf}} 1371 \label{f:cfaMatrixAMD} 1372 } 1373 \subfloat[Intel \CFA Matrix Benchmark]{ 1374 \resizebox{0.5\textwidth}{!}{\input{figures/pykeCFAMatrix.pgf}} 1375 \label{f:cfaMatrixIntel} 1376 } 1377 \caption{The matrix benchmark comparing \CFA stealing heuristics (lower is better).} 1378 \end{figure} 1379 1367 1380 Figures~\ref{f:cfaRepeatAMD} and~\ref{f:cfaRepeatIntel} show the effects of the stealing heuristics for the repeat benchmark. 1368 1381 This benchmark is a pathological case for work stealing actor systems, as the majority of work is being performed by the single actor conducting the scatter/gather. … … 1389 1402 Hence, it is difficult to attribute the AMD gain to the aggressive work stealing in CAF. 1390 1403 1391 \begin{figure}1392 \centering1393 \subfloat[AMD \CFA Matrix Benchmark]{1394 \resizebox{0.5\textwidth}{!}{\input{figures/nasusCFAMatrix.pgf}}1395 \label{f:cfaMatrixAMD}1396 }1397 \subfloat[Intel \CFA Matrix Benchmark]{1398 \resizebox{0.5\textwidth}{!}{\input{figures/pykeCFAMatrix.pgf}}1399 \label{f:cfaMatrixIntel}1400 }1401 \caption{The matrix benchmark comparing \CFA stealing heuristics (lower is better).}1402 \end{figure}1403 1404 1404 % Local Variables: % 1405 1405 % tab-width: 4 % -
TabularUnified doc/theses/colby_parsons_MMAth/text/channels.tex ¶
r4852232 r210c737 196 196 197 197 \section{\CFA / Go channel Examples} 198 To highlight the differences between \CFA's and Go's close semantics, t hreeexamples are presented.198 To highlight the differences between \CFA's and Go's close semantics, two examples are presented. 199 199 The first example is a simple shutdown case, where there are producer threads and consumer threads operating on a channel for a fixed duration. 200 200 Once the duration ends, producers and consumers terminate immediately leaving unprocessed elements in the channel. -
TabularUnified doc/theses/colby_parsons_MMAth/text/frontpgs.tex ¶
r4852232 r210c737 80 80 Concurrent programs are notoriously hard to program and even harder to debug. Furthermore concurrent programs must be performant, as the introduction of concurrency into a program is often done to achieve some form of speedup. 81 81 82 This thesis presents a suite of high-level concurrent-language features in the new programming -language \CFA, all of which are implemented with the aim of improving the performance, productivity, and safety of concurrent programs.82 This thesis presents a suite of high-level concurrent-language features in the new programming language \CFA, all of which are implemented with the aim of improving the performance, productivity, and safety of concurrent programs. 83 83 \CFA is a non-object-oriented programming language that extends C. 84 84 The foundation for concurrency in \CFA was laid by Thierry Delisle~\cite{Delisle18}, who implemented coroutines, user-level threads, and monitors. -
TabularUnified doc/theses/colby_parsons_MMAth/text/mutex_stmt.tex ¶
r4852232 r210c737 350 350 351 351 The benchmark used to evaluate the avoidance algorithms repeatedly acquires a fixed number of locks in a random order and then releases them. 352 The pseudocode for the deadlock avoidance benchmark is shown in \VRef[ Listing]{l:deadlock_avoid_pseudo}.352 The pseudocode for the deadlock avoidance benchmark is shown in \VRef[Figure]{l:deadlock_avoid_pseudo}. 353 353 To ensure the comparison exercises the implementation of each lock avoidance algorithm, an identical spinlock is implemented in each language using a set of builtin atomics available in both \CC and \CFA. 354 354 The benchmarks are run for a fixed duration of 10 seconds and then terminate. … … 357 357 The median is calculated and is plotted alongside the 95\% confidence intervals for each point. 358 358 359 \begin{ cfa}[caption={Deadlock avoidance benchmark pseudocode},label={l:deadlock_avoid_pseudo}]360 359 \begin{figure} 360 \begin{cfa} 361 361 size_t n_locks; $\C{// number of locks}$ 362 362 size_t n_thds; $\C{// number of threads}$ … … 387 387 printf( "%lu\n", total ); 388 388 } 389 390 \end{cfa} 389 \end{cfa} 390 \caption{Deadlock avoidance benchmark pseudocode} 391 \label{l:deadlock_avoid_pseudo} 392 \end{figure} 391 393 392 394 The performance experiments were run on the following multi-core hardware systems to determine differences across platforms: -
TabularUnified doc/theses/colby_parsons_MMAth/text/waituntil.tex ¶
r4852232 r210c737 123 123 124 124 \begin{figure} 125 \begin{lstlisting}[language=ada,literate= ]126 task type buffer is -- thread 125 \begin{lstlisting}[language=ada,literate=,{moredelim={**[is][\color{red}]{@}{@}}}] 126 task type buffer is -- thread type 127 127 ... -- buffer declarations 128 128 count : integer := 0; … … 130 130 loop 131 131 select 132 when count < Size=> -- guard133 accept insert( elem : in ElemType )do -- method132 @when count < Size@ => -- guard 133 @accept insert( elem : in ElemType )@ do -- method 134 134 ... -- add to buffer 135 135 count := count + 1; … … 137 137 -- executed if this accept called 138 138 or 139 when count > 0=> -- guard140 accept remove( elem : out ElemType )do -- method139 @when count > 0@ => -- guard 140 @accept remove( elem : out ElemType )@ do -- method 141 141 ... --remove and return from buffer via parameter 142 142 count := count - 1; 143 143 end; 144 144 -- executed if this accept called 145 or delay 10.0; -- unblock after 10 seconds without call146 or else-- do not block, cannot appear with delay145 or @delay 10.0@; -- unblock after 10 seconds without call 146 or @else@ -- do not block, cannot appear with delay 147 147 end select; 148 148 end loop; … … 222 222 223 223 \begin{lrbox}{\myboxB} 224 \begin{lstlisting}[language=uC++ =]224 \begin{lstlisting}[language=uC++,{moredelim={**[is][\color{red}]{@}{@}}}] 225 225 _Task BoundedBuffer { 226 226 int * buffer; … … 243 243 for ( ;; ) { 244 244 _Accept( ~buffer ) break; 245 or _When ( count < Size ) _Accept( insert );246 or _When ( count > 0 ) _Accept( remove );245 @or _When ( count < Size ) _Accept( insert )@; 246 @or _When ( count > 0 ) _Accept( remove )@; 247 247 } 248 248 } … … 267 267 Figure~\ref{l:BB_uC++} shows the outline of a bounded buffer administrator implemented with \uC @_Accept@ statements. 268 268 269 There are other languages that provide \gls{synch_multiplex}, including Rust's @select!@ over futures~\cite{rust:select}, OCaml's @select@ over channels~\cite{ocaml:channel}, and C++14's @when_any@ over futures~\cite{cpp:whenany}.269 There are other languages that provide \gls{synch_multiplex}, including Rust's @select!@ over futures~\cite{rust:select}, Java's @allof@/@anyof@ over futures~\cite{java:allof:anyof}, OCaml's @select@ over channels~\cite{ocaml:channel}, and C++14's @when_any@ over futures~\cite{cpp:whenany}. 270 270 Note that while C++14 and Rust provide \gls{synch_multiplex}, the implementations leave much to be desired as both rely on polling to wait on multiple resources. 271 271 … … 276 276 Similar, actor systems circumvent the \gls{synch_multiplex} problem as actors only block when waiting for the next message never in a behaviour. 277 277 While these approaches solve the \gls{synch_multiplex} problem, they introduce other issues. 278 Consider the case where a thread has a single source of communication and it wants a set of @N@resources.279 It must sequentially request the @N@resources and wait for each response.280 During the receives for the @N@resources, it can receive other communication, and has to save and postpone these communications, or discard them.278 Consider the case where a thread has a single source of communication and it wants a set of $N$ resources. 279 It must sequentially request the $N$ resources and wait for each response. 280 During the receives for the $N$ resources, it can receive other communication, and has to save and postpone these communications, or discard them. 281 281 % If the requests for the other resources need to be retracted, the burden falls on the programmer to determine how to synchronize appropriately to ensure that only one resource is delivered. 282 282 … … 320 320 int i = 0; 321 321 322 waituntil( Lock) { ... }323 or when( i == 0 ) waituntil( i << Channel) { ... }324 and waituntil( Future) { ... }325 or waituntil( timeout( 1`s )) { ... }322 waituntil( @Lock@ ) { ... } 323 or when( i == 0 ) waituntil( i << @Channel@ ) { ... } 324 and waituntil( @Future@ ) { ... } 325 or waituntil( @timeout( 1`s )@ ) { ... } 326 326 // else { ... } 327 327 \end{cfa} … … 342 342 \begin{cfa} 343 343 future(int) bar, foo; 344 waituntil( foo ) { ... } or waituntil( bar ) { ... } 345 \end{cfa} 346 The reason for this semantics is that prioritizing resources can be useful in certain problems .347 In the rare case where there is a starvation problem with the ordering, it possible to follow a @waituntil@ with its reverse form :344 waituntil( foo ) { ... } or waituntil( bar ) { ... } // prioritize foo 345 \end{cfa} 346 The reason for this semantics is that prioritizing resources can be useful in certain problems, such as shutdown. 347 In the rare case where there is a starvation problem with the ordering, it possible to follow a @waituntil@ with its reverse form, alternating which resource has the highest priority: 348 348 \begin{cfa} 349 349 waituntil( foo ) { ... } or waituntil( bar ) { ... } // prioritize foo 350 350 waituntil( bar ) { ... } or waituntil( foo ) { ... } // prioritize bar 351 351 \end{cfa} 352 While this approach is not general for many resources, it handles many basic cases. 352 353 353 354 The \CFA @and@ semantics match the @and@ semantics of \uC \lstinline[language=uC++]{_Select}. 354 355 When multiple clauses are joined by @and@, the @waituntil@ makes a thread wait for all to be available, but still runs the corresponding code blocks \emph{as they become available}. 355 When an @and@ clause becomes available, the waiting thread unblocks and runs that clause's code-block, and then the thread waits again for the next available clause or the @waituntil@ statement is now true.356 When an @and@ clause becomes available, the waiting thread unblocks and runs that clause's code-block, and then the thread waits again for the next available clause or the @waituntil@ statement is now satisfied. 356 357 This semantics allows work to be done in parallel while synchronizing over a set of resources, and furthermore, gives a good reason to use the @and@ operator. 357 358 If the @and@ operator waited for all clauses to be available before running, it is the same as just acquiring those resources consecutively by a sequence of @waituntil@ statements. … … 415 416 \section{\lstinline{waituntil} Implementation} 416 417 417 The @waituntil@ statement is not inherently complex, and Figure~\ref{f:WU_Impl} onlyshows the basic outline of the @waituntil@ algorithm.418 The complexity comes from the consideration of race conditions and synchronization needed when supporting various primitives.419 The following sections then use examples to fill indetails missing in Figure~\ref{f:WU_Impl}.420 The full pseudocode for the @waituntil@ algorithm is presented in Figure~\ref{f:WU_Full_Impl}.418 The @waituntil@ statement is not inherently complex, and Figure~\ref{f:WU_Impl} shows the basic outline of the @waituntil@ algorithm. 419 Complexity comes from the consideration of race conditions and synchronization needed when supporting various primitives. 420 The following sections use examples to fill in complexity details missing in Figure~\ref{f:WU_Impl}. 421 After which, the full pseudocode for the @waituntil@ algorithm is presented in Figure~\ref{f:WU_Full_Impl}. 421 422 422 423 \begin{figure} … … 477 478 When a @select_node@ reaches the front of the lock's queue and gains ownership, the thread blocked on the @waituntil@ is unblocked. 478 479 Now, the lock is held by the @waituntil@ thread until the code block is executed, and then the node is unregistered, during which the lock is released. 479 Immediately releasing the lock prevents the waiting thread from holding multiple locks and potentially introducing a deadlock.480 Immediately releasing the lock after the code block prevents the waiting thread from holding multiple locks and potentially introducing a deadlock. 480 481 As such, the only unregistered nodes associated with locks are the ones that have not run. 481 482 482 483 \subsection{Timeouts} 483 484 484 A timeout for the @waituntil@ statement is a duration passed to \lstinline[deletekeywords={timeout}]{timeout}, \eg:485 A timeout for the @waituntil@ statement is a duration passed to routine \lstinline[deletekeywords={timeout}]{timeout}\footnote{\lstinline{timeout} is a quasi-keyword in \CFA, allowing it to be used an identifier.}, \eg: 485 486 \begin{cquote} 486 487 \begin{tabular}{@{}l|l@{}} … … 508 509 Here, the multiple timeouts are useful because the durations can change during execution and the separate clauses provide different code blocks if a timeout triggers. 509 510 Multiple timeouts can also be used with @and@ to provide a minimal delay before proceeding. 510 In following example, either channel @C1@ or @C2@ must be satisfied but nothing can be done for at least 1 or 3 seconds after the channel read, resp ctively.511 In following example, either channel @C1@ or @C2@ must be satisfied but nothing can be done for at least 1 or 3 seconds after the channel read, respectively. 511 512 \begin{cfa}[deletekeywords={timeout}] 512 513 waituntil( i << C1 ); and waituntil( timeout( 1`s ) ); 513 514 or waituntil( i << C2 ); and waituntil( timeout( 3`s ) ); 514 515 \end{cfa} 515 If only @C2@ is satisfied, \emph{both} timeout code-blocks trigger because 1 second oc urs before 3 seconds.516 If only @C2@ is satisfied, \emph{both} timeout code-blocks trigger because 1 second occurs before 3 seconds. 516 517 Note, the \CFA @waitfor@ statement only provides a single @timeout@ clause because it only supports @or@ semantics. 517 518 … … 523 524 524 525 Channels require more complexity to allow synchronous multiplexing. 525 For locks, when an outside thread releases a lock and unblocks the waituntilthread (WUT), the lock's MX property is passed to the WUT (no spinning locks).526 For locks, when an outside thread releases a lock and unblocks the @waituntil@ thread (WUT), the lock's MX property is passed to the WUT (no spinning locks). 526 527 For futures, the outside thread deliveries a value to the future and unblocks any waiting threads, including WUTs. 527 In either case, after the WUT unblocks it is safe to execute its thecorresponding code block knowing access to the resource is protected by the lock or the read-only state of the future.528 In either case, after the WUT unblocks, it is safe to execute its corresponding code block knowing access to the resource is protected by the lock or the read-only state of the future. 528 529 Similarly, for channels, when an outside thread inserts a value into a channel, it must unblock the WUT. 529 530 However, for channels, there is a race condition that poses an issue. … … 536 537 Furthermore, if both @and@ and @or@ operators are used, the @or@ operations stop behaving like exclusive-or due to the race among channel operations, \eg: 537 538 \begin{cfa} 539 int i; 538 540 waituntil( i << A ) {} and waituntil( i << B ) {} 539 541 or waituntil( i << C ) {} and waituntil( i << D ) {} 540 542 \end{cfa} 541 543 If exclusive-or semantics are followed, only the code blocks for @A@ and @B@ are run, or the code blocks for @C@ and @D@. 542 However, four outside threads can simultaneously put values into @i@ and attempt to unblock the WUT to run the four code-blocks.544 However, four outside threads inserting into each channel can simultaneously put values into @i@ and attempt to unblock the WUT to run the four code-blocks. 543 545 This case introduces a race with complexity that increases with the size of the @waituntil@ statement. 544 546 However, due to TOCTOU issues, it is impossible to know if all resources are available without acquiring all the internal locks of channels in the subtree of the @waituntil@ clauses. 545 547 This approach is a poor solution for two reasons. 546 It is possible that once all the locks are acquired the subtree is not satisfied and the locks must be released.548 It is possible that once all the locks are acquired, the subtree is not satisfied and the locks must be released. 547 549 This work incurs a high cost for signalling threads and heavily increase contention on internal channel locks. 548 550 Furthermore, the @waituntil@ statement is polymorphic and can support resources that do not have internal locks, which also makes this approach infeasible. 549 551 As such, the exclusive-or semantics are lost when using both @and@ and @or@ operators since it cannot be supported without significant complexity and significantly affects @waituntil@ performance. 552 Therefore, the example of reusing variable @i@ by multiple output channels is considered a user error without exclusive-or semantics. 553 Given aliasing in C, it is impossible to even warn of the potential race. 554 In the future, it would be interesting to support Go-like syntax, \lstinline[language=Go]{case i := <- ...}, defining a new scoped @i@ variable for each clause. 550 555 551 556 It was deemed important that exclusive-or semantics are maintained when only @or@ operators are used, so this situation has been special-cased, and is handled by having all clauses race to set a value \emph{before} operating on the channel. … … 553 558 \begin{cquote} 554 559 \begin{tabular}{@{}l|l|l@{}} 555 \multicolumn{3}{@{}l@{}}{\lstinline{channel A, B; // zero size channels}} \\560 \multicolumn{3}{@{}l@{}}{\lstinline{channel(int) A, B; // zero size channels}} \\ 556 561 thread 1 & thread 2 & thread 3 \\ 557 562 \begin{cfa} … … 582 587 Channels introduce another interesting implementation issue. 583 588 Supporting both reading and writing to a channel in a @waituntil@ means that one @waituntil@ clause may be the notifier of another @waituntil@ clause. 584 This poses a problem when dealing with the special-cased @or@ where the clauses need to win a race to operate on a channel.589 This conjunction poses a problem when dealing with the special-cased @or@ where the clauses need to win a race to operate on a channel. 585 590 Consider the following example, alongside a described ordering of events to highlight the race. 586 591 \begin{cquote} 587 592 \begin{tabular}{@{}l|l@{}} 588 \multicolumn{2}{@{}l@{}}{\lstinline{channel A, B; // zero size channels}} \\593 \multicolumn{2}{@{}l@{}}{\lstinline{channel(int) A, B; // zero size channels}} \\ 589 594 thread 1 & thread 2 \\ 590 595 \begin{cfa}[moredelim={**[is][\color{blue}]{\#}{\#}}] … … 599 604 \end{tabular} 600 605 \end{cquote} 601 Assume thread 1 executes first, registers with channel @A@ and proceeds, since it is empty, and then is interrupted before registering with @B@. 602 Thread 2 similarly registers with channel @B@, and proceeds, since it does not have space to insert, and then is interrupted before registering with @A@. 606 Assume thread 1 executes first, registers with channel @A@ and proceeds in the @waituntil@. 607 Since @A@ is empty, thread 1 cannot remove, and then thread 1 is interrupted before registering with @B@. 608 Thread 2 similarly registers with channel @B@, and proceeds in the @waituntil@. 609 Since @B@ is zero size there is no space to insert, and then thread 2 is interrupted before registering with @A@. 603 610 At this point, thread 1 and 2 resume execution. 604 There is now a race that must be dealt with on two fronts. 605 If thread 1 and 2 only race to the \gls{cas}, \ie a clause in their own @waituntil@, thread 1 can think that it successfully removed from @B@, and thread 2 may think it successfully inserted into @A@, when only one of these operations occurs. 611 Remember from above, each exclusive-or @waituntil@ holds a race to set the winning clause of the statement. 612 The issue that arises is that these two @waituntil@ statements must have matching winning clauses (both @A@ clauses or both @B@ clauses) to preserve the exclusive-or semantics, since a zero-sized channel needs an insert/remove pair for an operation to occur. 613 If threads 1 and 2 race to set a winner only in their own @waituntil@, thread 1 can think it successfully removed from @B@, and thread 2 can think it successfully inserted into @A@, which is an error. 614 Hence, there is a race on two fronts. 615 If thread 1 wins the race and sees that @B@ has a waiting insertion, then thread 2 must execute the first clause of its @waituntil@ and thread 1 execute its second. 616 Correspondingly, if thread 2 wins the race and sees that @A@ has a waiting removal, then thread 1 must execute the first clause of its @waituntil@ and thread 2 execute its second. 617 Any other execution scenario is incorrect for exclusive-or semantics. 618 Note, that priority execution of multiple satisfied @waituntil@ causes (\ie top to bottom) is not violated because, in this scenario, there is only one satisfied clause for either thread. 606 619 607 620 The Go @select@ solves this problem by acquiring all the internal locks of the channels before registering the @select@ on the channels. 608 621 This approach eliminates the race shown above since thread 1 and 2 cannot both be registering at the same time. 609 622 However, this approach cannot be used in \CFA, since the @waituntil@ is polymorphic. 610 Not all types in a @waituntil@ have an internal lock, and when using non-channel types acquiring all the locks incurs extra unneeded overhead.623 Not all types in a @waituntil@ have an internal lock, and when using non-channel types, acquiring all the locks incurs extra unneeded overhead. 611 624 Instead, this race is consolidated in \CFA in two phases by having an intermediate pending status value for the race. 612 625 This race case is detectable, and if detected, each thread first races to set its own @waituntil@ race pointer to be pending. … … 626 639 627 640 // Try to set other status, if we succeed break and return true 628 while( ! CAS( other.clause_status, &cmp_status, SAT ) ) {641 while( ! CAS( other.clause_status, &cmp_status, SAT ) ) { 629 642 if ( cmp_status == SAT ) 630 643 return false; // If other status is SAT we lost so return false … … 637 650 638 651 // Attempt to set own status flag back to PENDING to retry 639 if ( ! CAS( mine.clause_status, &cmp_status, PENDING ) )652 if ( ! CAS( mine.clause_status, &cmp_status, PENDING ) ) 640 653 return false; // If we fail then we lost so return false 641 654 … … 653 666 These exception mechanisms are supported through the @on_selected@ routine. 654 667 This routine is needed by channels to detect if they are closed after unblocking in a @waituntil@ statement, to ensure the appropriate behaviour is taken and an exception is thrown. 668 Hence, the channel close-down mechanism is handled correctly. 655 669 656 670 \subsection{Guards and Statement Predicate}\label{s:wu_guards} … … 658 672 It is trivial to check when a synchronous multiplexing utility is done for the or/xor relationship, since any resource becoming available means that the blocked thread can proceed and the @waituntil@ statement is finished. 659 673 In \uC and \CFA, the \gls{synch_multiplex} mechanism have both an and/or relationship, which along with guards, make the problem of checking for completion of the statement difficult. 660 Consider the @waituntil@ in Figure~\ref{f:WU_ComplexPredicate}. 661 When the @waituntil@ thread wakes up, checking if the statement is complete is non-trivial. 662 The predicate that will return if the statement in Figure~\ref{f:WU_ComplexPredicate} is satisfied is the following. 663 \begin{cfa} 664 A && B || C || !GA && B || !GB && A || !GA && !GB && !GC 665 \end{cfa} 666 Which simplifies to: 667 \begin{cfa} 668 ( A || !GA ) && ( B || !GB ) || C || !GA && !GB && !GC 669 \end{cfa} 670 Checking a predicate this large with each iteration is expensive so \uC and \CFA both take steps to simplify checking statement completion. 671 672 \begin{figure} 674 Consider the following @waituntil@. 673 675 \begin{cfa} 674 676 when( GA ) waituntil( A ) {} … … 676 678 or when( GC ) waituntil( C ) {} 677 679 \end{cfa} 678 \caption{\lstinline{waituntil} with a non-trivial predicate} 679 \label{f:WU_ComplexPredicate} 680 \end{figure} 680 When the @waituntil@ thread wakes up, the following predicate represents satisfaction: 681 \begin{cfa} 682 A && B || C || ! GA && B || ! GB && A || ! GA && ! GB && ! GC 683 \end{cfa} 684 which can be simplified to: 685 \begin{cfa} 686 ( A || ! GA ) && ( B || ! GB ) || C || ! GA && ! GB && ! GC 687 \end{cfa} 688 Checking the complete predicate on each iteration of the pending @waituntil@ is expensive so \uC and \CFA both take steps to simplify checking for statement completion. 681 689 682 690 In the \uC @_Select@ statement, this problem is solved by constructing a tree of the resources, where the internal nodes are operators and the leaves are booleans storing the state of each resource. 683 The internal nodes also store the statuses of the two subtrees beneath them. 691 A diagram of the tree for the complete predicate above is shown in Figure~\ref{f:uC_select_tree}, alongside the modification of the tree that occurs when @GA@ is @false@. 692 Each internal node stores the statuses of the two subtrees beneath it. 684 693 When resources become available, their corresponding leaf node status is modified, which percolates up the tree to update the state of the statement. 685 694 Once the root of the tree has both subtrees marked as @true@ then the statement is complete. 686 695 As an optimization, when the internal nodes are updated, the subtrees marked as @true@ are pruned and not examined again. 687 696 To support statement guards in \uC, the tree is modified to remove an internal node if a guard is false to maintain the appropriate predicate representation. 688 An diagram of the tree for the statement in Figure~\ref{f:WU_ComplexPredicate} is shown in Figure~\ref{f:uC_select_tree}, alongside the modification of the tree that occurs when @GA@ is @false@.689 697 690 698 \begin{figure} … … 692 700 \input{diagrams/uCpp_select_tree.tikz} 693 701 \end{center} 694 \caption{\uC selecttree modification}702 \caption{\uC \lstinline[language=uC++]{select} tree modification} 695 703 \label{f:uC_select_tree} 696 704 \end{figure} 697 705 698 The \CFA @waituntil@ statement blocks a thread until a set of resources have become available that satisfy the underlying predicate.699 The waiting condition of the @waituntil@ statement can be represented as apredicate over the resources, joined by the @waituntil@ operators, where a resource is @true@ if it is available, and @false@ otherwise.700 In \CFA, this representation is used as the mechanism to check if a thread is done waiting on the@waituntil@.706 The \CFA @waituntil@ statement blocks a thread until a set of resources have become available that satisfy the complete predicate of a @waituntil@. 707 The waiting condition of the @waituntil@ statement is implemented as the complete predicate over the resources, joined by the @waituntil@ operators, where a resource is @true@ if it is available, and @false@ otherwise. 708 This complete predicate is used as the mechanism to check if a thread is done waiting on a @waituntil@. 701 709 Leveraging the compiler, a predicate routine is generated per @waituntil@ that when passes the statuses of the resources, returns @true@ when the @waituntil@ is done, and false otherwise. 702 710 To support guards on the \CFA @waituntil@ statement, the status of a resource disabled by a guard is set to a boolean value that ensures that the predicate function behaves as if that resource is no longer part of the predicate. 703 711 The generated code allows the predicate that is checked with each iteration to be simplified to not check guard values. 704 For example, the following would be generated for the @waituntil@ shown in Figure~\ref{f:WU_ComplexPredicate}.712 For example, the following is generated for the complete predicate above: 705 713 \begin{cfa} 706 714 // statement completion predicate … … 708 716 return nodes[0].status && nodes[1].status || nodes[2].status; 709 717 } 710 711 // skip statement if all guards false 712 if ( GA || GB || GC ) { 718 if ( GA || GB || GC ) { $\C{// skip statement if all guards false}$ 713 719 select_node nodes[3]; 714 nodes[0].status = !GA && GB; // A's status 715 nodes[1].status = !GB && GA; // B's status 716 nodes[2].status = !GC; // C's status 717 720 nodes[0].status = ! GA && GB; $\C{// A's status}$ 721 nodes[1].status = ! GB && GA; $\C{// B's status}$ 722 nodes[2].status = ! GC; $\C{// C's status}$ 718 723 // ... rest of waituntil codegen ... 719 720 724 } 721 725 \end{cfa} … … 723 727 \uC's @_Select@, supports operators both inside and outside of the \lstinline[language=uC++]{_Select} clauses. 724 728 In the following example, the code blocks run once their corresponding predicate inside the round braces is satisfied. 725 726 729 \begin{lstlisting}[language=uC++,{moredelim=**[is][\color{red}]{@}{@}}] 727 730 Future_ISM<int> A, B, C, D; … … 729 732 and _Select( @D && E@ ) { ... } 730 733 \end{lstlisting} 731 This is more expressive that the @waituntil@ statement in \CFA, allowing the code block for @&&@ to only run after bothresources are available.734 This feature is more expressive than the @waituntil@ statement in \CFA, allowing the code block for @&&@ to only run after \emph{both} resources are available. 732 735 733 736 In \CFA, since the @waituntil@ statement supports more resources than just futures, implementing operators inside clauses is avoided for a few reasons. … … 738 741 or waituntil( C && D ) { ... } 739 742 \end{cfa} 740 If the @waituntil@ acquires each lock as it becomes available, there is a possible deadlock since it is in a hold andwait situation.743 If the @waituntil@ acquires each lock as it becomes available, there is a possible deadlock since it is in a hold-and-wait situation. 741 744 Other semantics are needed to ensure this operation is safe. 742 745 One possibility is to use \CC's @scoped_lock@ approach described in Section~\ref{s:DeadlockAvoidance}; … … 744 747 Another possibility is to use resource ordering similar to \CFA's @mutex@ statement, but that alone is insufficient, if the resource ordering is not used universally. 745 748 One other way this could be implemented is to wait until all resources for a given clause are available before proceeding to acquire them, but this also quickly becomes a poor approach. 746 This approach does not work due to \gls{toctou} issues; 747 it is impossible to ensure that the full set of resources are available without holding them all first. 748 Operators inside clauses in \CFA could potentially be implemented with careful circumvention of the problems involved, but it was not deemed an important feature when taking into account the runtime cost need paid to handle this situation. 749 The problem of operators inside clauses also becomes a difficult issue to handle when supporting channels. 750 It would require some way to ensure channels used with internal operators are modified, if and only if, the corresponding code block is run, but that is not feasible due to reasons described in the exclusive-or portion of Section~\ref{s:wu_chans}. 749 This approach does not work due to \gls{toctou} issues, \ie it is impossible to ensure that the full set of resources are available without holding them all first. 750 Operators inside clauses in \CFA could potentially be implemented with careful circumvention of the problems. 751 Finally, the problem of operators inside clauses is also a difficult to handle when supporting channels. 752 It would require some way to ensure channels used with internal operators are modified, if and only if, the corresponding code block is run. 753 However, that is not feasible due to reasons described in the exclusive-or portion of Section~\ref{s:wu_chans}. 754 Ultimately, I did not deemed this feature to be crucial enough when taking into account the complexity and runtime cost. 751 755 752 756 \subsection{The full \lstinline{waituntil} picture} 753 757 754 Now the details have been discussed, the full pseudocode of the @waituntil@ is presented in Figure~\ref{f:WU_Full_Impl}.758 Given all the complex details handled by the @waituntil@, its full pseudocode is presented in Figure \ref{f:WU_Full_Impl}. 755 759 Some things to note are as follows. 756 760 The @finally@ blocks provide exception-safe \gls{raii} unregistering of nodes, and in particular, the @finally@ inside the innermost loop performs the immediate unregistering required for deadlock-freedom mentioned in Section~\ref{s:wu_locks}. … … 800 804 \end{figure} 801 805 802 \section{ WaituntilPerformance}806 \section{\lstinline{waituntil} Performance} 803 807 804 808 Similar facilities to @waituntil@ are discussed in Section~\ref{s:History} covering C, Ada, Rust, \CC, and OCaml. 805 809 However, these facilities are either not meaningful or feasible to benchmark against. 806 810 The UNIX @select@ and related utilities are not comparable since they are system calls that go into the kernel and operate on file descriptors, whereas the @waituntil@ exists solely in user space. 807 Ada's \lstinline[language=Ada]{select} and \uC's \lstinline[language=uC++]{_Accept} only operate s on method calls, which is done in \CFA via the @waitfor@ statement, so it is not meaningful to benchmark against the @waituntil@, which cannot wait on this resource.811 Ada's \lstinline[language=Ada]{select} and \uC's \lstinline[language=uC++]{_Accept} only operate on method calls, which is done in \CFA via the @waitfor@ statement. 808 812 Rust and \CC only offer a busy-wait approach, which is not comparable to a blocking approach. 809 813 OCaml's @select@ waits on channels that are not comparable with \CFA and Go channels, so OCaml @select@ is not benchmarked against Go's @select@ and \CFA's @waituntil@. … … 817 821 The channel multiplexing benchmarks compare \CFA's @waituntil@ and Go's \lstinline[language=Go]{select}, where the resource being waited on is a set of channels. 818 822 The basic structure of the benchmark has the number of cores split evenly between producer and consumer threads, \ie, with 8 cores there are 4 producer and 4 consumer threads. 819 The number of resource clauses $C$ is also varied across 2, 4, and 8 clauses, where each clause has a different channel that i swaits on.823 The number of resource clauses $C$ is also varied across 2, 4, and 8 clauses, where each clause has a different channel that it waits on. 820 824 Each producer and consumer repeatedly waits to either produce or consume from one of the $C$ clauses and respective channels. 821 825 For example, in \CFA syntax, the work loop in the consumer main with $C = 4$ clauses is: … … 896 900 Go then is holding a lock for every channel, resulting in worse performance as the number of channels increase. 897 901 In \CFA, since races are consolidated without holding all locks, it scales much better both with cores and clauses since more work can occur in parallel. 898 This scalability difference is more significant on the Intel machine than the AMD machine since the Intel machine has lower cachecontention costs.902 This scalability difference is more significant on the Intel machine than the AMD machine since the Intel has lower cache-contention costs. 899 903 900 904 The Go approach of holding all internal channel-locks in the \lstinline[language=Go]{select} has additional drawbacks. … … 928 932 In Go, this setup results in significantly worse performance since @P2@ and @C2@ cannot operate in parallel with @P1@ and @C1@ due to all locks being acquired. 929 933 Interesting, this case may not be as pathological as it seems. 930 If the set of channels belonging to a select have channels that overlap with the set of another select, these statements lose the ability to operate in parallel.934 If the set of channels belonging to a \lstinline[language=Go]{select} have channels that overlap with the set of another \lstinline[language=Go]{select}, these statements lose the ability to operate in parallel. 931 935 The implementation in \CFA only holds a single lock at a time, resulting in better locking granularity, and hence, more parallelism. 932 936 Comparison of this pathological case is shown in Table~\ref{t:pathGo}. 933 The AMD results highlight the worst 937 The AMD results highlight the worst-case scenario for Go since contention is more costly on this machine than the Intel machine. 934 938 935 939 \begin{table}[t] … … 962 966 The @waituntil@ statement checks for statement completion using a predicate function, whereas the \lstinline[language=uC++]{_Select} statement maintains a tree that represents the state of the internal predicate. 963 967 964 \begin{figure}965 \centering966 \subfloat[AMD Future Synchronization Benchmark]{967 \resizebox{0.5\textwidth}{!}{\input{figures/nasus_Future.pgf}}968 \label{f:futureAMD}969 }970 \subfloat[Intel Future Synchronization Benchmark]{971 \resizebox{0.5\textwidth}{!}{\input{figures/pyke_Future.pgf}}972 \label{f:futureIntel}973 }974 \caption{\CFA \lstinline{waituntil} and \uC \lstinline{_Select} statement throughput synchronizing on a set of futures with varying wait predicates (higher is better).}975 \caption{}976 \label{f:futurePerf}977 \end{figure}978 979 968 This benchmark aims to indirectly measure the impact of various predicates on the performance of the @waituntil@ and \lstinline[language=uC++]{_Select} statements. 980 969 The benchmark is indirect since the performance of futures in \CFA and \uC differ by a significant margin. … … 1015 1004 1016 1005 Results of this benchmark are shown in Figure~\ref{f:futurePerf}. 1017 Each pair of bars is marked with the predicate name for that experiment and the value at the top of each bar is the standard deviation. .1006 Each pair of bars is marked with the predicate name for that experiment and the value at the top of each bar is the standard deviation. 1018 1007 In detail, \uC results are lower in all cases due to the performance difference between futures and the more complex \gls{synch_multiplex} implementation. 1019 1008 However, the bars for both systems have similar height patterns across the experiments. … … 1022 1011 Interestingly, \CFA has lower variation across predicates on the AMD (excluding the special OR case), whereas \uC has lower variation on the Intel. 1023 1012 Given the differences in semantics and implementation between \uC and \CFA, this test only illustrates the overall costs among the different kinds of predicates. 1013 1014 \begin{figure} 1015 \centering 1016 \subfloat[AMD Future Synchronization Benchmark]{ 1017 \resizebox{0.5\textwidth}{!}{\input{figures/nasus_Future.pgf}} 1018 \label{f:futureAMD} 1019 } 1020 \subfloat[Intel Future Synchronization Benchmark]{ 1021 \resizebox{0.5\textwidth}{!}{\input{figures/pyke_Future.pgf}} 1022 \label{f:futureIntel} 1023 } 1024 \caption{\CFA \lstinline{waituntil} and \uC \lstinline{_Select} statement throughput synchronizing on a set of futures with varying wait predicates (higher is better).} 1025 \label{f:futurePerf} 1026 %%%%%%%%%%%%%%% 1027 \vspace*{5.5in} 1028 \end{figure} -
TabularUnified libcfa/src/heap.cfa ¶
r4852232 r210c737 10 10 // Created On : Tue Dec 19 21:58:35 2017 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Fri Dec 30 08:37:37 202213 // Update Count : 16 0512 // Last Modified On : Fri Jul 28 18:27:53 2023 13 // Update Count : 1612 14 14 // 15 15 … … 369 369 static __thread size_t PAD1 CALIGN TLSMODEL __attribute__(( unused )); // protect false sharing 370 370 static __thread Heap * heapManager CALIGN TLSMODEL; 371 static __thread bool heapManagerBootFlag CALIGN TLSMODEL = false; 371 372 static __thread size_t PAD2 CALIGN TLSMODEL __attribute__(( unused )); // protect further false sharing 372 373 … … 488 489 allocUnfreed = 0; 489 490 #endif // __CFA_DEBUG__ 491 heapManagerBootFlag = true; 490 492 } // with 491 493 } // if … … 499 501 500 502 lock( heapMaster.mgrLock ); // protect heapMaster counters 503 504 assert( ! heapManagerBootFlag ); 501 505 502 506 // get storage for heap manager … … 514 518 515 519 void heapManagerDtor() libcfa_public { 520 if ( unlikely( ! heapManagerBootFlag ) ) return; // thread never used ? 521 516 522 lock( heapMaster.mgrLock ); 517 523 … … 526 532 // Do not set heapManager to NULL because it is used after Cforall is shutdown but before the program shuts down. 527 533 534 heapManagerBootFlag = false; 528 535 unlock( heapMaster.mgrLock ); 529 536 } // heapManagerDtor … … 535 542 extern int cfa_main_returned; // from interpose.cfa 536 543 extern "C" { 537 void memory_startup( void ) { 544 void memory_startup( void ) { // singleton => called once at start of program 538 545 if ( ! heapMasterBootFlag ) heapManagerCtor(); // sanity check 539 546 } // memory_startup … … 895 902 #endif // __STATISTICS__ 896 903 904 // Uncomment to get allocation addresses for a 0-sized allocation rather than a null pointer. 905 //#define __NONNULL_0_ALLOC__ 906 #if ! defined( __NONNULL_0_ALLOC__ ) 907 #define __NULL_0_ALLOC__ unlikely( size == 0 ) || /* 0 BYTE ALLOCATION RETURNS NULL POINTER */ 908 #else 909 #define __NULL_0_ALLOC__ 910 #endif // __NONNULL_0_ALLOC__ 911 897 912 #define PROLOG( counter, ... ) \ 898 913 BOOT_HEAP_MANAGER; \ 899 if ( unlikely( size == 0 ) || /* 0 BYTE ALLOCATION RETURNS NULL POINTER */ \ 914 if ( \ 915 __NULL_0_ALLOC__ \ 900 916 unlikely( size > ULONG_MAX - sizeof(Heap.Storage) ) ) { /* error check */ \ 901 917 STAT_0_CNT( counter ); \ -
TabularUnified libcfa/src/iostream.cfa ¶
r4852232 r210c737 10 10 // Created On : Wed May 27 17:56:53 2015 11 11 // Last Modified By : Peter A. Buhr 12 // Last Modified On : Tue Jul 18 13:56:01202313 // Update Count : 140 312 // Last Modified On : Sun Jul 30 23:01:00 2023 13 // Update Count : 1406 14 14 // 15 15 … … 230 230 ostype & ?|?( ostype & os, const char s[] ) { 231 231 enum { Open = 1, Close, OpenClose }; 232 static const unsigned char mask[256] @= { 232 static const unsigned char mask[256] @= { // 256 covers all Latin-1 characters 233 233 // opening delimiters, no space after 234 234 ['('] : Open, ['['] : Open, ['{'] : Open,
Note: See TracChangeset
for help on using the changeset viewer.