Changeset 72d1118
- Timestamp: Sep 20, 2022, 2:42:15 PM (2 years ago)
- Branches: ADT, ast-experimental, master, pthread-emulation
- Children: 8f1e035
- Parents: aa9f215
- Location: doc
- Files: 5 edited
doc/bibliography/pl.bib
(raa9f215 → r72d1118)

All but one of the edits to this file strip trailing whitespace; the viewer showed each affected line twice (old and new), identical once the whitespace is removed. The affected non-blank lines are:

  190   are added to enable switching and triggering of activities.
  191   Concurrent subactivities and atomic actions are provided for
  194   and ``administration'' can be compactly expressed using these
 1174   contributer = {a3moss@uwaterloo.ca},
 1502   }
 1547   Concurrent C has been implemented on the UNIX system running on a
 1816   author = {P. A. Buhr and G. J. Ditchfield and B. M. Younger and C. R. Zarnke},
 2475   }
 2702   }
 3359   }
 3742   author = {Robert E. Strom and David F. Bacon and Arthur P. Goldberg and Andy Lowry and Daniel M. Yellin and Shaula Alexander Yemini},
 3753   author = {Robert E. Strom and David F. Bacon and Arthur P. Goldberg and Andy Lowry and Daniel M. Yellin and Shaula Alexander Yemini},
 4629   }
 5109   author = {Harry Bretthauer and Thomas Christaller and J\"{u}rgen Kopp},
 5652   }
 5829   note = {Proceedings of the ACM-SIGPLAN Symposium on the {Ada} Programming Language},
 5959   then type inference for the partially typed language
 6605   network.
 6936   profiling.
 7152   }
 7270   author = {A. Goldberg and D. Robson},
 8010   }
 8356   booktitle = {Proceedings of the First European Conference on Object Oriented Programming},
 8398   }

along with whitespace-only blank lines (at or near lines 531, 2046, 2058, 2392, 4304, 4332, 4980, 6618, and 7891).

The one substantive edit fixes a discretionary line break in the displayed URL of the Wikipedia ``Thread (computing)'' entry:

@@ line 7847 @@
-howpublished= {\href{https://en.wikipedia.org/wiki/Thread_(computing)}{https://\-en.wikipedia.org/\-wiki/\-Thread\_ (computing)}},
+howpublished= {\href{https://en.wikipedia.org/wiki/Thread_(computing)}{https://\-en.wikipedia.org/\-wiki/\-Thread\_\-(computing)}},
doc/theses/thierry_delisle_PhD/thesis/local.bib
(raa9f215 → r72d1118)

@@ lines 583-587 @@
 title={Per-entity load tracking},
 author={Corbet, Jonathan},
-journal={LWN article, available at: https://lwn.net/Articles/531853},
+journal={LWN article, available at: {\href{https://lwn.net/Articles/531853}{https://\-lwn.net/\-Articles/\-531853}}},
 year={2013}
 }

@@ lines 717-720 @@
 title = {Scheduling Benchmarks},
 author = {Thierry Delisle},
-howpublished = {\href{https://github.com/cforall/SchedulingBenchmarks_PhD22}{https://\-github.com/\-cforall/\-Scheduling Benchmarks\_\-PhD22}},
+howpublished = {\href{https://github.com/cforall/SchedulingBenchmarks_PhD22}{https://\-github.com/\-cforall/\-Scheduling\-Benchmarks\_\-PhD22}},
 }

@@ lines 832-835 @@
 title = "eventfd(2) Linux User's Manual",
 year = "2019",
-month = "M Arch",
+month = "March",
 }

@@ lines 1060-1064 @@
 year = "2020",
 month = "June",
-howpublished = "\href{https://xkcd.com/2318/} ",
+howpublished = "\href{https://xkcd.com/2318/}{https://\-xkcd.com/\-2318/}",
 note = "[Online; accessed 10-June-2020]"
 }

@@ lines 1069-1073 @@
 year = "2011",
 month = "June",
-howpublished = "\href{https://xkcd.com/908/} ",
+howpublished = "\href{https://xkcd.com/908/}{https://\-xkcd.com/\-908/}",
 note = "[Online; accessed 25-August-2022]"
 }
doc/theses/thierry_delisle_PhD/thesis/text/eval_micro.tex
(raa9f215 → r72d1118)

@@ lines 187-191 @@
 Looking at the left column on AMD, Figures~\ref{fig:cycle:nasus:ops} and \ref{fig:cycle:nasus:ns} all 4 runtimes achieve very similar throughput and scalability.
 However, as the number of \procs grows higher, the results on AMD show notably more variability than on Intel.
-The different performance improvements and plateaus are due to cache topology and appear at the expected :\proc counts of 64, 128 and 192, for the same reasons as on Intel.
+The different performance improvements and plateaus are due to cache topology and appear at the expected \proc counts of 64, 128 and 192, for the same reasons as on Intel.
 Looking next at the right column on AMD, Figures~\ref{fig:cycle:nasus:low:ops} and \ref{fig:cycle:nasus:low:ns}, Tokio and Go have the same throughput performance, while \CFA is slightly slower.
 This result is different than on Intel, where Tokio behaved like \CFA rather than behaving like Go.

@@ lines 491-496 @@
 \section{Locality}

-As mentioned in the churn benchmark, when \glslink{atsched}{unparking} a \at, it is possible to either \unpark to the local or remote ready-queue.\footnote{
-It is also possible to \unpark to a third unrelated ready-queue, but without additional knowledge about the situation, it is likely to degrade performance.}
+As mentioned in the churn benchmark, when \glslink{atsched}{unparking} a \at, it is possible to either \unpark to the local or remote sub-queue.\footnote{
+It is also possible to \unpark to a third unrelated sub-queue, but without additional knowledge about the situation, it is likely to degrade performance.}
 The locality experiment includes two variations of the churn benchmark, where a data array is added.
 In both variations, before @V@ing the semaphore, each \at calls a @work@ function which increments random cells inside the data array.

@@ lines 720-725 @@
 In both variations, the experiment effectively measures how long it takes for all \ats to run once after a given synchronization point.
-In an ideal scenario where the scheduler is strictly FIFO, every thread would run once after the synchronization and therefore the delay between leaders would be given by, $ (CSL + SL) / (NP - 1)$,
-where $CSL$ is the context-switch latency, $SL$ is the cost for enqueueing and dequeuing a \at, and $NP$ is the number of \procs.
+In an ideal scenario where the scheduler is strictly FIFO, every thread would run once after the synchronization and therefore the delay between leaders would be given by, $NT(CSL + SL) / (NP - 1)$,
+where $CSL$ is the context-switch latency, $SL$ is the cost for enqueueing and dequeuing a \at, $NT$ is the number of \ats, and $NP$ is the number of \procs.
 However, if the scheduler allows \ats to run many times before other \ats can run once, this delay increases.
 The semaphore version is an approximation of strictly FIFO scheduling, where none of the \ats \emph{attempt} to run more than once.
@@ lines 735-755 @@ (caption and label moved from above to below the tabular)
 \begin{table}
-\caption[Transfer Benchmark on Intel and AMD]{Transfer Benchmark on Intel and AMD\smallskip\newline Average measurement of how long it takes for all \ats to acknowledge the leader \at.
-DNC stands for ``did not complete'', meaning that after 5 seconds of a new leader being decided, some \ats still had not acknowledged the new leader.}
-\label{fig:transfer:res}
 \setlength{\extrarowheight}{2pt}
 \setlength{\tabcolsep}{5pt}
 … (tabular body unchanged) …
 \end{tabular}
 \end{centering}
+\caption[Transfer Benchmark on Intel and AMD]{Transfer Benchmark on Intel and AMD\smallskip\newline Average measurement of how long it takes for all \ats to acknowledge the leader \at.
+DNC stands for ``did not complete'', meaning that after 5 seconds of a new leader being decided, some \ats still had not acknowledged the new leader.}
+\label{fig:transfer:res}
 \end{table}

@@ lines 768-776 @@
 Looking at the next two columns, the results for the yield variation on Intel, the story is very different.
-\CFA achieves better latencies, presumably due to no synchronization with the yield.
+\CFA achieves better latencies, presumably due to the lack of synchronization with the yield.
 Go does complete the experiment, but with drastically higher latency:
 latency at 2 \procs is $350\times$ higher than \CFA and $70\times$ higher at 192 \procs.
-This difference is because Go has a classic work-stealing scheduler, but it adds coarse-grain preemption
-, which interrupts the spinning leader after a period.
+This difference is because Go has a classic work-stealing scheduler, but it adds coarse-grain preemption, which interrupts the spinning leader after a period.
 Neither Libfibre nor Tokio complete the experiment.
 Both runtimes also use classical work-stealing scheduling without preemption, and therefore, none of the work queues are ever emptied so no load balancing occurs.
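As a sanity check on the corrected leader-delay formula in the hunks above, here is a worked instance. The numbers are illustrative assumptions for the arithmetic, not measurements from the thesis: take CSL = SL = 1 microsecond, NT = 100 threads, and NP = 11 processors.

    % LaTeX sketch with assumed values; CSL, SL, NT, NP as defined in the hunk above.
    \[
      \frac{NT\,(CSL + SL)}{NP - 1}
        = \frac{100 \times (1\,\mu s + 1\,\mu s)}{11 - 1}
        = \frac{200\,\mu s}{10}
        = 20\,\mu s
    \]

Under strictly FIFO scheduling a new leader would thus be acknowledged every 20 microseconds; a scheduler that lets some threads run repeatedly before others run once pushes this delay higher, which is exactly what the benchmark exposes.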
doc/theses/thierry_delisle_PhD/thesis/text/io.tex
(raa9f215 → r72d1118)

@@ lines 25-29 @@
 \paragraph{\lstinline{select}} is the oldest of these options, and takes as input a contiguous array of bits, where each bit represents a file descriptor of interest.
 Hence, the array length must be as long as the largest FD currently of interest.
-On return, it outputs the set in place to identify which of the file descriptors changed state.
+On return, it outputs the set modified in place to identify which of the file descriptors changed state.
 This destructive change means selecting in a loop requires re-initializing the array for each iteration.
 Another limit of @select@ is that calls from different \glspl{kthrd} sharing FDs are independent.

@@ lines 35-38 @@
 \paragraph{\lstinline{poll}} is the next oldest option, and takes as input an array of structures containing the FD numbers rather than their position in an array of bits, allowing a more compact input for interest sets that contain widely spaced FDs.
 For small interest sets with densely packed FDs, the @select@ bit mask can take less storage, and hence, copy less information into the kernel.
-Furthermore, @poll@ is non-destructive, so the array of structures does not have to be re-initialized on every call.
+However, @poll@ is non-destructive, so the array of structures does not have to be re-initialized on every call.
 Like @select@, @poll@ suffers from the limitation that the interest set cannot be changed by other \glspl{kthrd}, while a manager thread is blocked in @poll@.

@@ lines 314-318 @@
 A simple approach to polling is to allocate a \at per @io_uring@ instance and simply let the poller \ats poll their respective instances when scheduled.

-With the pool of S EQinstances approach, the big advantage is that it is fairly flexible.
+With the pool of SQE instances approach, the big advantage is that it is fairly flexible.
 It does not impose restrictions on what \ats submitting \io operations can and cannot do between allocations and submissions.
 It also can gracefully handle running out of resources, SQEs or the kernel returning @EBUSY@.

@@ lines 320-324 @@
 The routing and allocation algorithm needs to keep track of which ring instances have available SQEs, block incoming requests if no instance is available, prevent barging if \ats are already queued up waiting for SQEs and handle SQEs being freed.
 The submission side needs to safely append SQEs to the ring buffer, correctly handle chains, make sure no SQE is dropped or left pending forever, notify the allocation side when SQEs can be reused, and handle the kernel returning @EBUSY@.
-Compared to the private-instance approach, all this synchronization has a significant cost this synchronization is entirely overhead.
+Compared to the private-instance approach, all this synchronization has a significant cost and this synchronization is entirely overhead.

 \subsubsection{Instance borrowing}
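The select/poll contrast in the first two hunks is easier to see in code. Below is a minimal C sketch, not taken from the thesis, of a poll event loop: the interest set is built once and reused across iterations, because the kernel reports readiness in the separate revents field, whereas a select loop must rebuild its destroyed bit mask every time.

    #include <poll.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void) {
        // Interest set: FD numbers stored directly, not as bit positions.
        struct pollfd fds[1];
        fds[0].fd = STDIN_FILENO;
        fds[0].events = POLLIN;            // interested in readability

        for (int i = 0; i < 3; i += 1) {
            int n = poll(fds, 1, 5000);    // block up to 5 seconds
            if (n > 0 && (fds[0].revents & POLLIN)) {
                char buf[256];
                ssize_t len = read(STDIN_FILENO, buf, sizeof(buf));
                printf("read %zd bytes\n", len);
            } else if (n == 0) {
                printf("timeout\n");
            }
            // No re-initialization needed: poll only wrote revents;
            // the events field and the array itself are untouched.
        }
        return 0;
    }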
doc/theses/thierry_delisle_PhD/thesis/text/practice.tex
(raa9f215 → r72d1118)

@@ lines 101-105 @@
 This leaves too many \procs when there are not enough \ats for all the \procs to be useful.
 These idle \procs cannot be removed because their lifetime is controlled by the application, and only the application knows when the number of \ats may increase or decrease.
-While idle \procs can spin until work appears, this approach wastes energy, unnecessarily produces heat and prevents other applications from using the processor.
+While idle \procs can spin until work appears, this approach wastes energy, unnecessarily produces heat and prevents other applications from using the \gls{hthrd}.
 Therefore, idle \procs are put into an idle state, called \newterm{Idle-Sleep}, where the \gls{kthrd} is blocked until the scheduler deems it is needed.

@@ lines 107-111 @@
 First, a data structure needs to keep track of all \procs that are in idle sleep.
 Because idle sleep is spurious, this data structure has strict performance requirements, in addition to strict correctness requirements.
-Next, some mechanism is needed to block \glspl{kthrd}, \eg @pthread_cond_wait@ o na pthread semaphore.
+Next, some mechanism is needed to block \glspl{kthrd}, \eg @pthread_cond_wait@ or a pthread semaphore.
 The complexity here is to support \at \glslink{atblock}{parking} and \glslink{atsched}{unparking}, user-level locking, timers, \io operations, and all other \CFA features with minimal complexity.
 Finally, the scheduler needs a heuristic to determine when to block and unblock an appropriate number of \procs.
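For the blocking mechanism named in the second hunk, here is a minimal C sketch of parking and unparking a kernel thread with pthread_cond_wait. The identifiers are hypothetical illustrations, not the actual CFA runtime names, and a real runtime would also integrate the timers and io wakeups discussed in the surrounding text.

    #include <pthread.h>
    #include <stdbool.h>

    // Hypothetical names, for illustration only.
    static pthread_mutex_t idle_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  idle_cond = PTHREAD_COND_INITIALIZER;
    static bool work_available = false;

    // Called by a proc that finds no ready work: block the kernel thread.
    void idle_sleep(void) {
        pthread_mutex_lock(&idle_lock);
        while (!work_available)            // loop guards against spurious wakeups
            pthread_cond_wait(&idle_cond, &idle_lock);
        work_available = false;            // consume the wakeup
        pthread_mutex_unlock(&idle_lock);
    }

    // Called by the scheduler when new work appears: wake one idle proc.
    void idle_wake(void) {
        pthread_mutex_lock(&idle_lock);
        work_available = true;
        pthread_cond_signal(&idle_cond);
        pthread_mutex_unlock(&idle_lock);
    }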