Index: doc/theses/thierry_delisle_PhD/thesis/Makefile
===================================================================
--- doc/theses/thierry_delisle_PhD/thesis/Makefile	(revision 223a63306c486d5cffd07812bcf08afea940d3c4)
+++ doc/theses/thierry_delisle_PhD/thesis/Makefile	(revision b9537e6e28ae020f54097b3d63e5a8bebeaf1cde)
@@ -15,4 +15,5 @@
 	front \
 	intro \
+	existing \
 	runtime \
 	core \
@@ -27,4 +28,5 @@
 	base \
 	empty \
+	system \
 }
 
@@ -37,5 +39,5 @@
 ## Define the documents that need to be made.
 all: thesis.pdf
-thesis.pdf: ${TEXTS} ${FIGURES} ${PICTURES} glossary.tex
+thesis.pdf: ${TEXTS} ${FIGURES} ${PICTURES} glossary.tex local.bib
 
 DOCUMENT = thesis.pdf
Index: doc/theses/thierry_delisle_PhD/thesis/fig/system.fig
===================================================================
--- doc/theses/thierry_delisle_PhD/thesis/fig/system.fig	(revision 223a63306c486d5cffd07812bcf08afea940d3c4)
+++ doc/theses/thierry_delisle_PhD/thesis/fig/system.fig	(revision b9537e6e28ae020f54097b3d63e5a8bebeaf1cde)
@@ -1,3 +1,3 @@
-#FIG 3.2  Produced by xfig version 3.2.5c
+#FIG 3.2  Produced by xfig version 3.2.7b
 Landscape
 Center
@@ -36,21 +36,4 @@
 1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 4500 3600 15 15 4500 3600 4515 3615
 -6
-6 3225 4125 4650 4425
-6 4350 4200 4650 4350
-1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 4425 4275 15 15 4425 4275 4440 4290
-1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 4500 4275 15 15 4500 4275 4515 4290
-1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 4575 4275 15 15 4575 4275 4590 4290
--6
-1 1 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3450 4275 225 150 3450 4275 3675 4425
-1 1 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4050 4275 225 150 4050 4275 4275 4425
--6
-6 6675 4125 7500 4425
-6 7200 4200 7500 4350
-1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 7275 4275 15 15 7275 4275 7290 4290
-1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 7350 4275 15 15 7350 4275 7365 4290
-1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 7425 4275 15 15 7425 4275 7440 4290
--6
-1 1 0 1 -1 -1 0 0 -1 0.000 1 0.0000 6900 4275 225 150 6900 4275 7125 4425
--6
 6 6675 3525 8025 3975
 2 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 2
@@ -79,9 +62,8 @@
 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3975 2850 150 150 3975 2850 4125 2850
 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 7200 2775 150 150 7200 2775 7350 2775
-1 3 0 1 0 0 0 0 0 0.000 1 0.0000 2250 4830 30 30 2250 4830 2280 4860
+1 3 0 1 0 0 0 0 0 0.000 1 0.0000 2250 4830 30 30 2250 4830 2280 4830
 1 3 0 1 0 0 0 0 0 0.000 1 0.0000 7200 2775 30 30 7200 2775 7230 2805
 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3525 3600 150 150 3525 3600 3675 3600
-1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3875 4800 100 100 3875 4800 3975 4800
-1 1 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4650 4800 150 75 4650 4800 4800 4875
+1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4625 4838 100 100 4625 4838 4725 4838
 2 2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 5
 	 2400 4200 2400 3750 1950 3750 1950 4200 2400 4200
@@ -153,19 +135,18 @@
 	1 1 1.00 45.00 90.00
 	 7875 3750 7875 2325 7200 2325 7200 2550
+2 2 1 1 -1 -1 0 0 -1 3.000 0 0 0 0 0 5
+	 6975 4950 6750 4950 6750 4725 6975 4725 6975 4950
 2 2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 5
 	 5850 4950 5850 4725 5625 4725 5625 4950 5850 4950
-2 2 1 1 -1 -1 0 0 -1 3.000 0 0 0 0 0 5
-	 6975 4950 6750 4950 6750 4725 6975 4725 6975 4950
-4 1 -1 0 0 0 10 0.0000 2 105 720 5550 4425 Processors\001
-4 1 -1 0 0 0 10 0.0000 2 120 1005 4200 3225 Blocked Tasks\001
-4 1 -1 0 0 0 10 0.0000 2 150 870 4200 3975 Ready Tasks\001
-4 1 -1 0 0 0 10 0.0000 2 135 1095 7350 1725 Other Cluster(s)\001
-4 1 -1 0 0 0 10 0.0000 2 105 840 4650 1725 User Cluster\001
-4 1 -1 0 0 0 10 0.0000 2 150 615 2175 3675 Manager\001
-4 1 -1 0 0 0 10 0.0000 2 105 990 2175 3525 Discrete-event\001
-4 1 -1 0 0 0 10 0.0000 2 135 795 2175 4350 preemption\001
-4 0 -1 0 0 0 10 0.0000 2 150 1290 2325 4875 generator/coroutine\001
-4 0 -1 0 0 0 10 0.0000 2 120 270 4050 4875 task\001
-4 0 -1 0 0 0 10 0.0000 2 105 450 7050 4875 cluster\001
-4 0 -1 0 0 0 10 0.0000 2 105 660 5925 4875 processor\001
-4 0 -1 0 0 0 10 0.0000 2 105 555 4875 4875 monitor\001
+4 1 -1 0 0 0 10 0.0000 2 135 900 5550 4425 Processors\001
+4 1 -1 0 0 0 10 0.0000 2 165 1170 4200 3975 Ready Threads\001
+4 1 -1 0 0 0 10 0.0000 2 165 1440 7350 1725 Other Cluster(s)\001
+4 1 -1 0 0 0 10 0.0000 2 135 1080 4650 1725 User Cluster\001
+4 1 -1 0 0 0 10 0.0000 2 165 630 2175 3675 Manager\001
+4 1 -1 0 0 0 10 0.0000 2 135 1260 2175 3525 Discrete-event\001
+4 1 -1 0 0 0 10 0.0000 2 150 900 2175 4350 preemption\001
+4 0 -1 0 0 0 10 0.0000 2 135 630 7050 4875 cluster\001
+4 1 -1 0 0 0 10 0.0000 2 135 1350 4200 3225 Blocked Threads\001
+4 0 -1 0 0 0 10 0.0000 2 135 540 4800 4875 thread\001
+4 0 -1 0 0 0 10 0.0000 2 120 810 5925 4875 processor\001
+4 0 -1 0 0 0 10 0.0000 2 165 1710 2325 4875 generator/coroutine\001
Index: doc/theses/thierry_delisle_PhD/thesis/glossary.tex
===================================================================
--- doc/theses/thierry_delisle_PhD/thesis/glossary.tex	(revision 223a63306c486d5cffd07812bcf08afea940d3c4)
+++ doc/theses/thierry_delisle_PhD/thesis/glossary.tex	(revision b9537e6e28ae020f54097b3d63e5a8bebeaf1cde)
@@ -1,18 +1,52 @@
 \makeglossaries
+
+% ----------------------------------
+% Acronyms
+\newacronym{api}{API}{Application Programming Interface}
+\newacronym{fifo}{FIFO}{First-In, First-Out}
+\newacronym{io}{I/O}{Input and Output}
+\newacronym{numa}{NUMA}{Non-Uniform Memory Access}
+\newacronym{raii}{RAII}{Resource Acquisition Is Initialization}
+\newacronym{tls}{TLS}{Thread Local Storage}
+
+% ----------------------------------
+% Definitions
+
+\longnewglossaryentry{thrd}
+{name={thread}}
+{
+Threads created and managed inside user-space. Each thread has its own stack and its own thread of execution. User-level threads are invisible to the underlying operating system.
+
+\textit{Synonyms : User threads, Lightweight threads, Green threads, Virtual threads, Tasks.}
+}
+
+\longnewglossaryentry{proc}
+{name={processor}}
+{
+
+}
+
+\longnewglossaryentry{rQ}
+{name={ready-queue}}
+{
+
+}
+
+\longnewglossaryentry{uthrding}
+{name={user-level threading}}
+{
+
+
+\textit{Synonyms : User threads, Lightweight threads, Green threads, Virtual threads, Tasks.}
+}
+
+% ----------------------------------
 
 \longnewglossaryentry{hthrd}
 {name={hardware thread}}
 {
-Threads representing the underlying hardware directly.
+Threads representing the underlying hardware directly, \eg the CPU core, or hyper-thread if the hardware supports multiple threads of execution per core. The number of hardware threads is considered to be always fixed to a specific number determined by the hardware.
 
-\textit{Synonyms : User threads, Lightweight threads, Green threads, Virtual threads, Tasks.}
-}
-
-\longnewglossaryentry{thrd}
-{name={threads}}
-{
-Threads created and managed inside user-space. Each thread has its own stack and its own thread of execution. User-level threads are invisible to the underlying operating system.
-
-\textit{Synonyms : User threads, Lightweight threads, Green threads, Virtual threads, Tasks.}
+\textit{Synonyms : }
 }
 
@@ -57,15 +91,5 @@
 }
 
-\longnewglossaryentry{proc}
-{name={virtual processor}}
-{
 
-}
-
-\longnewglossaryentry{Q}
-{name={work-queue}}
-{
-
-}
 
 \longnewglossaryentry{at}
@@ -131,7 +155,2 @@
 }
 
-
-\newacronym{tls}{TLS}{Thread Local Storage}
-\newacronym{api}{API}{Application Program Interface}
-\newacronym{raii}{RAII}{Resource Acquisition Is Initialization}
-\newacronym{numa}{NUMA}{Non-Uniform Memory Access}
Index: doc/theses/thierry_delisle_PhD/thesis/local.bib
===================================================================
--- doc/theses/thierry_delisle_PhD/thesis/local.bib	(revision b9537e6e28ae020f54097b3d63e5a8bebeaf1cde)
+++ doc/theses/thierry_delisle_PhD/thesis/local.bib	(revision b9537e6e28ae020f54097b3d63e5a8bebeaf1cde)
@@ -0,0 +1,610 @@
+% --------------------------------------------------
+% Cforall
+@misc{cfa:frontpage,
+  url = {https://cforall.uwaterloo.ca/}
+}
+@article{cfa:typesystem,
+  author    = {Aaron Moss and Robert Schluntz and Peter A. Buhr},
+  title     = {{\CFA} : Adding modern programming language features to {C}},
+  journal   = {Softw. Pract. Exp.},
+  volume    = {48},
+  number    = {12},
+  pages     = {2111--2146},
+  year      = {2018},
+  url       = {https://doi.org/10.1002/spe.2624},
+  doi       = {10.1002/spe.2624},
+  timestamp = {Thu, 09 Apr 2020 17:14:14 +0200},
+  biburl    = {https://dblp.org/rec/journals/spe/MossSB18.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+
+
+% --------------------------------------------------
+% old survey mostly about job scheduling
+% talks about the literature mostly centering on upfront/static scheduling with task graphs as inputs
+% already mentions multi-core
+@article{DBLP:journals/csur/Gonzalez77,
+  author    = {Mario J. Gonzalez Jr.},
+  title     = {Deterministic Processor Scheduling},
+  journal   = {{ACM} Comput. Surv.},
+  volume    = {9},
+  number    = {3},
+  pages     = {173--204},
+  year      = {1977},
+  url       = {https://doi.org/10.1145/356698.356700},
+  doi       = {10.1145/356698.356700},
+  timestamp = {Tue, 06 Nov 2018 12:50:48 +0100},
+  biburl    = {https://dblp.org/rec/journals/csur/Gonzalez77.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% very short survey, highlights more interseting surveys as :
+% Casavant and Kuhl [1988], Chapin [1993], Shirazi et al. [1995], and Singhal and Shivaratri [1994]
+% still seems to mention static or partially static scheduling as a dominating trend
+@article{DBLP:journals/csur/Chapin96,
+  author    = {Steve J. Chapin},
+  title     = {Distributed and Multiprocessor Scheduling},
+  journal   = {{ACM} Comput. Surv.},
+  volume    = {28},
+  number    = {1},
+  pages     = {233--235},
+  year      = {1996},
+  url       = {https://doi.org/10.1145/234313.234410},
+  doi       = {10.1145/234313.234410},
+  timestamp = {Tue, 06 Nov 2018 12:50:49 +0100},
+  biburl    = {https://dblp.org/rec/journals/csur/Chapin96.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% more comprehensive survey that discusses many algorithms
+% still exclusively static scheduling
+@article{DBLP:journals/csur/KwokA99,
+  author    = {Yu{-}Kwong Kwok and Ishfaq Ahmad},
+  title     = {Static scheduling algorithms for allocating directed task graphs to multiprocessors},
+  journal   = {{ACM} Comput. Surv.},
+  volume    = {31},
+  number    = {4},
+  pages     = {406--471},
+  year      = {1999},
+  url       = {https://doi.org/10.1145/344588.344618},
+  doi       = {10.1145/344588.344618},
+  timestamp = {Fri, 30 Nov 2018 12:48:46 +0100},
+  biburl    = {https://dblp.org/rec/journals/csur/KwokA99.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% recent survey on kernel-thread scheduling
+% specifically focusing on schedulers that try to optimize to
+% reduce contention or reduce cache conflicts or improve some other ressource sharing metric
+@article{DBLP:journals/csur/ZhuravlevSBFP12,
+  author    = {Sergey Zhuravlev and Juan Carlos Saez and Sergey Blagodurov and Alexandra Fedorova and Manuel Prieto},
+  title     = {Survey of scheduling techniques for addressing shared resources in multicore processors},
+  journal   = {{ACM} Comput. Surv.},
+  volume    = {45},
+  number    = {1},
+  pages     = {4:1--4:28},
+  year      = {2012},
+  url       = {https://doi.org/10.1145/2379776.2379780},
+  doi       = {10.1145/2379776.2379780},
+  timestamp = {Tue, 06 Nov 2018 12:50:49 +0100},
+  biburl    = {https://dblp.org/rec/journals/csur/ZhuravlevSBFP12.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% great survey on work-stealing
+% highlights many of the recent work both theoretical and not
+@article{DBLP:journals/ijpp/YangH18,
+  author    = {Jixiang Yang and Qingbi He},
+  title     = {Scheduling Parallel Computations by Work Stealing: {A} Survey},
+  journal   = {Int. J. Parallel Program.},
+  volume    = {46},
+  number    = {2},
+  pages     = {173--197},
+  year      = {2018},
+  url       = {https://doi.org/10.1007/s10766-016-0484-8},
+  doi       = {10.1007/s10766-016-0484-8},
+  timestamp = {Wed, 01 Apr 2020 08:50:06 +0200},
+  biburl    = {https://dblp.org/rec/journals/ijpp/YangH18.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% --------------------------------------------------
+% introduction of work stealing
+@inproceedings{DBLP:conf/fpca/BurtonS81,
+  author    = {F. Warren Burton and M. Ronan Sleep},
+  editor    = {Arvind and Jack B. Dennis},
+  title     = {Executing functional programs on a virtual tree of processors},
+  booktitle = {Proceedings of the 1981 conference on Functional programming languages and computer architecture, {FPCA} 1981, Wentworth, New Hampshire, USA, October 1981},
+  pages     = {187--194},
+  publisher = {{ACM}},
+  year      = {1981},
+  url       = {https://doi.org/10.1145/800223.806778},
+  doi       = {10.1145/800223.806778},
+  timestamp = {Tue, 06 Nov 2018 11:07:48 +0100},
+  biburl    = {https://dblp.org/rec/conf/fpca/BurtonS81.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% introduction of randomized work stealing
+@inproceedings{DBLP:conf/focs/Blumofe94,
+  author    = {Robert D. Blumofe},
+  title     = {Scheduling Multithreaded Computations by Work Stealing},
+  booktitle = {35th Annual Symposium on Foundations of Computer Science, Santa Fe, New Mexico, USA, 20-22 November 1994},
+  pages     = {356--368},
+  publisher = {{IEEE} Computer Society},
+  year      = {1994},
+  url       = {https://doi.org/10.1109/SFCS.1994.365680},
+  doi       = {10.1109/SFCS.1994.365680},
+  timestamp = {Wed, 16 Oct 2019 14:14:54 +0200},
+  biburl    = {https://dblp.org/rec/conf/focs/Blumofe94.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% migration cost
+@inproceedings{DBLP:conf/sigmetrics/SquillanteN91,
+  author    = {Mark S. Squillante and Randolph D. Nelson},
+  editor    = {Tom W. Keller},
+  title     = {Analysis of Task Migration in Shared-Memory Multiprocessor Scheduling},
+  booktitle = {Proceedings of the 1991 {ACM} {SIGMETRICS} conference on Measurement and modeling of computer systems, San Diego, California, USA, May 21-24, 1991},
+  pages     = {143--155},
+  publisher = {{ACM}},
+  year      = {1991},
+  url       = {https://doi.org/10.1145/107971.107987},
+  doi       = {10.1145/107971.107987},
+  timestamp = {Sat, 07 Sep 2019 11:59:22 +0200},
+  biburl    = {https://dblp.org/rec/conf/sigmetrics/SquillanteN91.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/pe/EagerLZ86,
+  author    = {Derek L. Eager and Edward D. Lazowska and John Zahorjan},
+  title     = {A Comparison of Receiver-Initiated and Sender-Initiated Adaptive Load Sharing},
+  journal   = {Perform. Evaluation},
+  volume    = {6},
+  number    = {1},
+  pages     = {53--68},
+  year      = {1986},
+  url       = {https://doi.org/10.1016/0166-5316(86)90008-8},
+  doi       = {10.1016/0166-5316(86)90008-8},
+  timestamp = {Sat, 22 Feb 2020 19:26:16 +0100},
+  biburl    = {https://dblp.org/rec/journals/pe/EagerLZ86.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% affinity for work-stealing
+@article{DBLP:journals/tpds/SquillanteL93,
+  author    = {Mark S. Squillante and Edward D. Lazowska},
+  title     = {Using Processor-Cache Affinity Information in Shared-Memory Multiprocessor Scheduling},
+  journal   = {{IEEE} Trans. Parallel Distributed Syst.},
+  volume    = {4},
+  number    = {2},
+  pages     = {131--143},
+  year      = {1993},
+  url       = {https://doi.org/10.1109/71.207589},
+  doi       = {10.1109/71.207589},
+  timestamp = {Fri, 02 Oct 2020 14:40:30 +0200},
+  biburl    = {https://dblp.org/rec/journals/tpds/SquillanteL93.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% systems with affinity scheduling
+@inproceedings{squillante2001threshold,
+  title={Threshold-based priority policies for parallel-server systems with affinity scheduling},
+  author={Squillante, Mark S and Xia, Cathy H and Yao, David D and Zhang, Li},
+  booktitle={Proceedings of the 2001 American Control Conference.(Cat. No. 01CH37148)},
+  volume={4},
+  pages={2992--2999},
+  year={2001},
+  organization={IEEE}
+}
+
+@article{DBLP:journals/mst/AcarBB02,
+  author    = {Umut A. Acar and Guy E. Blelloch and Robert D. Blumofe},
+  title     = {The Data Locality of Work Stealing},
+  journal   = {Theory Comput. Syst.},
+  volume    = {35},
+  number    = {3},
+  pages     = {321--347},
+  year      = {2002},
+  url       = {https://doi.org/10.1007/s00224-002-1057-3},
+  doi       = {10.1007/s00224-002-1057-3},
+  timestamp = {Sun, 28 May 2017 13:18:25 +0200},
+  biburl    = {https://dblp.org/rec/journals/mst/AcarBB02.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/tcs/NarangS11,
+  author    = {Ankur Narang and Rudrapatna K. Shyamasundar},
+  title     = {Performance driven distributed scheduling of parallel hybrid computations},
+  journal   = {Theor. Comput. Sci.},
+  volume    = {412},
+  number    = {32},
+  pages     = {4212--4225},
+  year      = {2011},
+  url       = {https://doi.org/10.1016/j.tcs.2010.11.044},
+  doi       = {10.1016/j.tcs.2010.11.044},
+  timestamp = {Sun, 28 May 2017 13:20:06 +0200},
+  biburl    = {https://dblp.org/rec/journals/tcs/NarangS11.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% Optimization via reflection on work stealing in TBB
+@inproceedings{DBLP:conf/ipps/RobisonVK08,
+  author    = {Arch Robison and Michael Voss and Alexey Kukanov},
+  title     = {Optimization via Reflection on Work Stealing in {TBB}},
+  booktitle = {22nd {IEEE} International Symposium on Parallel and Distributed Processing, {IPDPS} 2008, Miami, Florida USA, April 14-18, 2008},
+  pages     = {1--8},
+  publisher = {{IEEE}},
+  year      = {2008},
+  url       = {https://doi.org/10.1109/IPDPS.2008.4536188},
+  doi       = {10.1109/IPDPS.2008.4536188},
+  timestamp = {Wed, 16 Oct 2019 14:14:51 +0200},
+  biburl    = {https://dblp.org/rec/conf/ipps/RobisonVK08.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/ipl/SuksompongLS16,
+  author    = {Warut Suksompong and Charles E. Leiserson and Tao B. Schardl},
+  title     = {On the efficiency of localized work stealing},
+  journal   = {Inf. Process. Lett.},
+  volume    = {116},
+  number    = {2},
+  pages     = {100--106},
+  year      = {2016},
+  url       = {https://doi.org/10.1016/j.ipl.2015.10.002},
+  doi       = {10.1016/j.ipl.2015.10.002},
+  timestamp = {Fri, 26 May 2017 22:54:40 +0200},
+  biburl    = {https://dblp.org/rec/journals/ipl/SuksompongLS16.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+%theory
+@article{DBLP:journals/jpdc/MirchandaneyTS90,
+  author    = {Ravi Mirchandaney and Donald F. Towsley and John A. Stankovic},
+  title     = {Adaptive Load Sharing in Heterogeneous Distributed Systems},
+  journal   = {J. Parallel Distributed Comput.},
+  volume    = {9},
+  number    = {4},
+  pages     = {331--346},
+  year      = {1990},
+  url       = {https://doi.org/10.1016/0743-7315(90)90118-9},
+  doi       = {10.1016/0743-7315(90)90118-9},
+  timestamp = {Sat, 22 Feb 2020 19:36:31 +0100},
+  biburl    = {https://dblp.org/rec/journals/jpdc/MirchandaneyTS90.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/mst/BenderR02,
+  author    = {Michael A. Bender and Michael O. Rabin},
+  title     = {Online Scheduling of Parallel Programs on Heterogeneous Systems with Applications to Cilk},
+  journal   = {Theory Comput. Syst.},
+  volume    = {35},
+  number    = {3},
+  pages     = {289--304},
+  year      = {2002},
+  url       = {https://doi.org/10.1007/s00224-002-1055-5},
+  doi       = {10.1007/s00224-002-1055-5},
+  timestamp = {Sun, 28 May 2017 13:18:24 +0200},
+  biburl    = {https://dblp.org/rec/journals/mst/BenderR02.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{DBLP:conf/sigmetrics/GastG10,
+  author    = {Nicolas Gast and Bruno Gaujal},
+  editor    = {Vishal Misra and Paul Barford and Mark S. Squillante},
+  title     = {A mean field model of work stealing in large-scale systems},
+  booktitle = {{SIGMETRICS} 2010, Proceedings of the 2010 {ACM} {SIGMETRICS} International Conference on Measurement and Modeling of Computer Systems, New York, New York, USA, 14-18 June 2010},
+  pages     = {13--24},
+  publisher = {{ACM}},
+  year      = {2010},
+  url       = {https://doi.org/10.1145/1811039.1811042},
+  doi       = {10.1145/1811039.1811042},
+  timestamp = {Tue, 06 Nov 2018 11:07:18 +0100},
+  biburl    = {https://dblp.org/rec/conf/sigmetrics/GastG10.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/jacm/BlellochGM99,
+  author    = {Guy E. Blelloch and Phillip B. Gibbons and Yossi Matias},
+  title     = {Provably Efficient Scheduling for Languages with Fine-Grained Parallelism},
+  journal   = {J. {ACM}},
+  volume    = {46},
+  number    = {2},
+  pages     = {281--321},
+  year      = {1999},
+  url       = {https://doi.org/10.1145/301970.301974},
+  doi       = {10.1145/301970.301974},
+  timestamp = {Tue, 06 Nov 2018 12:51:45 +0100},
+  biburl    = {https://dblp.org/rec/journals/jacm/BlellochGM99.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/siamcomp/BerenbrinkFG03,
+  author    = {Petra Berenbrink and Tom Friedetzky and Leslie Ann Goldberg},
+  title     = {The Natural Work-Stealing Algorithm is Stable},
+  journal   = {{SIAM} J. Comput.},
+  volume    = {32},
+  number    = {5},
+  pages     = {1260--1279},
+  year      = {2003},
+  url       = {https://doi.org/10.1137/S0097539701399551},
+  doi       = {10.1137/S0097539701399551},
+  timestamp = {Sat, 27 May 2017 14:22:58 +0200},
+  biburl    = {https://dblp.org/rec/journals/siamcomp/BerenbrinkFG03.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/mst/AroraBP01,
+  author    = {Nimar S. Arora and Robert D. Blumofe and C. Greg Plaxton},
+  title     = {Thread Scheduling for Multiprogrammed Multiprocessors},
+  journal   = {Theory Comput. Syst.},
+  volume    = {34},
+  number    = {2},
+  pages     = {115--144},
+  year      = {2001},
+  url       = {https://doi.org/10.1007/s00224-001-0004-z},
+  doi       = {10.1007/s00224-001-0004-z},
+  timestamp = {Sun, 28 May 2017 13:18:24 +0200},
+  biburl    = {https://dblp.org/rec/journals/mst/AroraBP01.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@article{DBLP:journals/anor/TchiboukdjianGT13,
+  author    = {Marc Tchiboukdjian and Nicolas Gast and Denis Trystram},
+  title     = {Decentralized list scheduling},
+  journal   = {Ann. Oper. Res.},
+  volume    = {207},
+  number    = {1},
+  pages     = {237--259},
+  year      = {2013},
+  url       = {https://doi.org/10.1007/s10479-012-1149-7},
+  doi       = {10.1007/s10479-012-1149-7},
+  timestamp = {Thu, 13 Aug 2020 12:41:25 +0200},
+  biburl    = {https://dblp.org/rec/journals/anor/TchiboukdjianGT13.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{DBLP:conf/isaac/TchiboukdjianGTRB10,
+  author    = {Marc Tchiboukdjian and Nicolas Gast and Denis Trystram and Jean{-}Louis Roch and Julien Bernard},
+  editor    = {Otfried Cheong and Kyung{-}Yong Chwa and Kunsoo Park},
+  title     = {A Tighter Analysis of Work Stealing},
+  booktitle = {Algorithms and Computation - 21st International Symposium, {ISAAC} 2010, Jeju Island, Korea, December 15-17, 2010, Proceedings, Part {II}},
+  series    = {Lecture Notes in Computer Science},
+  volume    = {6507},
+  pages     = {291--302},
+  publisher = {Springer},
+  year      = {2010},
+  url       = {https://doi.org/10.1007/978-3-642-17514-5\_25},
+  doi       = {10.1007/978-3-642-17514-5\_25},
+  timestamp = {Fri, 13 Dec 2019 13:08:09 +0100},
+  biburl    = {https://dblp.org/rec/conf/isaac/TchiboukdjianGTRB10.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{DBLP:conf/ppopp/AgrawalLS10,
+  author    = {Kunal Agrawal and Charles E. Leiserson and Jim Sukha},
+  editor    = {R. Govindarajan and David A. Padua and Mary W. Hall},
+  title     = {Helper locks for fork-join parallel programming},
+  booktitle = {Proceedings of the 15th {ACM} {SIGPLAN} Symposium on Principles and Practice of Parallel Programming, {PPOPP} 2010, Bangalore, India, January 9-14, 2010},
+  pages     = {245--256},
+  publisher = {{ACM}},
+  year      = {2010},
+  url       = {https://doi.org/10.1145/1693453.1693487},
+  doi       = {10.1145/1693453.1693487},
+  timestamp = {Tue, 06 Nov 2018 16:57:27 +0100},
+  biburl    = {https://dblp.org/rec/conf/ppopp/AgrawalLS10.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{DBLP:conf/spaa/AgrawalFLSSU14,
+  author    = {Kunal Agrawal and Jeremy T. Fineman and Kefu Lu and Brendan Sheridan and Jim Sukha and Robert Utterback},
+  editor    = {Guy E. Blelloch and Peter Sanders},
+  title     = {Provably good scheduling for parallel programs that use data structures through implicit batching},
+  booktitle = {26th {ACM} Symposium on Parallelism in Algorithms and Architectures, {SPAA} '14, Prague, Czech Republic - June 23 - 25, 2014},
+  pages     = {84--95},
+  publisher = {{ACM}},
+  year      = {2014},
+  url       = {https://doi.org/10.1145/2612669.2612688},
+  doi       = {10.1145/2612669.2612688},
+  timestamp = {Wed, 21 Nov 2018 11:18:43 +0100},
+  biburl    = {https://dblp.org/rec/conf/spaa/AgrawalFLSSU14.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@inproceedings{DBLP:conf/ipps/ColeR13,
+  author    = {Richard Cole and
+               Vijaya Ramachandran},
+  title     = {Analysis of Randomized Work Stealing with False Sharing},
+  booktitle = {27th {IEEE} International Symposium on Parallel and Distributed Processing,
+               {IPDPS} 2013, Cambridge, MA, USA, May 20-24, 2013},
+  pages     = {985--998},
+  publisher = {{IEEE} Computer Society},
+  year      = {2013},
+  url       = {https://doi.org/10.1109/IPDPS.2013.86},
+  doi       = {10.1109/IPDPS.2013.86},
+  timestamp = {Wed, 16 Oct 2019 14:14:51 +0200},
+  biburl    = {https://dblp.org/rec/conf/ipps/ColeR13.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% --------------------------------------------------
+% ULE FreeBSD scheduler
+@inproceedings{DBLP:conf/bsdcon/Roberson03,
+  author    = {Jeff Roberson},
+  editor    = {Gregory Neil Shapiro},
+  title     = {{ULE:} {A} Modern Scheduler for FreeBSD},
+  booktitle = {Proceedings of BSDCon 2003, San Mateo, California, USA, September 8-12, 2003},
+  pages     = {17--28},
+  publisher = {{USENIX}},
+  year      = {2003},
+  url       = {http://www.usenix.org/publications/library/proceedings/bsdcon03/tech/roberson.html},
+  timestamp = {Wed, 04 Jul 2018 13:06:34 +0200},
+  biburl    = {https://dblp.org/rec/conf/bsdcon/Roberson03.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% --------------------------------------------------
+% Martin's LibFibre
+@article{DBLP:journals/pomacs/KarstenB20,
+  author    = {Martin Karsten and Saman Barghi},
+  title     = {User-level Threading: Have Your Cake and Eat It Too},
+  journal   = {Proc. {ACM} Meas. Anal. Comput. Syst.},
+  volume    = {4},
+  number    = {1},
+  pages     = {17:1--17:30},
+  year      = {2020},
+  url       = {https://doi.org/10.1145/3379483},
+  doi       = {10.1145/3379483},
+  timestamp = {Thu, 09 Jul 2020 22:58:54 +0200},
+  biburl    = {https://dblp.org/rec/journals/pomacs/KarstenB20.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+% --------------------------------------------------
+% Linux CFS
+@inproceedings{DBLP:conf/eurosys/LoziLFGQF16,
+  author    = {Jean{-}Pierre Lozi and Baptiste Lepers and Justin R. Funston and Fabien Gaud and Vivien Qu{\'{e}}ma and Alexandra Fedorova},
+  editor    = {Cristian Cadar and Peter R. Pietzuch and Kimberly Keeton and Rodrigo Rodrigues},
+  title     = {The Linux scheduler: a decade of wasted cores},
+  booktitle = {Proceedings of the Eleventh European Conference on Computer Systems, EuroSys 2016, London, United Kingdom, April 18-21, 2016},
+  pages     = {1:1--1:16},
+  publisher = {{ACM}},
+  year      = {2016},
+  url       = {https://doi.org/10.1145/2901318.2901326},
+  doi       = {10.1145/2901318.2901326},
+  timestamp = {Tue, 06 Nov 2018 16:58:31 +0100},
+  biburl    = {https://dblp.org/rec/conf/eurosys/LoziLFGQF16.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
+
+@misc{MAN:linux/cfs,
+  title = {{CFS} Scheduler - The Linux Kernel documentation},
+  url = {https://www.kernel.org/doc/html/latest/scheduler/sched-design-CFS.html}
+}
+
+@misc{MAN:linux/cfs2,
+  title = {{CFS}: Completely fair process scheduling in Linux},
+  author = {Marty Kalin},
+  year = {2019},
+  month = {February},
+  url = {https://opensource.com/article/19/2/fair-scheduling-linux}
+}
+
+@article{MAN:linux/cfs/pelt,
+  title={Per-entity load tracking},
+  author={Corbet, Jonathan},
+  journal={LWN article, available at: https://lwn.net/Articles/531853},
+  year={2013}
+}
+
+@article{MAN:linux/cfs/balancing,
+  title={Reworking {CFS} load balancing},
+  journal={LWN article, available at: https://lwn.net/Articles/793427/},
+  year={2019}
+}
+
+@manual{MAN:linux/sched,
+  title = {SCHED(7) - Linux Programmer's Manual},
+  url   = {https://man7.org/linux/man-pages/man7/sched.7.html},
+  year  = {2019},
+  month = {august}
+}
+
+% Apple's MAC OS X
+@manual{MAN:apple/scheduler,
+  title = {Mach Scheduling and Thread Interfaces - Kernel Programming Guide},
+  organization = {Apple Inc.},
+  url = {https://developer.apple.com/library/archive/documentation/Darwin/Conceptual/KernelProgramming/scheduler/scheduler.html}
+}
+
+% Windows' Scheduler
+@inbook{MAN:windows/scheduler,
+  author = {Kate Chase and Mark E. Russinovich},
+  title = {Windows Internals},
+  chapter = {Processes, Threads, and Jobs in the Windows Operating System},
+  edition = {5th Edition},
+  publisher = {Microsoft Press},
+  year = {2009},
+  month = {June},
+  series = {Developer Reference},
+  url = {https://www.microsoftpressstore.com/articles/article.aspx?p=2233328&seqNum=7#:~:text=Overview\%20of\%20Windows\%20Scheduling,a\%20phenomenon\%20called\%20processor\%20affinity}
+}
+
+@online{GITHUB:go,
+  title = {GitHub - The Go Programming Language},
+  author = {The Go Programming Language},
+  url = {https://github.com/golang/go},
+  version = {Change-Id: If07f40b1d73b8f276ee28ffb8b7214175e56c24d}
+}
+
+@inproceedings{YTUBE:go,
+  author = {Dmitry Vyukov},
+  title = {Go scheduler: Implementing language with lightweight concurrency},
+  year = {2019},
+  booktitle = {Hydra},
+  url = {https://www.youtube.com/watch?v=-K11rY57K7k&ab_channel=Hydra}
+}
+
+@inproceedings{:erlang,
+  author = {Kenneth Lundin (Ericsson AB)},
+  title = {Inside the Erlang VM},
+  year = {2008},
+  booktitle = {Erlang User Conference},
+  url = {http://www.erlang.se/euc/08/euc_smp.pdf}
+}
+
+
+
+@manual{MAN:tbb/scheduler,
+  title = {Scheduling Algorithm - Intel{\textregistered} Threading Building Blocks Developer Reference},
+  organization = {Intel{\textregistered}},
+  url = {https://www.threadingbuildingblocks.org/docs/help/reference/task_scheduler/scheduling_algorithm.html}
+}
+
+@manual{MAN:quasar,
+  title = {Quasar Core - Quasar User Manual},
+  organization = {Parallel Universe},
+  url = {https://docs.paralleluniverse.co/quasar/}
+}
+@misc{MAN:project-loom,
+  url = {https://www.baeldung.com/openjdk-project-loom}
+}
+
+@misc{MAN:java/fork-join,
+  url = {https://www.baeldung.com/java-fork-join}
+}
+
+% --------------------------------------------------
+% Wikipedia Entries
+@misc{wiki:taskparallel,
+  author = "{Wikipedia contributors}",
+  title = "Task parallelism --- {W}ikipedia{,} The Free Encyclopedia",
+  year = "2020",
+  url = "https://en.wikipedia.org/wiki/Task_parallelism",
+  note = "[Online; accessed 22-October-2020]"
+}
+
+@misc{wiki:controltheory,
+  author = "{Wikipedia contributors}",
+  title = "Control theory --- {W}ikipedia{,} The Free Encyclopedia",
+  year = "2020",
+  url = "https://en.wikipedia.org/wiki/Control_theory",
+  note = "[Online; accessed 22-October-2020]"
+}
+
+@misc{wiki:implicitpar,
+  author = "{Wikipedia contributors}",
+  title = "Implicit parallelism --- {W}ikipedia{,} The Free Encyclopedia",
+  year = "2020",
+  url = "https://en.wikipedia.org/wiki/Implicit_parallelism",
+  note = "[Online; accessed 23-October-2020]"
+}
+
+@misc{wiki:explicitpar,
+  author = "{Wikipedia contributors}",
+  title = "Explicit parallelism --- {W}ikipedia{,} The Free Encyclopedia",
+  year = "2017",
+  url = "https://en.wikipedia.org/wiki/Explicit_parallelism",
+  note = "[Online; accessed 23-October-2020]"
+}
Index: doc/theses/thierry_delisle_PhD/thesis/text/core.tex
===================================================================
--- doc/theses/thierry_delisle_PhD/thesis/text/core.tex	(revision 223a63306c486d5cffd07812bcf08afea940d3c4)
+++ doc/theses/thierry_delisle_PhD/thesis/text/core.tex	(revision b9537e6e28ae020f54097b3d63e5a8bebeaf1cde)
@@ -1,15 +1,34 @@
 \chapter{Scheduling Core}\label{core}
 
-This chapter addresses the need of scheduling on a somewhat ideal scenario
+Before discussing scheduling in general, where it is important to address systems that are changing states, this document discusses scheduling in a somewhat ideal scenario, where the system has reached a steady state. For this purpose, a steady state is loosely defined as a state where there are always \glspl{thrd} ready to run but the system has the resources necessary to accomplish the work. In short, the system is neither overloaded nor underloaded.
 
-\section{Existing Schedulers}
-\subsection{Feedback Scheduling}
+I believe it is important to discuss the steady state first because it is the easiest case to handle and, relatedly, the case in which the best performance is to be expected. As such, when the system is either overloaded or underloaded, a common approach is to try to adapt the system to the new load and return to the steady state. Flaws in the scheduling in the steady state tend therefore to be pervasive in all states.
 
-\subsection{Priority Scheduling}\label{priority}
+\section{Design Goals}
+As with most of the design decisions behind \CFA, the main goal is to match the expectation of the programmer, according to their probable mental model. To match these expectations, the design must offer the programmers sufficient guarantees so that, as long as the programmer respects the mental model, the system will also respect this model.
 
-\subsection{Work Stealing}
+For threading, a simple and common mental model is the ``Ideal multi-tasking CPU'' :
+
+\begin{displayquote}[Linux CFS\cit{https://www.kernel.org/doc/Documentation/scheduler/sched-design-CFS.txt}]
+	{[The]} ``Ideal multi-tasking CPU'' is a (non-existent  :-)) CPU that has 100\% physical power and which can run each task at precise equal speed, in parallel, each at [an equal fraction of the] speed.  For example: if there are 2 tasks running, then it runs each at 50\% physical power --- i.e., actually in parallel.
+\end{displayquote}
+
+Applied to threads, this model states that every ready \gls{thrd} immediately runs in parallel with all other ready \glspl{thrd}. While a strict implementation of this model is not feasible, programmers still have expectations about scheduling that come from this model.
+
+In general, the expectation at the center of this model is that ready \glspl{thrd} do not interfere with each other but simply share the hardware. This makes it easier to reason about threading because ready \glspl{thrd} can be taken in isolation and the effect of the scheduler can be virtually ignored. This expectation of \gls{thrd} independence means the scheduler is expected to offer 2 guarantees:
+\begin{enumerate}
+	\item A fairness guarantee: a \gls{thrd} that is ready to run will not be prevented from doing so by another thread.
+	\item A performance guarantee: a \gls{thrd} that wants to start or stop running will not be slowed down by other threads wanting to do the same.
+\end{enumerate}
+
+It is important to note that these guarantees are expected only up to a point. \Glspl{thrd} that are ready to run should not be prevented from doing so, but they still need to share a limited amount of hardware. Therefore, the guarantee is considered respected if a \gls{thrd} gets access to a \emph{fair share} of the hardware, even if that share is very small.
+
+Similarly to the performance guarantee, the lack of interference between threads is only relevant up to a point. Ideally the cost of running and blocking would be constant regardless of contention, but the guarantee is considered satisfied if the cost is not \emph{too high} with or without contention. How much is an acceptable cost is obviously highly variable. For this document the performance experimentation will attempt to show that the cost of scheduling is not a major factor in application performance. This demonstration can be made by comparing applications built in \CFA to applications built with other languages or other models. If the performance of an application built in \CFA is not meaningfully different than one built with a different runtime, then the scheduler has a negligible impact on performance, \ie its impact can be ignored. Recall from a few paragraphs ago that the expectation of programmers is that the impact of the scheduler can be ignored. Therefore, if the cost of scheduling is not a significant portion of the runtime of several different applications, I will consider the guarantee achieved.
+
+\todo{This paragraph should be moved later}
+% The next step is then to decide what is considered a \emph{fair share}, \ie what metric is used to measure fairness. Since \CFA is intended to allow numerous short lived threads, I decided to avoid total CPU time as the measure of fairness. Total CPU time inherently favors new \glspl{thrd} over older ones which isn't necessarily a good thing. Instead, fairness is measured in terms of opportunities to run. This metric is more appropriate for a mix of short and long lived \glspl{thrd}.
 
 \section{Design}
-While avoiding the pitfalls of Feedback Scheduling is fairly easy, scheduling does not innately require feedback, avoiding prioritization of \glspl{thrd} is more difficult because of implicitly priorities, see Subsection~\ref{priority}.
+While avoiding the pitfalls of Feedback Scheduling is fairly easy, since scheduling does not innately require feedback, avoiding prioritization of \glspl{thrd} is more difficult because of implicit priorities, see Subsection~\ref{priority}. A strictly \glsxtrshort{fifo} ready-queue \todo{complete this truncated sentence}
 
 \subsection{Sharding}
@@ -19,7 +38,8 @@
 		\input{base.pstex_t}
 	\end{center}
-	\caption{Relaxed FIFO list at the base of the scheduler: an array of strictly FIFO lists.
-	The timestamp is in all nodes and cell arrays.}
+	\caption{Relaxed FIFO list}
 	\label{fig:base}
+	List at the base of the scheduler: an array of strictly FIFO lists.
+	The timestamp is in all nodes and cell arrays.
 \end{figure}
 
@@ -28,12 +48,4 @@
 Indeed, if the number of \glspl{thrd} does not far exceed the number of queues, it is probable that several of these queues are empty.
 Figure~\ref{fig:empty} shows an example with 2 \glspl{thrd} running on 8 queues, where the chances of getting an empty queue is 75\% per pick, meaning two random picks yield a \gls{thrd} only half the time.
-This can lead to performance problems since picks that do not yield a \gls{thrd} are not useful and do not necessarily help make more informed guesses.
-
-Solutions to this problem can take many forms, but they ultimately all have to encode where the threads are in some form. My results show that the density and locality of this encoding is generally the dominating factor in these scheme.
-
-\paragraph{Dense Information}
-
-
-
 
 
@@ -42,5 +54,12 @@
 		\input{empty.pstex_t}
 	\end{center}
-	\caption{``More empty'' state of the queue: the array contains many empty cells.}
+	\caption{``More empty'' Relaxed FIFO list}
 	\label{fig:empty}
+	Emptier state of the queue: the array contains many empty cells, that is strictly FIFO lists containing no elements.
 \end{figure}
+
+This can lead to performance problems since picks that do not yield a \gls{thrd} are not useful and do not necessarily help make more informed guesses.
+
+Solutions to this problem can take many forms, but they ultimately all have to encode where the threads are in some form. My results show that the density and locality of this encoding is generally the dominating factor in these schemes.
+
+\paragraph{Dense Information}
Index: doc/theses/thierry_delisle_PhD/thesis/text/existing.tex
===================================================================
--- doc/theses/thierry_delisle_PhD/thesis/text/existing.tex	(revision b9537e6e28ae020f54097b3d63e5a8bebeaf1cde)
+++ doc/theses/thierry_delisle_PhD/thesis/text/existing.tex	(revision b9537e6e28ae020f54097b3d63e5a8bebeaf1cde)
@@ -0,0 +1,137 @@
+\chapter{Previous Work}\label{existing}
+Scheduling is a topic with a very long history, predating its use in computer science. As such, early work in computer science was inspired by other fields and focused principally on solving scheduling upfront rather than as the system is running.
+
+\section{Naming Convention}
+Scheduling has been studied by various different communities concentrating on different incarnations of the same problems. As a result, there is no real naming convention for scheduling that is respected across these communities. For this document, I will use the term \newterm{task} to refer to the abstract objects being scheduled and the term \newterm{worker} to refer to the objects which will execute these tasks.
+
+\section{Static Scheduling}
+Static schedulers require that programmers explicitly and exhaustively specify dependencies among tasks in order to schedule them. The scheduler then processes this input ahead of time and produces a \newterm{schedule} to which the system can later adhere. An example application for these schedulers \todo{complete this truncated sentence}
+
+In general, static schedulers are less relevant to this project since they require input from the programmers that \CFA does not have as part of its concurrency semantic.
+\todo{Rate-monotonic scheduling}
+
+
+\section{Dynamic Scheduling}
+It may be difficult to fulfill the requirements of a static scheduler if dependencies are conditional. In this case, it may be preferable to detect dependencies at runtime. This detection effectively takes the form of halting or suspending a task with unfulfilled dependencies and adding one or more new task(s) to the system. The new task(s) have the responsibility of adding the dependent task back in the system once completed. As a consequence, the scheduler may have an incomplete view of the system, seeing only tasks with no pending dependencies. Schedulers that support this detection at runtime are referred to as \newterm{Dynamic Schedulers}.
+
+\subsection{Explicitly Informed Dynamic Schedulers}
+While dynamic schedulers do not have access to an exhaustive list of dependencies for a task, they may require programmers to provide more or less information about each task, including for example: expected duration, required resources, relative importance, etc. The scheduler can then use this information to direct the scheduling decisions. \cit{Examples of schedulers with more information} Precisely providing this information can be difficult for programmers, especially \emph{predicted} behaviour, and the scheduler may need to support some amount of imprecision in the provided information. For example, specifying that a task takes approximately 5 seconds to complete, rather than exactly 5 seconds. User provided information can also become a significant burden depending how the effort to provide the information scales with the number of tasks and their complexity. For example, providing an exhaustive list of files read by 5 tasks is an easier requirement than providing an exhaustive list of memory addresses accessed by 10'000 distinct tasks.
+
+Since the goal of this thesis is to provide a scheduler as a replacement for \CFA's existing \emph{uninformed} scheduler, Explicitly Informed schedulers are less relevant to this project. Nevertheless, some strategies are worth mentioning.
+
+\subsubsection{Priority Scheduling}
+A common piece of information that schedulers use to direct the algorithm is priorities. Each task is given a priority and higher-priority tasks are preferred to lower-priority ones. The simplest priority scheduling algorithm is to simply require that every task have a distinct pre-established priority and always run the available task with the highest priority. Asking programmers to provide an exhaustive set of unique priorities can be prohibitive when the system has a large number of tasks. It can therefore be desirable for schedulers to support tasks with identical priorities and/or automatically setting and adjusting priorities for tasks.
+
+\subsection{Uninformed and Self-Informed Dynamic Schedulers}
+Several scheduling algorithms do not require programmers to provide additional information on each task, and instead make scheduling decisions based solely on internal state and/or information implicitly gathered by the scheduler.
+
+
+\subsubsection{Feedback Scheduling}
+As mentioned, schedulers may also gather information about each task to direct their decisions. This design effectively moves the scheduler to some extent into the realm of \newterm{Control Theory}\cite{wiki:controltheory}. This gathering does not generally involve programmers and as such does not increase programmer burden the same way explicitly provided information may. However, some feedback schedulers do offer programmers the option to provide additional information on certain tasks, in order to direct scheduling decisions. The important distinction being whether or not the scheduler can function without this additional information.
+
+Feedback scheduler
+
+
+\section{Work Stealing}
+One of the most popular scheduling algorithms in practice (see~\ref{existing:prod}) is work-stealing. This idea, introduced by \cite{DBLP:conf/fpca/BurtonS81}, effectively has each worker work on its local tasks first, but allows the possibility for other workers to steal local tasks if they run out of tasks. \cite{DBLP:conf/focs/Blumofe94} introduced the more familiar incarnation of this, where each worker has a queue of tasks to accomplish and workers without tasks steal tasks from random workers. (The Burton and Sleep algorithm had trees of tasks and stole only among neighbours.) Blumofe and Leiserson also prove worst case space and time requirements for well-structured computations.
+
+Many variations of this algorithm have been proposed over the years\cite{DBLP:journals/ijpp/YangH18}, both optimizations of existing implementations and approaches that account for new metrics.
+
+\paragraph{Granularity} A significant portion of early Work Stealing research concentrated on \newterm{Implicit Parallelism}\cite{wiki:implicitpar}. Since the system is responsible for splitting the work, granularity is a challenge that cannot be left to the programmers (as opposed to \newterm{Explicit Parallelism}\cite{wiki:explicitpar} where the burden can be left to programmers). In general, fine granularity is better for load balancing and coarse granularity reduces communication overhead. The best performance generally means finding a middle ground between the two. Several methods can be employed, but I believe these are less relevant for threads, which are generally explicit and more coarse grained.
+
+\paragraph{Task Placement} Since modern computers rely heavily on cache hierarchies\cit{Do I need a citation for this}, migrating tasks from one core to another can be expensive. \cite{DBLP:journals/tpds/SquillanteL93}
+
+\TODO{The survey is not great on this subject}
+
+\paragraph{Complex Machine Architecture} Another aspect that has been looked at is how well Work Stealing is applicable to different machine architectures.
+
+\subsection{Theoretical Results}
+There is also a large body of research on the theoretical aspects of work stealing. These evaluate, for example, the cost of migration\cite{DBLP:conf/sigmetrics/SquillanteN91,DBLP:journals/pe/EagerLZ86}, how affinity affects performance\cite{DBLP:journals/tpds/SquillanteL93,DBLP:journals/mst/AcarBB02,DBLP:journals/ipl/SuksompongLS16} and theoretical models for heterogenous systems\cite{DBLP:journals/jpdc/MirchandaneyTS90,DBLP:journals/mst/BenderR02,DBLP:conf/sigmetrics/GastG10}. \cite{DBLP:journals/jacm/BlellochGM99} examine the space bounds of Work Stealing and \cite{DBLP:journals/siamcomp/BerenbrinkFG03} show that for underloaded systems, the scheduler will complete computations in finite time, \ie is \newterm{stable}. Others show that Work-Stealing is applicable to various scheduling contexts\cite{DBLP:journals/mst/AroraBP01,DBLP:journals/anor/TchiboukdjianGT13,DBLP:conf/isaac/TchiboukdjianGTRB10,DBLP:conf/ppopp/AgrawalLS10,DBLP:conf/spaa/AgrawalFLSSU14}. \cite{DBLP:conf/ipps/ColeR13} also studied how Randomized Work Stealing affects false sharing among tasks.
+
+However, as \cite{DBLP:journals/ijpp/YangH18} highlights, it is worth mentioning that this theoretical research has mainly focused on ``fully-strict'' computations, \ie workloads that can be fully represented with a Directed Acyclic Graph. It is unclear how well these distributions represent workloads in real world scenarios.
+
+\section{Preemption}
+One last aspect of scheduling worth mentioning is preemption since many schedulers rely on it for some of their guarantees. Preemption is the idea of interrupting tasks that have been running for too long, effectively injecting suspend points in the applications. There are multiple techniques to achieve this but they all aim to have the effect of guaranteeing that suspend points in a task are never further apart than some fixed duration. While this helps schedulers guarantee that no tasks will unfairly monopolize a worker, preemption can effectively be added to any scheduler. Therefore, the only interesting aspect of preemption for the design of scheduling is whether or not to require it.
+
+\section{Schedulers in Production}\label{existing:prod}
+This section presents a quick overview of several schedulers which are generally available at the time of writing. While these schedulers don't necessarily represent the most recent advances in scheduling, they are what is generally accessible to programmers. As such, I believe that these schedulers are at least as relevant as those presented in published work. I chose schedulers that operate both in kernel space and in user space, as both can offer relevant insight for this project. However, I did not list any schedulers aimed at real-time applications, as these have constraints that are much stricter than what is needed for this project.
+
+\subsection{Operating System Schedulers}
+Operating System Schedulers tend to be fairly complex: they generally support some amount of real-time scheduling, aim to balance interactive and non-interactive tasks and support multiple users sharing hardware without requiring these users to cooperate. Here are more details on a few schedulers used in common operating systems: Linux, FreeBSD, Microsoft Windows and Apple's OS X. The information is less complete for closed-source operating systems.
+
+\paragraph{Linux's CFS}
+The default scheduler used by Linux (the Completely Fair Scheduler)\cite{MAN:linux/cfs,MAN:linux/cfs2} is a feedback scheduler based on CPU time. For each processor, it constructs a Red-Black tree of tasks waiting to run, ordering them by amount of CPU time spent. The scheduler schedules the task that has spent the least CPU time. It also supports the concept of \newterm{Nice values}, which are effectively multiplicative factors on the CPU time spent. The ordering of tasks is also impacted by a group based notion of fairness, where tasks belonging to groups having spent less CPU time are preferred to tasks belonging to groups having spent more CPU time. Linux achieves load-balancing by regularly monitoring the system state\cite{MAN:linux/cfs/balancing} and using some heuristic on the load (currently CPU time spent in the last millisecond plus decayed version of the previous time slots\cite{MAN:linux/cfs/pelt}.).
+
+\cite{DBLP:conf/eurosys/LoziLFGQF16} shows that Linux's CFS also does work-stealing to balance the workload of each processor, but the paper argues this aspect can be improved significantly. The issues highlighted seem to stem from Linux's need to support fairness across tasks \emph{and} across users\footnote{Enforcing fairness across users means, for example, that given two users: one with a single task and the other with one thousand tasks, the user with a single task does not receive one one thousandth of the CPU time.}, increasing the complexity.
+
+Linux also offers a FIFO scheduler, a real-time scheduler which runs the highest-priority task, and a round-robin scheduler, which is an extension of the FIFO scheduler that adds fixed time slices. \cite{MAN:linux/sched}
+
+\paragraph{FreeBSD}
+The ULE scheduler used in FreeBSD\cite{DBLP:conf/bsdcon/Roberson03} is a feedback scheduler similar to Linux's CFS. It uses different data structures and heuristics but also schedules according to some combination of CPU time spent and niceness values. It also periodically balances the load of the system (according to a different heuristic), but uses a simpler Work Stealing approach.
+
+\paragraph{Windows(OS)}
+Microsoft's Operating System's Scheduler\cite{MAN:windows/scheduler} is a feedback scheduler with priorities. It supports 32 levels of priorities, some of which are reserved for real-time and privileged applications. It schedules tasks based on the highest priorities (lowest number) and how much CPU time each task has used. The scheduler may also temporarily adjust priorities after certain effects like the completion of I/O requests.
+
+\todo{load balancing}
+
+\paragraph{Apple OS X}
+Apple programming documentation describes its kernel scheduler as follows:
+\begin{displayquote}
+	OS X is based on Mach and BSD. [...]. It contains an advanced scheduler based on the CMU Mach 3 scheduler. [...] Mach scheduling is based on a system of run queues at various priorities.
+
+	\begin{flushright}
+		-- Kernel Programming Guide \cite{MAN:apple/scheduler}
+	\end{flushright}
+\end{displayquote}
+
+\todo{load balancing}
+
+\subsection{User-Level Schedulers}
+By comparison, user level schedulers tend to be simpler, gathering fewer metrics and avoiding complex notions of fairness. Part of the simplicity is due to the fact that all tasks have the same user, and therefore cooperation is both feasible and probable.
+\paragraph{Go}
+Go's scheduler uses a Randomized Work Stealing algorithm that has a global runqueue(\emph{GRQ}) and each processor(\emph{P}) has both a fixed-size runqueue(\emph{LRQ}) and a high-priority next ``chair'' holding a single element.\cite{GITHUB:go,YTUBE:go} Preemption is present, but only at function call boundaries.
+
+The algorithm is as follows :
+\begin{enumerate}
+	\item Once out of 61 times, directly pick 1 element from the \emph{GRQ}.
+	\item If there is an item in the ``chair'' pick it.
+	\item Else pick an item from the \emph{LRQ}.
+	\item If it was empty steal (len(\emph{GRQ}) / \#of\emph{P}) + 1 items (max 256) from the \emph{GRQ}.
+	\item If it was empty steal \emph{half} the \emph{LRQ} of another \emph{P} chosen randomly.
+\end{enumerate}
+
+\paragraph{Erlang}
+Erlang is a functional language that supports concurrency in the form of processes, threads that share no data. It seems to use some kind of Round-Robin Scheduler. It currently uses some mix of Work Sharing and Work Stealing to achieve load balancing\cite{:erlang}, where underloaded workers steal from other workers, but overloaded workers also push work to other workers. This migration logic seems to be directed by monitoring logic that evaluates the load a few times per second.
+
+\paragraph{Intel\textregistered ~Threading Building Blocks}
+\newterm{Thread Building Blocks}(TBB) is Intel's task parallelism\cite{wiki:taskparallel} framework. It runs tasks or \newterm{jobs}, schedulable objects that must always run to completion, on a pool of worker threads. TBB's scheduler is a variation of Randomized Work Stealing that also supports higher-priority graph-like dependencies\cite{MAN:tbb/scheduler}. It schedules tasks as follows (where \textit{t} is the last task completed):
+\begin{displayquote}
+	\begin{enumerate}
+		\item The task returned by \textit{t}\texttt{.execute()}
+		\item The successor of t if \textit{t} was its last completed predecessor.
+		\item A task popped from the end of the thread’s own deque.
+		\item A task with affinity for the thread.
+		\item A task popped from approximately the beginning of the shared queue.
+		\item A task popped from the beginning of another randomly chosen thread’s deque.
+	\end{enumerate}
+
+	\begin{flushright}
+		-- Intel\textregistered ~TBB documentation \cite{MAN:tbb/scheduler}
+	\end{flushright}
+\end{displayquote}
+
+\paragraph{Quasar/Project Loom}
+Java has two projects that are attempting to introduce lightweight threading into Java in the form of Fibers, Quasar\cite{MAN:quasar} and Project Loom\cite{MAN:project-loom}\footnote{It is unclear to me if these are distinct projects or not}. Both projects seem to be based on the \texttt{ForkJoinPool} in Java which appears to be a simple incarnation of Randomized Work Stealing\cite{MAN:java/fork-join}.
+
+\paragraph{Grand Central Dispatch}
+This is an API produced by Apple\cit{Official GCD source} that offers task parallelism\cite{wiki:taskparallel}. Its distinctive aspect is that it uses multiple ``Dispatch Queues'', some of which are created by programmers. These queues each have their own local ordering guarantees, \eg tasks on queue $A$ are executed in \emph{FIFO} order.
+
+\todo{load balancing and scheduling}
+
+% http://web.archive.org/web/20090920043909/http://images.apple.com/macosx/technology/docs/GrandCentral_TB_brief_20090903.pdf
+
+In terms of semantics, the Dispatch Queues seem to be very similar to Intel\textregistered ~TBB \texttt{execute()} and predecessor semantics, to the point where it would be possible to convert from one to the other.
+
+\paragraph{LibFibre}
+LibFibre\cite{DBLP:journals/pomacs/KarstenB20} is a light-weight user-level threading framework developed at the University of Waterloo. Similarly to Go, it uses a variation of Work Stealing with a global queue that is higher priority than stealing. Unlike Go it does not have the high-priority next ``chair'' and does not use Randomized Work Stealing.
+
Index: doc/theses/thierry_delisle_PhD/thesis/text/front.tex
===================================================================
--- doc/theses/thierry_delisle_PhD/thesis/text/front.tex	(revision 223a63306c486d5cffd07812bcf08afea940d3c4)
+++ doc/theses/thierry_delisle_PhD/thesis/text/front.tex	(revision b9537e6e28ae020f54097b3d63e5a8bebeaf1cde)
@@ -184,6 +184,10 @@
 \phantomsection		% allows hyperref to link to the correct page
 
+% TODOs and missing citations
+% -----------------------------
 \listofcits
 \listoftodos
+\cleardoublepage
+\phantomsection		% allows hyperref to link to the correct page
 
 
Index: doc/theses/thierry_delisle_PhD/thesis/text/intro.tex
===================================================================
--- doc/theses/thierry_delisle_PhD/thesis/text/intro.tex	(revision 223a63306c486d5cffd07812bcf08afea940d3c4)
+++ doc/theses/thierry_delisle_PhD/thesis/text/intro.tex	(revision b9537e6e28ae020f54097b3d63e5a8bebeaf1cde)
@@ -1,1 +1,9 @@
-\chapter{Introduction}
+\chapter*{Introduction}\label{intro}
+\todo{A proper intro}
+
+The C programming language\cit{C}
+
+The \CFA programming language\cite{cfa:frontpage,cfa:typesystem} extends the C programming language by adding modern safety and productivity features while maintaining backwards compatibility. Among its productivity features, \CFA introduces support for threading\cit{CFA Concurrency}, to allow programmers to write modern concurrent and parallel programs.
+While previous work on the concurrent package of \CFA focused on features and interfaces, this thesis focuses on performance, introducing \glsxtrshort{api} changes only when required by performance considerations. More specifically, this thesis concentrates on scheduling and \glsxtrshort{io}. Prior to this work, the \CFA runtime used a strictly \glsxtrshort{fifo} \gls{rQ}.
+
+This work exclusively concentrates on Linux as its operating system since the existing \CFA runtime and compiler do not already support other operating systems. Furthermore, as \CFA is yet to be released, supporting versions of Linux older than the latest version is not a goal of this work.
Index: doc/theses/thierry_delisle_PhD/thesis/text/io.tex
===================================================================
--- doc/theses/thierry_delisle_PhD/thesis/text/io.tex	(revision 223a63306c486d5cffd07812bcf08afea940d3c4)
+++ doc/theses/thierry_delisle_PhD/thesis/text/io.tex	(revision b9537e6e28ae020f54097b3d63e5a8bebeaf1cde)
@@ -1,5 +1,7 @@
-\chapter{I/O}
+\chapter{User Level \glsxtrshort{io}}
+As mentioned in Section~\ref{prev:io}, User-Level \glsxtrshort{io} requires multiplexing the \glsxtrshort{io} operations of many \glspl{thrd} onto fewer \glspl{proc} using asynchronous \glsxtrshort{io} operations. Various operating systems offer various forms of asynchronous operations and as mentioned in Chapter~\ref{intro}, this work is exclusively focused on Linux.
 
 \section{Existing options}
+Since \glsxtrshort{io} operations are generally handled by the
 
 \subsection{\texttt{epoll}, \texttt{poll} and \texttt{select}}
@@ -7,7 +9,32 @@
 \subsection{Linux's AIO}
 
+
+
+\begin{displayquote}
+	AIO is a horrible ad-hoc design, with the main excuse being "other,
+	less gifted people, made that design, and we are implementing it for
+	compatibility because database people - who seldom have any shred of
+	taste - actually use it".
+
+	But AIO was always really really ugly.
+
+	\begin{flushright}
+		-- Linus Torvalds\cit{https://lwn.net/Articles/671657/}
+	\end{flushright}
+\end{displayquote}
+
+Interestingly, in this e-mail answer, Linus goes on to describe
+``a true \textit{asynchronous system call} interface''
+that does
+``[an] arbitrary system call X with arguments A, B, C, D asynchronously using a kernel thread''
+in
+``some kind of arbitrary \textit{queue up asynchronous system call} model''.
+This description is actually quite close to the interface described in the next section.
+
 \subsection{\texttt{io\_uring}}
+A very recent addition to Linux, \texttt{io\_uring}\cit{io\_uring} is a framework that aims to solve many of the problems listed with the above mentioned solutions.
 
-\subsection{Extra Kernel Threads}
+\subsection{Extra Kernel Threads}\label{io:morethreads}
+Finally, if the operating system does not offer any satisfying forms of asynchronous \glsxtrshort{io} operations, a solution is to fake it by creating a pool of \glspl{kthrd} and delegating operations to them in order to avoid blocking \glspl{proc}.
 
 \subsection{Discussion}
Index: doc/theses/thierry_delisle_PhD/thesis/text/practice.tex
===================================================================
--- doc/theses/thierry_delisle_PhD/thesis/text/practice.tex	(revision 223a63306c486d5cffd07812bcf08afea940d3c4)
+++ doc/theses/thierry_delisle_PhD/thesis/text/practice.tex	(revision b9537e6e28ae020f54097b3d63e5a8bebeaf1cde)
@@ -2,9 +2,5 @@
 The scheduling algorithm discribed in Chapter~\ref{core} addresses scheduling in a stable state.
 However, it does not address problems that occur when the system changes state.
-Indeed the \CFA runtime, supports expanding and shrinking
-
-the number of KTHREAD\_place
-
-, both manually and, to some extent automatically.
+Indeed the \CFA runtime, supports expanding and shrinking the number of KTHREAD\_place \todo{add kthrd to glossary}, both manually and, to some extent automatically.
 This entails that the scheduling algorithm must support these transitions.
 
Index: doc/theses/thierry_delisle_PhD/thesis/text/runtime.tex
===================================================================
--- doc/theses/thierry_delisle_PhD/thesis/text/runtime.tex	(revision 223a63306c486d5cffd07812bcf08afea940d3c4)
+++ doc/theses/thierry_delisle_PhD/thesis/text/runtime.tex	(revision b9537e6e28ae020f54097b3d63e5a8bebeaf1cde)
@@ -1,7 +1,44 @@
 \chapter{\CFA Runtime}
+This chapter offers an overview of the capabilities of the \CFA runtime prior to this work.
 
-\section{M:N Threading}
+Threading in \CFA is based on \Gls{uthrding}, where \glspl{thrd} are the representation of a unit of work. As such, \CFA programmers should expect these units to be fairly inexpensive, that is: programmers should be able to create a large number of \glspl{thrd} and switch between \glspl{thrd} liberally without much concern for performance.
+
+\section{M:N Threading}\label{prev:model}
+
+C traditionally uses a 1:1 threading model. This model uses \glspl{kthrd} to achieve parallelism and concurrency. In this model, every thread of computation maps to an object in the kernel. The kernel then has the responsibility of managing these threads, \eg creating, scheduling, blocking. This also entails that the kernel has a perfect view of every thread executing in the system\footnote{This is not completely true due to primitives like \texttt{futex}es, which have a significant portion of their logic in user space.}.
+
+By contrast, \CFA uses an M:N threading model, where concurrency is achieved using many user-level threads mapped onto fewer \glspl{kthrd}. The user-level threads have the same semantic meaning as \glspl{kthrd} in the 1:1 model: they represent an independent thread of execution with its own stack. The difference is that user-level threads do not have a corresponding object in the kernel; they are handled by the runtime in user space and scheduled onto \glspl{kthrd}, referred to as \glspl{proc} in this document. \Glspl{proc} run a \gls{thrd} until it context switches out; it then chooses a different \gls{thrd} to run.
 
 \section{Clusters}
+\begin{figure}
+	\begin{center}
+		\input{system.pstex_t}
+	\end{center}
+	\caption{Overview of the \CFA runtime}
+	\label{fig:system}
+	\Glspl{thrd} are scheduled inside a particular cluster, where they only run on the \glspl{proc} which belong to the cluster. The discrete-event manager, which handles preemption and timeout, is a \gls{kthrd} which lives outside any cluster and does not run \glspl{thrd}.
+\end{figure}
+\CFA allows the option to group user-level threading, in the form of clusters. Both \glspl{thrd} and \glspl{proc} belong to a specific cluster. \Glspl{thrd} will only be scheduled onto \glspl{proc} in the same cluster and scheduling is done independently of other clusters. Figure~\ref{fig:system} shows an overview of this system. This allows programmers to control parallelism more tightly. It also opens the door to handling effects like NUMA, by pinning clusters to specific NUMA nodes\footnote{This is not currently implemented in \CFA, but the only hurdle left is creating a generic interface for cpu masks.}.
+
+\section{Scheduling}
+The \CFA runtime was previously using a strictly \glsxtrshort{fifo} ready queue with a single lock. This setup offers perfect fairness in terms of opportunities to run. However, it offers poor scalability, since the performance of the ready queue can never be improved by adding more \glspl{hthrd}, but the contention can cause significant performance degradation.
+
+\section{\glsxtrshort{io}}\label{prev:io}
+Prior to this work, the \CFA runtime did not add any particular support for \glsxtrshort{io} operations. \CFA being built on C, this means that, while all the operations available in C are available in \CFA, \glsxtrshort{io} operations are designed for the POSIX threading model\cit{pthreads}. Using these operations in an M:N threading model, when they are built for 1:1 threading, means that operations block \glspl{proc} instead of \glspl{thrd}. While this can work in certain cases, it limits the number of concurrent operations to the number of \glspl{proc} rather than \glspl{thrd}. This also means that deadlocks can occur because all \glspl{proc} are blocked even if at least one \gls{thrd} is ready to run. A simple example of this type of deadlock would be as follows:
+
+Given a simple network program with 2 \glspl{thrd} and a single \gls{proc}, one \gls{thrd} sends network requests to a server and the other \gls{thrd} waits for responses from the server. If the second \gls{thrd} races ahead, it may wait for responses to requests that have not been sent yet. In theory, this should not be a problem: even if the second \gls{thrd} waits, the first \gls{thrd} is still ready to run and should just be able to get CPU time and send the request. In practice with M:N threading, while the first \gls{thrd} is ready, the lone \gls{proc} in this example will \emph{not} try to run the first \gls{thrd} if it is blocked in the \glsxtrshort{io} operation of the second \gls{thrd}. If this happens, the system is effectively deadlocked\footnote{In this example, the deadlock could be resolved if the server sends unprompted messages to the client. However, this solution is not general and may not be appropriate even in this simple case.}.
+
+One of the objectives of this work is to introduce \emph{User-Level \glsxtrshort{io}} which, as a parallel to \glslink{uthrding}{User-Level \emph{Threading}}, blocks \glspl{thrd} rather than \glspl{proc} when doing \glsxtrshort{io} operations. This entails multiplexing the \glsxtrshort{io} operations of many \glspl{thrd} onto fewer \glspl{proc}. This multiplexing requires that a single \gls{proc} be able to execute multiple operations in parallel. This cannot be done with operations that block \glspl{proc}, \ie \glspl{kthrd}, since the first operation would prevent starting new operations for its duration. Executing operations in parallel requires \emph{asynchronous} \glsxtrshort{io}, sometimes referred to as \emph{non-blocking}, since the \gls{kthrd} is not blocked.
 
 \section{Interoperating with \texttt{C}}
+While \glsxtrshort{io} operations are the classical example of operations that block \glspl{kthrd}, the challenges mentioned in the previous section do not require \glsxtrshort{io} to be involved. These challenges are a product of blocking system calls rather than \glsxtrshort{io}. C offers no tools to identify whether or not a library function will lead to a blocking system call. This fact means interoperability with C becomes a challenge in an M:N threading model.
+
+Languages like Go and Java, which have strict interoperability with C\cit{JNI, GoLang with C}, can control operations in C by ``sandboxing'' them. They can, for example, delegate C operations to \glspl{kthrd} that are not \glspl{proc}. Sandboxing may help towards guaranteeing that the deadlocks mentioned in the previous section do not occur.
+
+As mentioned in Section~\cit{\CFA intro}, \CFA is binary compatible with C and, as such, trivially supports calls to and from C libraries. Furthermore, interoperability can happen within a single library, through inline code or simply C and \CFA translation units archived together. The fine-grained interoperability between C and \CFA has two consequences:
+\begin{enumerate}
+	\item Precisely identifying C calls that could block is difficult.
+	\item Introducing code where interoperability occurs could have a significant impact on general performance.
+\end{enumerate}
+
+Because of these consequences, this work does not attempt to ``sandbox'' calls to C. It is possible that conflicting calls to C could lead to deadlocks on \CFA's M:N threading model where they would not in the traditional 1:1 threading model. However, I judge that solving this problem in general, in a way that is composable and flexible, is too complex in itself and would add too much work to this thesis. Therefore, it is outside the scope of this thesis.
Index: doc/theses/thierry_delisle_PhD/thesis/thesis.tex
===================================================================
--- doc/theses/thierry_delisle_PhD/thesis/thesis.tex	(revision 223a63306c486d5cffd07812bcf08afea940d3c4)
+++ doc/theses/thierry_delisle_PhD/thesis/thesis.tex	(revision b9537e6e28ae020f54097b3d63e5a8bebeaf1cde)
@@ -121,4 +121,7 @@
 % installation instructions there.
 
+\usepackage{csquotes}
+\usepackage{indentfirst} % as any self-respecting frenchman would
+
 % Setting up the page margins...
 % uWaterloo thesis requirements specify a minimum of 1 inch (72pt) margin at the
@@ -218,9 +221,17 @@
 % separate documents, they would each start with the \chapter command, i.e,
 % do not contain \documentclass or \begin{document} and \end{document} commands.
+\part{Introduction}
 \input{text/intro.tex}
+\input{text/existing.tex}
 \input{text/runtime.tex}
+\part{Design}
 \input{text/core.tex}
 \input{text/practice.tex}
 \input{text/io.tex}
+\part{Evaluation}
+\chapter{Theoretical and Existence Proofs}
+\chapter{Micro-Benchmarks}
+\chapter{Larger-Scale Applications}
+\part{Conclusion \& Annexes}
 
 %----------------------------------------------------------------------
@@ -245,11 +256,11 @@
 \addcontentsline{toc}{chapter}{\textbf{References}}
 
-\bibliography{uw-ethesis}
+\bibliography{local}
 % Tip 5: You can create multiple .bib files to organize your references.
 % Just list them all in the \bibliogaphy command, separated by commas (no spaces).
 
-% The following statement causes the specified references to be added to the bibliography% even if they were not
-% cited in the text. The asterisk is a wildcard that causes all entries in the bibliographic database to be included (optional).
-\nocite{*}
+% % The following statement causes the specified references to be added to the bibliography% even if they were not
+% % cited in the text. The asterisk is a wildcard that causes all entries in the bibliographic database to be included (optional).
+% \nocite{*}
 
 % The \appendix statement indicates the beginning of the appendices.
