Index: doc/papers/ibm_CASCON19/ThreadingModels.fig
===================================================================
--- doc/papers/ibm_CASCON19/ThreadingModels.fig	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
+++ doc/papers/ibm_CASCON19/ThreadingModels.fig	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
@@ -0,0 +1,140 @@
+#FIG 3.2  Produced by xfig version 3.2.5b
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2700 3300 150 150 2700 3300 2850 3300
+1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3150 3300 150 150 3150 3300 3300 3300
+1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 4050 3300 150 150 4050 3300 4200 3300
+1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 4500 3300 150 150 4500 3300 4650 3300
+1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 4950 3300 150 150 4950 3300 5100 3300
+1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 5400 3300 150 150 5400 3300 5550 3300
+1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 5850 3300 150 150 5850 3300 6000 3300
+1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 3600 3300 150 150 3600 3300 3750 3300
+1 3 0 1 0 7 50 -1 15 0.000 1 0.0000 6450 3150 100 100 6450 3150 6550 3150
+1 3 1 1 0 7 50 -1 -1 4.000 1 0.0000 1800 2175 150 150 1800 2175 1950 2175
+1 3 1 1 0 7 50 -1 -1 4.000 1 0.0000 3000 2175 150 150 3000 2175 3150 2175
+1 3 1 1 0 7 50 -1 -1 4.000 1 0.0000 3450 2175 150 150 3450 2175 3600 2175
+1 3 1 1 0 7 50 -1 -1 4.000 1 0.0000 3900 2175 150 150 3900 2175 4050 2175
+1 3 1 1 0 7 50 -1 -1 4.000 1 0.0000 5100 2175 150 150 5100 2175 5250 2175
+1 3 1 1 0 7 50 -1 -1 4.000 1 0.0000 6300 2175 150 150 6300 2175 6450 2175
+1 3 1 1 0 7 50 -1 -1 4.000 1 0.0000 6750 2175 150 150 6750 2175 6900 2175
+1 3 1 1 0 7 50 -1 -1 4.000 1 0.0000 7200 2175 150 150 7200 2175 7350 2175
+1 3 0 1 0 7 50 -1 15 0.000 1 0.0000 1800 2175 100 100 1800 2175 1900 2175
+1 3 0 1 0 7 50 -1 15 0.000 1 0.0000 3000 2175 100 100 3000 2175 3100 2175
+1 3 0 1 0 7 50 -1 15 0.000 1 0.0000 3450 2175 100 100 3450 2175 3550 2175
+1 3 0 1 0 7 50 -1 15 0.000 1 0.0000 3900 2175 100 100 3900 2175 4000 2175
+1 3 0 1 0 7 50 -1 15 0.000 1 0.0000 4650 1425 100 100 4650 1425 4750 1425
+1 3 0 1 0 7 50 -1 15 0.000 1 0.0000 4950 1425 100 100 4950 1425 5050 1425
+1 3 0 1 0 7 50 -1 15 0.000 1 0.0000 5250 1425 100 100 5250 1425 5350 1425
+1 3 0 1 0 7 50 -1 15 0.000 1 0.0000 5550 1425 100 100 5550 1425 5650 1425
+1 3 0 1 0 7 50 -1 15 0.000 1 0.0000 6300 1425 100 100 6300 1425 6400 1425
+1 3 0 1 0 7 50 -1 15 0.000 1 0.0000 6600 1425 100 100 6600 1425 6700 1425
+1 3 0 1 0 7 50 -1 15 0.000 1 0.0000 6900 1425 100 100 6900 1425 7000 1425
+1 3 0 1 0 7 50 -1 15 0.000 1 0.0000 7200 1425 100 100 7200 1425 7300 1425
+1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 6450 3600 150 150 6450 3600 6600 3600
+1 3 1 1 0 7 50 -1 -1 4.000 1 0.0000 6450 3600 100 100 6450 3600 6550 3600
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 1800 2175 2700 3300
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 3000 2175 3150 3300
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 3450 2175 3600 3300
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 3900 2175 4050 3300
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 4500 3300 5100 2175
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 4950 3300 6300 2175
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 5400 3300 6750 2175
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 5850 3300 7200 2175
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 2775 3450 3825 3675
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 3225 3450 3900 3600
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 3675 3450 4050 3600
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 4125 3450 4200 3600
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 4500 3450 4350 3600
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 4950 3450 4500 3600
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 5400 3450 4650 3600
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 5850 3450 4725 3675
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 3825 3750 3375 3900
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 4200 3750 3975 3900
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 4350 3750 4575 3900
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 4650 3750 5175 3900
+2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
+	 3225 3900 3525 3900 3525 4200 3225 4200 3225 3900
+2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
+	 3825 3900 4125 3900 4125 4200 3825 4200 3825 3900
+2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
+	 4425 3900 4725 3900 4725 4200 4425 4200 4425 3900
+2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
+	 5025 3900 5325 3900 5325 4200 5025 4200 5025 3900
+2 2 0 2 4 7 50 -1 -1 0.000 0 0 -1 0 0 5
+	 2400 3000 6150 3000 6150 4350 2400 4350 2400 3000
+2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
+	 6300 3975 6600 3975 6600 4275 6300 4275 6300 3975
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 4950 1500 5100 1650
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 5250 1500 5100 1650
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 4650 1500 4875 1650
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 5550 1500 5325 1650
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 5100 1800 5100 2025
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 6600 1500 6750 1650
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 6900 1500 6750 1650
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 6300 1500 6525 1650
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 7200 1500 6975 1650
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 6750 1800 6750 2025
+2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
+	 4500 1275 5700 1275 5700 2400 4500 2400 4500 1275
+2 2 0 2 4 7 50 -1 -1 0.000 0 0 -1 0 0 5
+	 6000 1275 7500 1275 7500 2400 6000 2400 6000 1275
+2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
+	 2700 1275 4200 1275 4200 2400 2700 2400 2700 1275
+2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
+	 1200 1275 2400 1275 2400 2400 1200 2400 1200 1275
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 6600 1800 6300 2025
+2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
+	 6900 1800 7200 2025
+4 1 0 50 -1 0 12 0.0000 2 135 435 1800 2775 1:1:4\001
+4 1 0 50 -1 0 12 0.0000 2 135 435 3450 2775 3:3:4\001
+4 1 0 50 -1 0 12 0.0000 2 135 435 5100 2775 4:1:4\001
+4 1 0 50 -1 0 12 0.0000 2 135 435 6750 2775 4:3:4\001
+4 2 0 50 -1 0 12 0.0000 2 180 585 2250 3975 System\001
+4 2 0 50 -1 0 12 0.0000 2 180 810 2250 3750 Operating\001
+4 1 0 50 -1 0 12 0.0000 2 135 780 4275 3750 scheduler\001
+4 0 0 50 -1 0 12 0.0000 2 135 885 6750 3225 user thread\001
+4 0 0 50 -1 0 12 0.0000 2 135 1065 6750 3675 kernel thread\001
+4 0 0 50 -1 0 12 0.0000 2 135 375 6750 4200 CPU\001
+4 1 0 50 -1 0 12 0.0000 2 180 1020 1800 1200 Process$_1$\001
+4 1 0 50 -1 0 12 0.0000 2 180 1020 3450 1200 Process$_2$\001
+4 1 0 50 -1 0 12 0.0000 2 180 1020 5100 1200 Process$_3$\001
+4 1 0 50 -1 0 12 0.0000 2 180 1020 6750 1200 Process$_4$\001
+4 1 0 50 -1 0 11 0.0000 2 120 675 5100 1800 scheduler\001
+4 1 0 50 -1 0 11 0.0000 2 120 675 6750 1800 scheduler\001
Index: doc/papers/ibm_CASCON19/ThreadingModels.svg
===================================================================
--- doc/papers/ibm_CASCON19/ThreadingModels.svg	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
+++ doc/papers/ibm_CASCON19/ThreadingModels.svg	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
@@ -0,0 +1,683 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<svg
+   xmlns:dc="http://purl.org/dc/elements/1.1/"
+   xmlns:cc="http://creativecommons.org/ns#"
+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+   xmlns:svg="http://www.w3.org/2000/svg"
+   xmlns="http://www.w3.org/2000/svg"
+   version="1.1"
+   id="svg4438"
+   viewBox="1188 1041 6669 3331"
+   height="2.8in"
+   width="5.6in">
+  <metadata
+     id="metadata4640">
+    <rdf:RDF>
+      <cc:Work
+         rdf:about="">
+        <dc:format>image/svg+xml</dc:format>
+        <dc:type
+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
+        <dc:title></dc:title>
+      </cc:Work>
+    </rdf:RDF>
+  </metadata>
+  <defs
+     id="defs4638" />
+  <g
+     transform="translate(185.25001,-26.464286)"
+     id="g4440"
+     style="fill:none;stroke-width:0.025in">
+    <!-- Circle -->
+    <circle
+       id="circle4442"
+       style="stroke:#000000;stroke-width:7"
+       r="150"
+       cy="3300"
+       cx="2700" />
+    <!-- Circle -->
+    <circle
+       id="circle4444"
+       style="stroke:#000000;stroke-width:7"
+       r="150"
+       cy="3300"
+       cx="3150" />
+    <!-- Circle -->
+    <circle
+       id="circle4446"
+       style="stroke:#000000;stroke-width:7"
+       r="150"
+       cy="3300"
+       cx="4050" />
+    <!-- Circle -->
+    <circle
+       id="circle4448"
+       style="stroke:#000000;stroke-width:7"
+       r="150"
+       cy="3300"
+       cx="4500" />
+    <!-- Circle -->
+    <circle
+       id="circle4450"
+       style="stroke:#000000;stroke-width:7"
+       r="150"
+       cy="3300"
+       cx="4950" />
+    <!-- Circle -->
+    <circle
+       id="circle4452"
+       style="stroke:#000000;stroke-width:7"
+       r="150"
+       cy="3300"
+       cx="5400" />
+    <!-- Circle -->
+    <circle
+       id="circle4454"
+       style="stroke:#000000;stroke-width:7"
+       r="150"
+       cy="3300"
+       cx="5850" />
+    <!-- Circle -->
+    <circle
+       id="circle4456"
+       style="stroke:#000000;stroke-width:7"
+       r="150"
+       cy="3300"
+       cx="3600" />
+    <!-- Circle -->
+    <circle
+       id="circle4458"
+       style="fill:#bfbfbf;stroke:#000000;stroke-width:7"
+       r="100"
+       cy="3150"
+       cx="6450" />
+    <!-- Circle -->
+    <circle
+       id="circle4460"
+       style="stroke:#000000;stroke-width:7;stroke-dasharray:40, 40"
+       r="150"
+       cy="2175"
+       cx="1800" />
+    <!-- Circle -->
+    <circle
+       id="circle4462"
+       style="stroke:#000000;stroke-width:7;stroke-dasharray:40, 40"
+       r="150"
+       cy="2175"
+       cx="3000" />
+    <!-- Circle -->
+    <circle
+       id="circle4464"
+       style="stroke:#000000;stroke-width:7;stroke-dasharray:40, 40"
+       r="150"
+       cy="2175"
+       cx="3450" />
+    <!-- Circle -->
+    <circle
+       id="circle4466"
+       style="stroke:#000000;stroke-width:7;stroke-dasharray:40, 40"
+       r="150"
+       cy="2175"
+       cx="3900" />
+    <!-- Circle -->
+    <circle
+       id="circle4468"
+       style="stroke:#000000;stroke-width:7;stroke-dasharray:40, 40"
+       r="150"
+       cy="2175"
+       cx="5100" />
+    <!-- Circle -->
+    <circle
+       id="circle4470"
+       style="stroke:#000000;stroke-width:7;stroke-dasharray:40, 40"
+       r="150"
+       cy="2175"
+       cx="6300" />
+    <!-- Circle -->
+    <circle
+       id="circle4472"
+       style="stroke:#000000;stroke-width:7;stroke-dasharray:40, 40"
+       r="150"
+       cy="2175"
+       cx="6750" />
+    <!-- Circle -->
+    <circle
+       id="circle4474"
+       style="stroke:#000000;stroke-width:7;stroke-dasharray:40, 40"
+       r="150"
+       cy="2175"
+       cx="7200" />
+    <!-- Circle -->
+    <circle
+       id="circle4476"
+       style="fill:#bfbfbf;stroke:#000000;stroke-width:7"
+       r="100"
+       cy="2175"
+       cx="1800" />
+    <!-- Circle -->
+    <circle
+       id="circle4478"
+       style="fill:#bfbfbf;stroke:#000000;stroke-width:7"
+       r="100"
+       cy="2175"
+       cx="3000" />
+    <!-- Circle -->
+    <circle
+       id="circle4480"
+       style="fill:#bfbfbf;stroke:#000000;stroke-width:7"
+       r="100"
+       cy="2175"
+       cx="3450" />
+    <!-- Circle -->
+    <circle
+       id="circle4482"
+       style="fill:#bfbfbf;stroke:#000000;stroke-width:7"
+       r="100"
+       cy="2175"
+       cx="3900" />
+    <!-- Circle -->
+    <circle
+       id="circle4484"
+       style="fill:#bfbfbf;stroke:#000000;stroke-width:7"
+       r="100"
+       cy="1425"
+       cx="4650" />
+    <!-- Circle -->
+    <circle
+       id="circle4486"
+       style="fill:#bfbfbf;stroke:#000000;stroke-width:7"
+       r="100"
+       cy="1425"
+       cx="4950" />
+    <!-- Circle -->
+    <circle
+       id="circle4488"
+       style="fill:#bfbfbf;stroke:#000000;stroke-width:7"
+       r="100"
+       cy="1425"
+       cx="5250" />
+    <!-- Circle -->
+    <circle
+       id="circle4490"
+       style="fill:#bfbfbf;stroke:#000000;stroke-width:7"
+       r="100"
+       cy="1425"
+       cx="5550" />
+    <!-- Circle -->
+    <circle
+       id="circle4492"
+       style="fill:#bfbfbf;stroke:#000000;stroke-width:7"
+       r="100"
+       cy="1425"
+       cx="6300" />
+    <!-- Circle -->
+    <circle
+       id="circle4494"
+       style="fill:#bfbfbf;stroke:#000000;stroke-width:7"
+       r="100"
+       cy="1425"
+       cx="6600" />
+    <!-- Circle -->
+    <circle
+       id="circle4496"
+       style="fill:#bfbfbf;stroke:#000000;stroke-width:7"
+       r="100"
+       cy="1425"
+       cx="6900" />
+    <!-- Circle -->
+    <circle
+       id="circle4498"
+       style="fill:#bfbfbf;stroke:#000000;stroke-width:7"
+       r="100"
+       cy="1425"
+       cx="7200" />
+    <!-- Circle -->
+    <circle
+       id="circle4500"
+       style="stroke:#000000;stroke-width:7"
+       r="150"
+       cy="3600"
+       cx="6450" />
+    <!-- Circle -->
+    <circle
+       id="circle4502"
+       style="stroke:#000000;stroke-width:7;stroke-dasharray:40, 40"
+       r="100"
+       cy="3600"
+       cx="6450" />
+    <!-- Line -->
+    <polyline
+       id="polyline4504"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="1800,2175 2700,3300 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4506"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="3000,2175 3150,3300 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4508"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="3450,2175 3600,3300 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4510"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="3900,2175 4050,3300 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4512"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="4500,3300 5100,2175 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4514"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="4950,3300 6300,2175 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4516"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="5400,3300 6750,2175 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4518"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="5850,3300 7200,2175 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4520"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="2775,3450 3825,3675 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4522"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="3225,3450 3900,3600 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4524"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="3675,3450 4050,3600 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4526"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="4125,3450 4200,3600 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4528"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="4500,3450 4350,3600 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4530"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="4950,3450 4500,3600 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4532"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="5400,3450 4650,3600 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4534"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="5850,3450 4725,3675 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4536"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="3825,3750 3375,3900 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4538"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="4200,3750 3975,3900 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4540"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="4350,3750 4575,3900 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4542"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="4650,3750 5175,3900 " />
+    <!-- Line: box -->
+    <rect
+       id="rect4544"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       rx="0"
+       height="300"
+       width="300"
+       y="3900"
+       x="3225" />
+    <!-- Line: box -->
+    <rect
+       id="rect4546"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       rx="0"
+       height="300"
+       width="300"
+       y="3900"
+       x="3825" />
+    <!-- Line: box -->
+    <rect
+       id="rect4548"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       rx="0"
+       height="300"
+       width="300"
+       y="3900"
+       x="4425" />
+    <!-- Line: box -->
+    <rect
+       id="rect4550"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       rx="0"
+       height="300"
+       width="300"
+       y="3900"
+       x="5025" />
+    <!-- Line: box -->
+    <rect
+       id="rect4552"
+       style="stroke:#ff0000;stroke-width:15;stroke-linecap:butt;stroke-linejoin:miter"
+       rx="0"
+       height="1350"
+       width="3750"
+       y="3000"
+       x="2400" />
+    <!-- Line: box -->
+    <rect
+       id="rect4554"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       rx="0"
+       height="300"
+       width="300"
+       y="3975"
+       x="6300" />
+    <!-- Line -->
+    <polyline
+       id="polyline4556"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="4950,1500 5100,1650 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4558"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="5250,1500 5100,1650 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4560"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="4650,1500 4875,1650 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4562"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="5550,1500 5325,1650 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4564"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="5100,1800 5100,2025 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4566"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="6600,1500 6750,1650 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4568"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="6900,1500 6750,1650 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4570"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="6300,1500 6525,1650 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4572"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="7200,1500 6975,1650 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4574"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="6750,1800 6750,2025 " />
+    <!-- Line: box -->
+    <rect
+       id="rect4576"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       rx="0"
+       height="1125"
+       width="1200"
+       y="1275"
+       x="4500" />
+    <!-- Line: box -->
+    <rect
+       id="rect4578"
+       style="stroke:#ff0000;stroke-width:15;stroke-linecap:butt;stroke-linejoin:miter"
+       rx="0"
+       height="1125"
+       width="1500"
+       y="1275"
+       x="6000" />
+    <!-- Line: box -->
+    <rect
+       id="rect4580"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       rx="0"
+       height="1125"
+       width="1500"
+       y="1275"
+       x="2700" />
+    <!-- Line: box -->
+    <rect
+       id="rect4582"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       rx="0"
+       height="1125"
+       width="1200"
+       y="1275"
+       x="1200" />
+    <!-- Line -->
+    <polyline
+       id="polyline4584"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="6600,1800 6300,2025 " />
+    <!-- Line -->
+    <polyline
+       id="polyline4586"
+       style="stroke:#000000;stroke-width:7;stroke-linecap:butt;stroke-linejoin:miter"
+       points="6900,1800 7200,2025 " />
+    <!-- Text -->
+    <text
+       style="font-style:normal;font-weight:normal;font-size:144px;font-family:Times;text-anchor:middle;fill:#000000"
+       id="text4588"
+       font-size="144"
+       font-weight="normal"
+       font-style="normal"
+       y="2775"
+       x="1800"
+       xml:space="preserve">1:1</text>
+    <!-- Text -->
+    <text
+       style="font-style:normal;font-weight:normal;font-size:144px;font-family:Times;text-anchor:middle;fill:#000000"
+       id="text4590"
+       font-size="144"
+       font-weight="normal"
+       font-style="normal"
+       y="2775"
+       x="3450"
+       xml:space="preserve">3:3</text>
+    <!-- Text -->
+    <text
+       style="font-style:normal;font-weight:normal;font-size:144px;font-family:Times;text-anchor:middle;fill:#000000"
+       id="text4592"
+       font-size="144"
+       font-weight="normal"
+       font-style="normal"
+       y="2775"
+       x="5100"
+       xml:space="preserve">4:1</text>
+    <!-- Text -->
+    <text
+       style="font-style:normal;font-weight:normal;font-size:144px;font-family:Times;text-anchor:middle;fill:#000000"
+       id="text4594"
+       font-size="144"
+       font-weight="normal"
+       font-style="normal"
+       y="2775"
+       x="6750"
+       xml:space="preserve">4:3</text>
+    <!-- Text -->
+    <text
+       style="font-style:normal;font-weight:normal;font-size:144px;font-family:Times;text-anchor:end;fill:#000000"
+       id="text4596"
+       font-size="144"
+       font-weight="normal"
+       font-style="normal"
+       y="3975"
+       x="2250"
+       xml:space="preserve">System</text>
+    <!-- Text -->
+    <text
+       style="font-style:normal;font-weight:normal;font-size:144px;font-family:Times;text-anchor:end;fill:#000000"
+       id="text4598"
+       font-size="144"
+       font-weight="normal"
+       font-style="normal"
+       y="3750"
+       x="2250"
+       xml:space="preserve">Operating</text>
+    <!-- Text -->
+    <text
+       style="font-style:normal;font-weight:normal;font-size:144px;font-family:Times;text-anchor:middle;fill:#000000"
+       id="text4600"
+       font-size="144"
+       font-weight="normal"
+       font-style="normal"
+       y="3750"
+       x="4275"
+       xml:space="preserve">scheduler</text>
+    <!-- Text -->
+    <text
+       style="font-style:normal;font-weight:normal;font-size:144px;font-family:Times;text-anchor:start;fill:#000000"
+       id="text4602"
+       font-size="144"
+       font-weight="normal"
+       font-style="normal"
+       y="3225"
+       x="6750"
+       xml:space="preserve">user thread</text>
+    <!-- Text -->
+    <text
+       style="font-style:normal;font-weight:normal;font-size:144px;font-family:Times;text-anchor:start;fill:#000000"
+       id="text4604"
+       font-size="144"
+       font-weight="normal"
+       font-style="normal"
+       y="3675"
+       x="6750"
+       xml:space="preserve">kernel thread</text>
+    <!-- Text -->
+    <text
+       style="font-style:normal;font-weight:normal;font-size:144px;font-family:Times;text-anchor:start;fill:#000000"
+       id="text4606"
+       font-size="144"
+       font-weight="normal"
+       font-style="normal"
+       y="4200"
+       x="6750"
+       xml:space="preserve">CPU</text>
+    <!-- Text -->
+    <text
+       style="font-style:normal;font-weight:normal;font-size:144px;font-family:Times;text-anchor:middle;fill:#000000"
+       id="text4608"
+       font-size="144"
+       font-weight="normal"
+       font-style="normal"
+       y="1200"
+       x="1800"
+       xml:space="preserve">Process<tspan
+   style="font-size:96px"
+   id="tspan4610"
+   dy="35"
+   font-size="96">1</tspan><tspan
+   id="tspan4612"
+   dy="-35" /></text>
+    <!-- Text -->
+    <text
+       style="font-style:normal;font-weight:normal;font-size:144px;font-family:Times;text-anchor:middle;fill:#000000"
+       id="text4614"
+       font-size="144"
+       font-weight="normal"
+       font-style="normal"
+       y="1200"
+       x="3450"
+       xml:space="preserve">Process<tspan
+   style="font-size:96px"
+   id="tspan4616"
+   dy="35"
+   font-size="96">2</tspan><tspan
+   id="tspan4618"
+   dy="-35" /></text>
+    <!-- Text -->
+    <text
+       style="font-style:normal;font-weight:normal;font-size:144px;font-family:Times;text-anchor:middle;fill:#000000"
+       id="text4620"
+       font-size="144"
+       font-weight="normal"
+       font-style="normal"
+       y="1200"
+       x="5100"
+       xml:space="preserve">Process<tspan
+   style="font-size:96px"
+   id="tspan4622"
+   dy="35"
+   font-size="96">3</tspan><tspan
+   id="tspan4624"
+   dy="-35" /></text>
+    <!-- Text -->
+    <text
+       style="font-style:normal;font-weight:normal;font-size:144px;font-family:Times;text-anchor:middle;fill:#000000"
+       id="text4626"
+       font-size="144"
+       font-weight="normal"
+       font-style="normal"
+       y="1200"
+       x="6750"
+       xml:space="preserve">Process<tspan
+   style="font-size:96px"
+   id="tspan4628"
+   dy="35"
+   font-size="96">4</tspan><tspan
+   id="tspan4630"
+   dy="-35" /></text>
+    <!-- Text -->
+    <text
+       style="font-style:normal;font-weight:normal;font-size:132px;font-family:Times;text-anchor:middle;fill:#000000"
+       id="text4632"
+       font-size="132"
+       font-weight="normal"
+       font-style="normal"
+       y="1800"
+       x="5100"
+       xml:space="preserve">scheduler</text>
+    <!-- Text -->
+    <text
+       style="font-style:normal;font-weight:normal;font-size:132px;font-family:Times;text-anchor:middle;fill:#000000"
+       id="text4634"
+       font-size="132"
+       font-weight="normal"
+       font-style="normal"
+       y="1800"
+       x="6750"
+       xml:space="preserve">scheduler</text>
+  </g>
+</svg>
Index: doc/papers/ibm_CASCON19/abstract.txt
===================================================================
--- doc/papers/ibm_CASCON19/abstract.txt	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
+++ doc/papers/ibm_CASCON19/abstract.txt	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
@@ -0,0 +1,40 @@
+Synchronous Programming with User-Level Threads in C∀
+
+When computations cannot be satisfied immediately, programmers must use
+one of two paradigms, Synchronous Programming and Asynchronous Programming.
+This presentation discusses the benefits of Synchronous Programming over
+Asynchronous Programming and what requirements exist to use this paradigm.
+It also discusses how C∀, a concurrent and backwards-compatible extension
+of the C programming language, has powerful tools available for simple and
+efficient Synchronous Programming. These tools include user-level threads
+and high-level locking mechanisms which simply solve common problems.
+Finally, the presentation shows different techniques to combine these
+features with existing code that either uses blocking operations provided by
+the kernel or is built according to an asynchronous paradigm.
+
+
+
+
+
+
+
+
+
+C∀ is a polymorphic, non-object-oriented, concurrent, backwards-compatible
+extension of the C programming language. This paper discusses the design
+philosophy and implementation of its advanced control-flow and concurrent/parallel
+features, along with the supporting runtime written in C∀. These features
+are created from scratch as ISO C has only low-level and/or unimplemented
+concurrency, so C programmers continue to rely on library features like pthreads.
+C∀ introduces modern language-level control-flow mechanisms, like generators,
+coroutines, user-level threading, and monitors for mutual exclusion and
+synchronization. The runtime provides significant programmer simplification
+and safety by eliminating spurious wakeup and monitor barging. The runtime
+also ensures multiple monitors can be safely acquired simultaneously (deadlock free),
+and this feature is fully integrated with all monitor synchronization mechanisms.
+All control-flow features integrate with the C∀ polymorphic type-system and
+exception handling, while respecting the expectations and style of C programmers.
+Experimental results show comparable performance of the new features with similar
+mechanisms in other concurrent programming languages.
+KEYWORDS
+generator, coroutine, concurrency, parallelism, thread, monitor, runtime, C, C∀ (Cforall)
Index: doc/papers/ibm_CASCON19/client.cfa
===================================================================
--- doc/papers/ibm_CASCON19/client.cfa	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
+++ doc/papers/ibm_CASCON19/client.cfa	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
@@ -0,0 +1,63 @@
+// Original
+void sum_many_async(int[] inputs, lambda_t callback);
+
+// Target
+int  sum_many      (int[] inputs);
+
+//==================================
+
+int  sum_many(int[] inputs) {
+	// setup required data
+	int result;
+	semaphore_t sem = { 0 };
+
+	// call async function
+	sum_many_async( inputs, (int sum) {
+		result = sum;
+		V(sem);
+	});
+
+	// wait for result
+	P(sem);
+
+	// return
+	return result;
+}
+
+
+int sum_many_block(int[] inputs);
+int sum_many      (int[] inputs);
+
+//==================================
+
+struct request_t {
+	int[] inputs;
+	int result;
+	semaphore_t sem = { 0 };
+};
+
+void * call_sum_many_block(void * args) {
+	request_t * req = (request_t *) args;
+	req->result = sum_many_block(req->inputs);
+	V(req->sem);
+}
+
+int  sum_many(int[] inputs) {
+	// setup required data
+	request_t req = { inputs };
+
+	// create thread to run the blocking call
+	pthread_t thrd;
+	int err = pthread_create(&thrd, 0p, call_sum_many_block, &req);
+	handle(err);
+
+	// wait
+	P(req.sem);
+
+	// Cleanup pthread
+	err = pthread_join(thrd, 0p);
+	handle(err);
+
+	// get result
+	return req.result;
+}
Index: doc/papers/ibm_CASCON19/server.cfa
===================================================================
--- doc/papers/ibm_CASCON19/server.cfa	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
+++ doc/papers/ibm_CASCON19/server.cfa	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
@@ -0,0 +1,13 @@
+
+
+volatile bool shutdown = false;
+
+large_compute_async(unsigned code, int lhs, int rhs, lambda_t callback);
+
+
+void main() {
+	while(!shutdown) {
+
+
+	}
+}
Index: doc/theses/thierry_delisle_PhD/.gitignore
===================================================================
--- doc/theses/thierry_delisle_PhD/.gitignore	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
+++ doc/theses/thierry_delisle_PhD/.gitignore	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
@@ -0,0 +1,13 @@
+*/a.out
+
+code/*.s
+code/relaxed_list
+code/layout.ast
+code/build/
+code/raw/
+
+comp_II/build/
+comp_II/comp_II.pdf
+comp_II/comp_II.ps
+
+!Makefile
Index: doc/theses/thierry_delisle_PhD/code/Makefile
===================================================================
--- doc/theses/thierry_delisle_PhD/code/Makefile	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
+++ doc/theses/thierry_delisle_PhD/code/Makefile	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
@@ -0,0 +1,22 @@
+
+
+CXXFLAGS = -O3 -g -Wall -Wextra -std=c++17
+LDFLAGS = -pthread -latomic
+
+push:
+	clang++ relaxed_list.cpp -g -Wall -Wextra -std=c++17 -fsyntax-only &&  rsync -av relaxed_list.cpp relaxed_list.hpp utils.hpp assert.hpp scale.sh plg7b:~/workspace/sched/.
+
+relaxed_list: $(firstword $(MAKEFILE_LIST)) | build
+	clang++ relaxed_list.cpp $(CXXFLAGS) $(LDFLAGS) -lpng -MMD -MF build/$(@).d -o $(@)
+
+-include build/relaxed_list.d
+
+layout.ast: $(firstword $(MAKEFILE_LIST)) | build
+	clang++ relaxed_list_layout.cpp $(CXXFLAGS) -MMD -MF build/$(@).d -MT $(@) -E -o build/$(@).ii
+	clang++ -Xclang -fdump-record-layouts -fsyntax-only $(CXXFLAGS) build/$(@).ii > build/layout.ast.raw
+	cat build/$(@).raw > $(@)
+
+-include build/layout.ast.d
+
+build:
+	mkdir -p build
Index: doc/theses/thierry_delisle_PhD/code/bts_test.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/bts_test.cpp	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
+++ doc/theses/thierry_delisle_PhD/code/bts_test.cpp	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
@@ -0,0 +1,32 @@
+#include <cassert>
+#include <iostream>
+
+bool bts(volatile size_t & target, size_t bit ) {
+	bool result = false;
+	asm volatile(
+		"LOCK btsq %[bit], %[target]\n\t"
+		:"=c" (result)
+		: [target] "m" (target), [bit] "r" (bit)
+	);
+ 	return result;
+}
+
+bool btr(volatile size_t & target, size_t bit ) {
+	bool result = false;
+	asm volatile(
+		"LOCK btrq %[bit], %[target]\n\t"
+		:"=c" (result)
+		: [target] "m" (target), [bit] "r" (bit)
+	);
+ 	return result;
+}
+
+int main() {
+	volatile size_t i = 0;
+	std::cout << std::hex << i << std::endl;
+	assert(bts(i, 31));
+	std::cout << std::hex << i << std::endl;
+	assert(btr(i, 31));
+	std::cout << std::hex << i << std::endl;
+	return 0;
+}
Index: doc/theses/thierry_delisle_PhD/code/randbit.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/randbit.cpp	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
+++ doc/theses/thierry_delisle_PhD/code/randbit.cpp	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
@@ -0,0 +1,236 @@
+#include <cstddef>
+#include <cstdint>
+#include <x86intrin.h>
+
+__attribute__((noinline)) unsigned nthSetBit(size_t mask, unsigned bit) {
+	uint64_t v = mask;   // Input value to find position with rank r.
+	unsigned int r = bit;// Input: bit's desired rank [1-64].
+	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
+	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
+	unsigned int t;      // Bit count temporary.
+
+	// Do a normal parallel bit count for a 64-bit integer,
+	// but store all intermediate steps.
+	// a = (v & 0x5555...) + ((v >> 1) & 0x5555...);
+	a =  v - ((v >> 1) & ~0UL/3);
+	// b = (a & 0x3333...) + ((a >> 2) & 0x3333...);
+	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
+	// c = (b & 0x0f0f...) + ((b >> 4) & 0x0f0f...);
+	c = (b + (b >> 4)) & ~0UL/0x11;
+	// d = (c & 0x00ff...) + ((c >> 8) & 0x00ff...);
+	d = (c + (c >> 8)) & ~0UL/0x101;
+
+
+	t = (d >> 32) + (d >> 48);
+	// Now do branchless select!
+	s  = 64;
+	// if (r > t) {s -= 32; r -= t;}
+	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
+	t  = (d >> (s - 16)) & 0xff;
+	// if (r > t) {s -= 16; r -= t;}
+	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
+	t  = (c >> (s - 8)) & 0xf;
+	// if (r > t) {s -= 8; r -= t;}
+	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
+	t  = (b >> (s - 4)) & 0x7;
+	// if (r > t) {s -= 4; r -= t;}
+	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
+	t  = (a >> (s - 2)) & 0x3;
+	// if (r > t) {s -= 2; r -= t;}
+	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
+	t  = (v >> (s - 1)) & 0x1;
+	// if (r > t) s--;
+	s -= ((t - r) & 256) >> 8;
+	// s = 65 - s;
+	return s;
+}
+
+unsigned rand_bit(unsigned rnum, uint64_t mask) {
+	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
+#if defined(BRANCHLESS)
+	uint64_t v = mask;   // Input value to find position with rank r.
+	unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
+	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
+	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
+	unsigned int t;      // Bit count temporary.
+
+	// Do a normal parallel bit count for a 64-bit integer,
+	// but store all intermediate steps.
+	// a = (v & 0x5555...) + ((v >> 1) & 0x5555...);
+	a =  v - ((v >> 1) & ~0UL/3);
+	// b = (a & 0x3333...) + ((a >> 2) & 0x3333...);
+	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
+	// c = (b & 0x0f0f...) + ((b >> 4) & 0x0f0f...);
+	c = (b + (b >> 4)) & ~0UL/0x11;
+	// d = (c & 0x00ff...) + ((c >> 8) & 0x00ff...);
+	d = (c + (c >> 8)) & ~0UL/0x101;
+
+
+	t = (d >> 32) + (d >> 48);
+	// Now do branchless select!
+	s  = 64;
+	// if (r > t) {s -= 32; r -= t;}
+	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
+	t  = (d >> (s - 16)) & 0xff;
+	// if (r > t) {s -= 16; r -= t;}
+	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
+	t  = (c >> (s - 8)) & 0xf;
+	// if (r > t) {s -= 8; r -= t;}
+	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
+	t  = (b >> (s - 4)) & 0x7;
+	// if (r > t) {s -= 4; r -= t;}
+	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
+	t  = (a >> (s - 2)) & 0x3;
+	// if (r > t) {s -= 2; r -= t;}
+	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
+	t  = (v >> (s - 1)) & 0x1;
+	// if (r > t) s--;
+	s -= ((t - r) & 256) >> 8;
+	// s = 65 - s;
+	return s - 1;
+#elif defined(LOOP)
+	for(unsigned i = 0; i < bit; i++) {
+		mask ^= (1ul << (__builtin_ffsl(mask) - 1ul));
+	}
+	return __builtin_ffsl(mask) - 1ul;
+#elif defined(PDEP)
+	uint64_t picked = _pdep_u64(1ul << bit, mask);
+	return __builtin_ffsl(picked) - 1ul;
+#else
+#error must define LOOP, PDEP or BRANCHLESS
+#endif
+}
+
+#include <cassert>
+#include <atomic>
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <locale>
+#include <thread>
+
+#include <unistd.h>
+
+class barrier_t {
+public:
+	barrier_t(size_t total)
+		: waiting(0)
+		, total(total)
+	{}
+
+	void wait(unsigned) {
+		size_t target = waiting++;
+		target = (target - (target % total)) + total;
+		while(waiting < target)
+			asm volatile("pause");
+
+		assert(waiting < (1ul << 60));
+    	}
+
+private:
+	std::atomic<size_t> waiting;
+	size_t total;
+};
+
+class Random {
+private:
+	unsigned int seed;
+public:
+	Random(int seed) {
+		this->seed = seed;
+	}
+
+	/** returns pseudorandom x satisfying 0 <= x < n. **/
+	unsigned int next() {
+		seed ^= seed << 6;
+		seed ^= seed >> 21;
+		seed ^= seed << 7;
+		return seed;
+    	}
+};
+
+using Clock = std::chrono::high_resolution_clock;
+using duration_t = std::chrono::duration<double>;
+using std::chrono::nanoseconds;
+
+template<typename Ratio, typename T>
+T duration_cast(T seconds) {
+	return std::chrono::duration_cast<std::chrono::duration<T, Ratio>>(std::chrono::duration<T>(seconds)).count();
+}
+
+void waitfor(double & duration, barrier_t & barrier, std::atomic_bool & done) {
+
+
+	std::cout << "Starting" << std::endl;
+	auto before = Clock::now();
+	barrier.wait(0);
+
+	while(true) {
+		usleep(100000);
+		auto now = Clock::now();
+		duration_t durr = now - before;
+		if( durr.count() > duration ) {
+			done = true;
+			break;
+		}
+		std::cout << "\r" << std::setprecision(4) << durr.count();
+		std::cout.flush();
+	}
+
+	barrier.wait(0);
+	auto after = Clock::now();
+	duration_t durr = after - before;
+	duration = durr.count();
+	std::cout << "\rClosing down" << std::endl;
+}
+
+__attribute__((noinline)) void body(Random & rand) {
+	uint64_t mask = (uint64_t(rand.next()) << 32ul) | uint64_t(rand.next());
+	unsigned idx = rand.next();
+
+	unsigned bit = rand_bit(idx, mask);
+
+	if(__builtin_expect(((1ul << bit) & mask) == 0, false)) {
+		std::cerr << std::hex <<  "Rand " << idx << " from " << mask;
+		std::cerr << " gave " << (1ul << bit) << "(" << std::dec << bit << ")" << std::endl;
+		std::abort();
+	}
+}
+
+void runRandBit(double duration) {
+
+	std::atomic_bool done  = { false };
+	barrier_t barrier(2);
+
+	size_t count = 0;
+	std::thread thread([&done, &barrier, &count]() {
+
+		Random rand(22);
+
+		barrier.wait(1);
+
+		for(;!done; count++) {
+			body(rand);
+		}
+
+		barrier.wait(1);
+	});
+
+	waitfor(duration, barrier, done);
+	thread.join();
+
+	size_t ops = count;
+	size_t ops_sec = size_t(double(ops) / duration);
+	auto dur_nano = duration_cast<std::nano>(1.0);
+
+	std::cout << "Duration      : " << duration << "s\n";
+	std::cout << "ns/Op         : " << ( dur_nano / ops )<< "\n";
+	std::cout << "Ops/sec       : " << ops_sec << "\n";
+	std::cout << "Total ops     : " << ops << std::endl;
+
+}
+
+int main() {
+	std::cout.imbue(std::locale(""));
+	runRandBit(5);
+}
Index: doc/theses/thierry_delisle_PhD/code/relaxed_list.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/relaxed_list.cpp	(revision 92a97685f5aabd9745b94f924132f4ef5f9d6868)
+++ doc/theses/thierry_delisle_PhD/code/relaxed_list.cpp	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
@@ -9,4 +9,5 @@
 #include <vector>
 
+#include <getopt.h>
 #include <unistd.h>
 #include <sys/sysinfo.h>
@@ -21,11 +22,9 @@
 
 	int value;
-	Node(int value): value(value) {
-		creates++;
-	}
-
-	~Node() {
-		destroys++;
-	}
+	int id;
+
+	Node() { creates++; }
+	Node(int value): value(value) { creates++; }
+	~Node() { destroys++; }
 };
 
@@ -33,11 +32,20 @@
 std::atomic_size_t Node::destroys = { 0 };
 
-static const constexpr int nodes_per_threads = 128;
-struct NodeArray {
-	__attribute__((aligned(64))) Node * array[nodes_per_threads];
-	__attribute__((aligned(64))) char pad;
-};
-
 bool enable_stats = false;
+
+template<>
+thread_local relaxed_list<Node>::TLS relaxed_list<Node>::tls = {};
+
+template<>
+relaxed_list<Node> * relaxed_list<Node>::head = nullptr;
+
+#ifndef NO_STATS
+template<>
+relaxed_list<Node>::GlobalStats relaxed_list<Node>::global_stats = {};
+#endif
+
+// ================================================================================================
+//                        UTILS
+// ================================================================================================
 
 struct local_stat_t {
@@ -47,132 +55,52 @@
 	size_t crc_in  = 0;
 	size_t crc_out = 0;
+	size_t valmax = 0;
+	size_t valmin = 100000000ul;
 };
 
-__attribute__((noinline)) void run_body(
-	std::atomic<bool>& done,
-	Random & rand,
-	Node * (&my_nodes)[128],
-	local_stat_t & local,
-	relaxed_list<Node> & list
-) {
-	while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) {
-		int idx = rand.next() % nodes_per_threads;
-		if (auto node = my_nodes[idx]) {
-			local.crc_in += node->value;
-			list.push(node);
-			my_nodes[idx] = nullptr;
-			local.in++;
-		}
-		else if(auto node = list.pop()) {
-			local.crc_out += node->value;
-			my_nodes[idx] = node;
-			local.out++;
-		}
-		else {
-			local.empty++;
-		}
-	}
-}
-
-void run(unsigned nthread, unsigned nqueues, unsigned fill, double duration) {
-	// List being tested
-	relaxed_list<Node> list = { nthread * nqueues };
-
-	// Barrier for synchronization
-	barrier_t barrier(nthread + 1);
-
-	// Data to check everything is OK
-	struct {
-		std::atomic_size_t in  = { 0 };
-		std::atomic_size_t out = { 0 };
-		std::atomic_size_t empty = { 0 };
-		std::atomic_size_t crc_in  = { 0 };
-		std::atomic_size_t crc_out = { 0 };
-		struct {
-			struct {
-				std::atomic_size_t attempt = { 0 };
-				std::atomic_size_t success = { 0 };
-			} push;
-			struct {
-				std::atomic_size_t attempt = { 0 };
-				std::atomic_size_t success = { 0 };
-			} pop;
-		} pick;
-	} global;
-
-	// Flag to signal termination
-	std::atomic_bool done  = { false };
-
-	// Prep nodes
-	std::cout << "Initializing ";
-	size_t nnodes  = 0;
-	size_t npushed = 0;
-	NodeArray all_nodes[nthread];
-	for(auto & nodes : all_nodes) {
-		Random rand(rdtscl());
-		for(auto & node : nodes.array) {
-			auto r = rand.next() % 100;
-			if(r < fill) {
-				node = new Node(rand.next() % 100);
-				nnodes++;
-			} else {
-				node = nullptr;
-			}
-		}
-
-		for(int i = 0; i < 10; i++) {
-			int idx = rand.next() % nodes_per_threads;
-			if (auto node = nodes.array[idx]) {
-				global.crc_in += node->value;
-				list.push(node);
-				npushed++;
-				nodes.array[idx] = nullptr;
-			}
-		}
-	}
-
-	std::cout << nnodes << " nodes " << fill << "% (" << npushed << " pushed)" << std::endl;
-
-	enable_stats = true;
-
-	std::thread * threads[nthread];
-	unsigned i = 1;
-	for(auto & t : threads) {
-		auto & my_nodes = all_nodes[i - 1].array;
-		t = new std::thread([&done, &list, &barrier, &global, &my_nodes](unsigned tid) {
-			Random rand(tid + rdtscl());
-
-			local_stat_t local;
-
-			// affinity(tid);
-
-			barrier.wait(tid);
-
-			// EXPERIMENT START
-
-			run_body(done, rand, my_nodes, local, list);
-
-			// EXPERIMENT END
-
-			barrier.wait(tid);
-
-			global.in    += local.in;
-			global.out   += local.out;
-			global.empty += local.empty;
-
-			for(auto node : my_nodes) {
-				delete node;
-			}
-
-			global.crc_in  += local.crc_in;
-			global.crc_out += local.crc_out;
-
-			global.pick.push.attempt += relaxed_list<Node>::tls.pick.push.attempt;
-			global.pick.push.success += relaxed_list<Node>::tls.pick.push.success;
-			global.pick.pop .attempt += relaxed_list<Node>::tls.pick.pop.attempt;
-			global.pick.pop .success += relaxed_list<Node>::tls.pick.pop.success;
-		}, i++);
-	}
-
+struct global_stat_t {
+	std::atomic_size_t in  = { 0 };
+	std::atomic_size_t out = { 0 };
+	std::atomic_size_t empty = { 0 };
+	std::atomic_size_t crc_in  = { 0 };
+	std::atomic_size_t crc_out = { 0 };
+	std::atomic_size_t valmax = { 0 };
+	std::atomic_size_t valmin = { 100000000ul };
+};
+
+void atomic_max(std::atomic_size_t & target, size_t value) {
+	for(;;) {
+		size_t expect = target.load(std::memory_order_relaxed);
+		if(value <= expect) return;
+		bool success = target.compare_exchange_strong(expect, value);
+		if(success) return;
+	}
+}
+
+void atomic_min(std::atomic_size_t & target, size_t value) {
+	for(;;) {
+		size_t expect = target.load(std::memory_order_relaxed);
+		if(value >= expect) return;
+		bool success = target.compare_exchange_strong(expect, value);
+		if(success) return;
+	}
+}
+
+void tally_stats(global_stat_t & global, local_stat_t & local) {
+
+	global.in    += local.in;
+	global.out   += local.out;
+	global.empty += local.empty;
+
+	global.crc_in  += local.crc_in;
+	global.crc_out += local.crc_out;
+
+	atomic_max(global.valmax, local.valmax);
+	atomic_min(global.valmin, local.valmin);
+
+	relaxed_list<Node>::stats_tls_tally();
+}
+
+void waitfor(double & duration, barrier_t & barrier, std::atomic_bool & done) {
 	std::cout << "Starting" << std::endl;
 	auto before = Clock::now();
@@ -196,17 +124,29 @@
 	duration = durr.count();
 	std::cout << "\rClosing down" << std::endl;
-
-	for(auto t : threads) {
-		t->join();
-		delete t;
-	}
-
-	enable_stats = false;
-
-	while(auto node = list.pop()) {
-		global.crc_out += node->value;
-		delete node;
-	}
-
+}
+
+void waitfor(double & duration, barrier_t & barrier, const std::atomic_size_t & count) {
+	std::cout << "Starting" << std::endl;
+	auto before = Clock::now();
+	barrier.wait(0);
+
+	while(true) {
+		usleep(100000);
+		size_t c = count.load();
+		if( c == 0 ) {
+			break;
+		}
+		std::cout << "\r" << c;
+		std::cout.flush();
+	}
+
+	barrier.wait(0);
+	auto after = Clock::now();
+	duration_t durr = after - before;
+	duration = durr.count();
+	std::cout << "\rClosing down" << std::endl;
+}
+
+void print_stats(double duration, unsigned nthread, global_stat_t & global) {
 	assert(Node::creates == Node::destroys);
 	assert(global.crc_in == global.crc_out);
@@ -224,15 +164,373 @@
 	std::cout << "Ops/sec       : " << ops_sec << "\n";
 	std::cout << "Total ops     : " << ops << "(" << global.in << "i, " << global.out << "o, " << global.empty << "e)\n";
+	if(global.valmax != 0) {
+		std::cout << "Max runs      : " << global.valmax << "\n";
+		std::cout << "Min runs      : " << global.valmin << "\n";
+	}
 	#ifndef NO_STATS
-		double push_sur = (100.0 * double(global.pick.push.success) / global.pick.push.attempt);
-		double pop_sur  = (100.0 * double(global.pick.pop .success) / global.pick.pop .attempt);
-		std::cout << "Push Pick %   : " << push_sur << "(" << global.pick.push.success << " / " << global.pick.push.attempt << ")\n";
-		std::cout << "Pop  Pick %   : " << pop_sur  << "(" << global.pick.pop .success << " / " << global.pick.pop .attempt << ")\n";
+		relaxed_list<Node>::stats_print(std::cout);
 	#endif
 }
 
-void usage(char * argv[]) {
-	std::cerr << argv[0] << ": [DURATION (FLOAT:SEC)] [NTHREADS] [NQUEUES] [FILL]" << std::endl;;
-	std::exit(1);
+void save_fairness(const int data[], int factor, unsigned nthreads, size_t columns, size_t rows, const std::string & output);
+
+// ================================================================================================
+//                        EXPERIMENTS
+// ================================================================================================
+
+// ================================================================================================
+__attribute__((noinline)) void runChurn_body(
+	std::atomic<bool>& done,
+	Random & rand,
+	Node * my_nodes[],
+	unsigned nslots,
+	local_stat_t & local,
+	relaxed_list<Node> & list
+) {
+	while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) {
+		int idx = rand.next() % nslots;
+		if (auto node = my_nodes[idx]) {
+			local.crc_in += node->value;
+			list.push(node);
+			my_nodes[idx] = nullptr;
+			local.in++;
+		}
+		else if(auto node = list.pop()) {
+			local.crc_out += node->value;
+			my_nodes[idx] = node;
+			local.out++;
+		}
+		else {
+			local.empty++;
+		}
+	}
+}
+
+void runChurn(unsigned nthread, unsigned nqueues, double duration, unsigned nnodes, const unsigned nslots) {
+	std::cout << "Churn Benchmark" << std::endl;
+	assert(nnodes <= nslots);
+	// List being tested
+
+	// Barrier for synchronization
+	barrier_t barrier(nthread + 1);
+
+	// Data to check everything is OK
+	global_stat_t global;
+
+	// Flag to signal termination
+	std::atomic_bool done  = { false };
+
+	// Prep nodes
+	std::cout << "Initializing ";
+	size_t npushed = 0;
+	relaxed_list<Node> list = { nthread * nqueues };
+	{
+		Node** all_nodes[nthread];
+		for(auto & nodes : all_nodes) {
+			nodes = new __attribute__((aligned(64))) Node*[nslots + 8];
+			Random rand(rdtscl());
+			for(unsigned i = 0; i < nnodes; i++) {
+				nodes[i] = new Node(rand.next() % 100);
+			}
+
+			for(unsigned i = nnodes; i < nslots; i++) {
+				nodes[i] = nullptr;
+			}
+
+			for(int i = 0; i < 10 && i < (int)nslots; i++) {
+				int idx = rand.next() % nslots;
+				if (auto node = nodes[idx]) {
+					global.crc_in += node->value;
+					list.push(node);
+					npushed++;
+					nodes[idx] = nullptr;
+				}
+			}
+		}
+
+		std::cout << nnodes << " nodes (" << nslots << " slots)" << std::endl;
+
+		enable_stats = true;
+
+		std::thread * threads[nthread];
+		unsigned i = 1;
+		for(auto & t : threads) {
+			auto & my_nodes = all_nodes[i - 1];
+			t = new std::thread([&done, &list, &barrier, &global, &my_nodes, nslots](unsigned tid) {
+				Random rand(tid + rdtscl());
+
+				local_stat_t local;
+
+				// affinity(tid);
+
+				barrier.wait(tid);
+
+				// EXPERIMENT START
+
+				runChurn_body(done, rand, my_nodes, nslots, local, list);
+
+				// EXPERIMENT END
+
+				barrier.wait(tid);
+
+				tally_stats(global, local);
+
+				for(unsigned i = 0; i < nslots; i++) {
+					delete my_nodes[i];
+				}
+			}, i++);
+		}
+
+		waitfor(duration, barrier, done);
+
+		for(auto t : threads) {
+			t->join();
+			delete t;
+		}
+
+		enable_stats = false;
+
+		while(auto node = list.pop()) {
+			global.crc_out += node->value;
+			delete node;
+		}
+
+		for(auto nodes : all_nodes) {
+			delete[] nodes;
+		}
+	}
+
+	print_stats(duration, nthread, global);
+}
+
+// ================================================================================================
+__attribute__((noinline)) void runPingPong_body(
+	std::atomic<bool>& done,
+	Node initial_nodes[],
+	unsigned nnodes,
+	local_stat_t & local,
+	relaxed_list<Node> & list
+) {
+	Node * nodes[nnodes];
+	{
+		unsigned i = 0;
+		for(auto & n : nodes) {
+			n = &initial_nodes[i++];
+		}
+	}
+
+	while(__builtin_expect(!done.load(std::memory_order_relaxed), true)) {
+
+		for(Node * & node : nodes) {
+			local.crc_in += node->value;
+			list.push(node);
+			local.in++;
+		}
+
+		// -----
+
+		for(Node * & node : nodes) {
+			node = list.pop();
+			assert(node);
+			local.crc_out += node->value;
+			local.out++;
+		}
+	}
+}
+
+void runPingPong(unsigned nthread, unsigned nqueues, double duration, unsigned nnodes) {
+	std::cout << "PingPong Benchmark" << std::endl;
+
+
+	// Barrier for synchronization
+	barrier_t barrier(nthread + 1);
+
+	// Data to check everything is OK
+	global_stat_t global;
+
+	// Flag to signal termination
+	std::atomic_bool done  = { false };
+
+	std::cout << "Initializing ";
+	// List being tested
+	relaxed_list<Node> list = { nthread * nqueues };
+	{
+		enable_stats = true;
+
+		std::thread * threads[nthread];
+		unsigned i = 1;
+		for(auto & t : threads) {
+			t = new std::thread([&done, &list, &barrier, &global, nnodes](unsigned tid) {
+				Random rand(tid + rdtscl());
+
+				Node nodes[nnodes];
+				for(auto & n : nodes) {
+					n.value = (int)rand.next() % 100;
+				}
+
+				local_stat_t local;
+
+				// affinity(tid);
+
+				barrier.wait(tid);
+
+				// EXPERIMENT START
+
+				runPingPong_body(done, nodes, nnodes, local, list);
+
+				// EXPERIMENT END
+
+				barrier.wait(tid);
+
+				tally_stats(global, local);
+			}, i++);
+		}
+
+		waitfor(duration, barrier, done);
+
+		for(auto t : threads) {
+			t->join();
+			delete t;
+		}
+
+		enable_stats = false;
+	}
+
+	print_stats(duration, nthread, global);
+}
+
+// ================================================================================================
+__attribute__((noinline)) void runFairness_body(
+	unsigned tid,
+	size_t width,
+	size_t length,
+	int output[],
+	std::atomic_size_t & count,
+	Node initial_nodes[],
+	unsigned nnodes,
+	local_stat_t & local,
+	relaxed_list<Node> & list
+) {
+	Node * nodes[nnodes];
+	{
+		unsigned i = 0;
+		for(auto & n : nodes) {
+			n = &initial_nodes[i++];
+		}
+	}
+
+	while(__builtin_expect(0 != count.load(std::memory_order_relaxed), true)) {
+
+		for(Node * & node : nodes) {
+			local.crc_in += node->id;
+			list.push(node);
+			local.in++;
+		}
+
+		// -----
+
+		for(Node * & node : nodes) {
+			node = list.pop();
+			assert(node);
+
+			if (unsigned(node->value) < length) {
+				size_t idx = (node->value * width) + node->id;
+				assert(idx < (width * length));
+				output[idx] = tid;
+			}
+
+			node->value++;
+			if(unsigned(node->value) == length) count--;
+
+			local.crc_out += node->id;
+			local.out++;
+		}
+	}
+}
+
+void runFairness(unsigned nthread, unsigned nqueues, double duration, unsigned nnodes, const std::string & output) {
+	std::cout << "Fairness Benchmark, outputting to : " << output << std::endl;
+
+	// Barrier for synchronization
+	barrier_t barrier(nthread + 1);
+
+	// Data to check everything is OK
+	global_stat_t global;
+
+	std::cout << "Initializing ";
+
+	// Check fairness by creating a png of where the threads ran
+	size_t width = nthread * nnodes;
+	size_t length = 100000;
+
+	std::unique_ptr<int[]> data_out { new int[width * length] };
+
+	// Flag to signal termination
+	std::atomic_size_t count = width;
+
+	// List being tested
+	relaxed_list<Node> list = { nthread * nqueues };
+	{
+		enable_stats = true;
+
+		std::thread * threads[nthread];
+		unsigned i = 1;
+		for(auto & t : threads) {
+			t = new std::thread([&count, &list, &barrier, &global, nnodes, width, length, data_out = data_out.get()](unsigned tid) {
+				unsigned int start = (tid - 1) * nnodes;
+				Node nodes[nnodes];
+				for(auto & n : nodes) {
+					n.id = start;
+					n.value = 0;
+					start++;
+				}
+
+				local_stat_t local;
+
+				// affinity(tid);
+
+				barrier.wait(tid);
+
+				// EXPERIMENT START
+
+				runFairness_body(tid, width, length, data_out, count, nodes, nnodes, local, list);
+
+				// EXPERIMENT END
+
+				barrier.wait(tid);
+
+				for(const auto & n : nodes) {
+					local.valmax = max(local.valmax, size_t(n.value));
+					local.valmin = min(local.valmin, size_t(n.value));
+				}
+
+				tally_stats(global, local);
+			}, i++);
+		}
+
+		waitfor(duration, barrier, count);
+
+		for(auto t : threads) {
+			t->join();
+			delete t;
+		}
+
+		enable_stats = false;
+	}
+
+	print_stats(duration, nthread, global);
+
+	save_fairness(data_out.get(), 100, nthread, width, length, output);
+}
+
+// ================================================================================================
+
+bool iequals(const std::string& a, const std::string& b)
+{
+    return std::equal(a.begin(), a.end(),
+                      b.begin(), b.end(),
+                      [](char a, char b) {
+                          return std::tolower(a) == std::tolower(b);
+                      });
 }
 
@@ -241,47 +539,385 @@
 	double duration   = 5.0;
 	unsigned nthreads = 2;
-	unsigned nqueues  = 2;
-	unsigned fill     = 100;
+	unsigned nqueues  = 4;
+	unsigned nnodes   = 100;
+	unsigned nslots   = 100;
+	std::string out   = "fairness.png";
+
+	enum {
+		Churn,
+		PingPong,
+		Fairness,
+		NONE
+	} benchmark = NONE;
 
 	std::cout.imbue(std::locale(""));
 
-	switch (argc)
-	{
+	for(;;) {
+		static struct option options[] = {
+			{"duration",  required_argument, 0, 'd'},
+			{"nthreads",  required_argument, 0, 't'},
+			{"nqueues",   required_argument, 0, 'q'},
+			{"benchmark", required_argument, 0, 'b'},
+			{0, 0, 0, 0}
+		};
+
+		int idx = 0;
+		int opt = getopt_long(argc, argv, "d:t:q:b:", options, &idx);
+
+		std::string arg = optarg ? optarg : "";
+		size_t len = 0;
+		switch(opt) {
+			// Exit Case
+			case -1:
+				/* paranoid */ assert(optind <= argc);
+				switch(benchmark) {
+				case NONE:
+					std::cerr << "Must specify a benchmark" << std::endl;
+					goto usage;
+				case PingPong:
+					nnodes = 1;
+					nslots = 1;
+					switch(argc - optind) {
+					case 0: break;
+					case 1:
+						try {
+							arg = optarg = argv[optind];
+							nnodes = stoul(optarg, &len);
+							if(len != arg.size()) { throw std::invalid_argument(""); }
+						} catch(std::invalid_argument &) {
+							std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl;
+							goto usage;
+						}
+						break;
+					default:
+						std::cerr << "'PingPong' benchmark doesn't accept more than 1 extra argument" << std::endl;
+						goto usage;
+					}
+					break;
+				case Churn:
+					nnodes = 100;
+					nslots = 100;
+					switch(argc - optind) {
+					case 0: break;
+					case 1:
+						try {
+							arg = optarg = argv[optind];
+							nnodes = stoul(optarg, &len);
+							if(len != arg.size()) { throw std::invalid_argument(""); }
+							nslots = nnodes;
+						} catch(std::invalid_argument &) {
+							std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl;
+							goto usage;
+						}
+						break;
+					case 2:
+						try {
+							arg = optarg = argv[optind];
+							nnodes = stoul(optarg, &len);
+							if(len != arg.size()) { throw std::invalid_argument(""); }
+						} catch(std::invalid_argument &) {
+							std::cerr << "Number of nodes must be a positive integer, was " << arg << std::endl;
+							goto usage;
+						}
+						try {
+							arg = optarg = argv[optind + 1];
+							nslots = stoul(optarg, &len);
+							if(len != arg.size()) { throw std::invalid_argument(""); }
+						} catch(std::invalid_argument &) {
+							std::cerr << "Number of slots must be a positive integer, was " << arg << std::endl;
+							goto usage;
+						}
+						break;
+					default:
+						std::cerr << "'Churn' benchmark doesn't accept more than 2 extra arguments" << std::endl;
+						goto usage;
+					}
+					break;
+				case Fairness:
+					nnodes = 1;
+					switch(argc - optind) {
+					case 0: break;
+					case 1:
+						arg = optarg = argv[optind];
+						out = arg;
+						break;
+					default:
+						std::cerr << "'Fairness' benchmark doesn't accept more than 1 extra argument" << std::endl;
+						goto usage;
+					}
+				}
+				goto run;
+			// Benchmarks
+			case 'b':
+				if(benchmark != NONE) {
+					std::cerr << "Only one benchmark can be run" << std::endl;
+					goto usage;
+				}
+				if(iequals(arg, "churn")) {
+					benchmark = Churn;
+					break;
+				}
+				if(iequals(arg, "pingpong")) {
+					benchmark = PingPong;
+					break;
+				}
+				if(iequals(arg, "fairness")) {
+					benchmark = Fairness;
+					break;
+				}
+				std::cerr << "Unknown benchmark " << arg << std::endl;
+				goto usage;
+			// Numeric Arguments
+			case 'd':
+				try {
+					duration = stod(optarg, &len);
+					if(len != arg.size()) { throw std::invalid_argument(""); }
+				} catch(std::invalid_argument &) {
+					std::cerr << "Duration must be a valid double, was " << arg << std::endl;
+					goto usage;
+				}
+				break;
+			case 't':
+				try {
+					nthreads = stoul(optarg, &len);
+					if(len != arg.size()) { throw std::invalid_argument(""); }
+				} catch(std::invalid_argument &) {
+					std::cerr << "Number of threads must be a positive integer, was " << arg << std::endl;
+					goto usage;
+				}
+				break;
+			case 'q':
+				try {
+					nqueues = stoul(optarg, &len);
+					if(len != arg.size()) { throw std::invalid_argument(""); }
+				} catch(std::invalid_argument &) {
+					std::cerr << "Number of queues must be a positive integer, was " << arg << std::endl;
+					goto usage;
+				}
+				break;
+			// Other cases
+			default: /* ? */
+				std::cerr << opt << std::endl;
+			usage:
+				std::cerr << "Usage: " << argv[0] << ": [options] -b churn [NNODES] [NSLOTS = NNODES]" << std::endl;
+				std::cerr << "  or:  " << argv[0] << ": [options] -b pingpong [NNODES]" << std::endl;
+				std::cerr << std::endl;
+				std::cerr << "  -d, --duration=DURATION  Duration of the experiment, in seconds" << std::endl;
+				std::cerr << "  -t, --nthreads=NTHREADS  Number of kernel threads" << std::endl;
+				std::cerr << "  -q, --nqueues=NQUEUES    Number of queues per thread" << std::endl;
+				std::exit(1);
+		}
+	}
+	run:
+
+	check_cache_line_size();
+
+	std::cout << "Running " << nthreads << " threads (" << (nthreads * nqueues) << " queues) for " << duration << " seconds" << std::endl;
+	switch(benchmark) {
+		case Churn:
+			runChurn(nthreads, nqueues, duration, nnodes, nslots);
+			break;
+		case PingPong:
+			runPingPong(nthreads, nqueues, duration, nnodes);
+			break;
+		case Fairness:
+			runFairness(nthreads, nqueues, duration, nnodes, out);
+			break;
+		default:
+			abort();
+	}
+	return 0;
+}
+
+const char * __my_progname = "Relaxed List";
+
+struct rgb_t {
+    double r;       // a fraction between 0 and 1
+    double g;       // a fraction between 0 and 1
+    double b;       // a fraction between 0 and 1
+};
+
+struct hsv_t {
+    double h;       // angle in degrees
+    double s;       // a fraction between 0 and 1
+    double v;       // a fraction between 0 and 1
+};
+
+rgb_t hsv2rgb(hsv_t in) {
+	double hh, p, q, t, ff;
+	long   i;
+	rgb_t  out;
+
+	if(in.s <= 0.0) {       // < is bogus, just shuts up warnings
+		out.r = in.v;
+		out.g = in.v;
+		out.b = in.v;
+		return out;
+	}
+	hh = in.h;
+	if(hh >= 360.0) hh = 0.0;
+	hh /= 60.0;
+	i = (long)hh;
+	ff = hh - i;
+	p = in.v * (1.0 - in.s);
+	q = in.v * (1.0 - (in.s * ff));
+	t = in.v * (1.0 - (in.s * (1.0 - ff)));
+
+	switch(i) {
+	case 0:
+		out.r = in.v;
+		out.g = t;
+		out.b = p;
+		break;
+	case 1:
+		out.r = q;
+		out.g = in.v;
+		out.b = p;
+		break;
+	case 2:
+		out.r = p;
+		out.g = in.v;
+		out.b = t;
+		break;
+
+	case 3:
+		out.r = p;
+		out.g = q;
+		out.b = in.v;
+		break;
+	case 4:
+		out.r = t;
+		out.g = p;
+		out.b = in.v;
+		break;
 	case 5:
-		fill = std::stoul(argv[4]);
-		[[fallthrough]];
-	case 4:
-		nqueues = std::stoul(argv[3]);
-		[[fallthrough]];
-	case 3:
-		nthreads = std::stoul(argv[2]);
-		[[fallthrough]];
-	case 2:
-		duration = std::stod(argv[1]);
-		if( duration <= 0.0 ) {
-			std::cerr << "Duration must be positive, was " << argv[1] << "(" << duration << ")" << std::endl;
-			usage(argv);
-		}
-		[[fallthrough]];
-	case 1:
+	default:
+		out.r = in.v;
+		out.g = p;
+		out.b = q;
 		break;
-	default:
-		usage(argv);
-		break;
-	}
-
-	check_cache_line_size();
-
-	std::cout << "Running " << nthreads << " threads (" << (nthreads * nqueues) << " queues) for " << duration << " seconds" << std::endl;
-	run(nthreads, nqueues, fill, duration);
-
-	return 0;
-}
-
-template<>
-thread_local relaxed_list<Node>::TLS relaxed_list<Node>::tls = {};
-
-template<>
-relaxed_list<Node>::intrusive_queue_t::stat::Dif relaxed_list<Node>::intrusive_queue_t::stat::dif = {};
-
-const char * __my_progname = "Relaxed List";
+	}
+	return out;
+}
+
+void save_fairness(const int data[], int factor, unsigned nthreads, size_t columns, size_t rows, const std::string & output) {
+	std::ofstream os(output);
+	os << "<html>\n";
+	os << "<head>\n";
+	os << "<style>\n";
+	os << "</style>\n";
+	os << "</head>\n";
+	os << "<body>\n";
+	os << "<table style=\"width=100%\">\n";
+
+	size_t idx = 0;
+	for(size_t r = 0ul; r < rows; r++) {
+		os << "<tr>\n";
+		for(size_t c = 0ul; c < columns; c++) {
+			os << "<td class=\"custom custom" << data[idx] << "\"></td>\n";
+			idx++;
+		}
+		os << "</tr>\n";
+	}
+
+	os << "</table>\n";
+	os << "</body>\n";
+	os << "</html>\n";
+	os << std::endl;
+}
+
+#include <png.h>
+#include <setjmp.h>
+
+/*
+void save_fairness(const int data[], int factor, unsigned nthreads, size_t columns, size_t rows, const std::string & output) {
+	int width  = columns * factor;
+	int height = rows / factor;
+
+	int code = 0;
+	int idx = 0;
+	FILE *fp = NULL;
+	png_structp png_ptr = NULL;
+	png_infop info_ptr = NULL;
+	png_bytep row = NULL;
+
+	// Open file for writing (binary mode)
+	fp = fopen(output.c_str(), "wb");
+	if (fp == NULL) {
+		fprintf(stderr, "Could not open file %s for writing\n", output.c_str());
+		code = 1;
+		goto finalise;
+	}
+
+	   // Initialize write structure
+	png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
+	if (png_ptr == NULL) {
+		fprintf(stderr, "Could not allocate write struct\n");
+		code = 1;
+		goto finalise;
+	}
+
+	// Initialize info structure
+	info_ptr = png_create_info_struct(png_ptr);
+	if (info_ptr == NULL) {
+		fprintf(stderr, "Could not allocate info struct\n");
+		code = 1;
+		goto finalise;
+	}
+
+	// Setup Exception handling
+	if (setjmp(png_jmpbuf(png_ptr))) {
+		fprintf(stderr, "Error during png creation\n");
+		code = 1;
+		goto finalise;
+	}
+
+	png_init_io(png_ptr, fp);
+
+	// Write header (8 bit colour depth)
+	png_set_IHDR(png_ptr, info_ptr, width, height,
+		8, PNG_COLOR_TYPE_RGB, PNG_INTERLACE_NONE,
+		PNG_COMPRESSION_TYPE_BASE, PNG_FILTER_TYPE_BASE);
+
+	png_write_info(png_ptr, info_ptr);
+
+	// Allocate memory for one row (3 bytes per pixel - RGB)
+	row = (png_bytep) malloc(3 * width * sizeof(png_byte));
+
+	// Write image data
+	int x, y;
+	for (y=0 ; y<height ; y++) {
+		for (x=0 ; x<width ; x++) {
+			auto & r = row[(x * 3) + 0];
+			auto & g = row[(x * 3) + 1];
+			auto & b = row[(x * 3) + 2];
+			assert(idx < (rows * columns));
+			int color = data[idx] - 1;
+			assert(color < nthreads);
+			assert(color >= 0);
+			idx++;
+
+			double angle = double(color) / double(nthreads);
+
+			auto c = hsv2rgb({ 360.0 * angle, 0.8, 0.8 });
+
+			r = char(c.r * 255.0);
+			g = char(c.g * 255.0);
+			b = char(c.b * 255.0);
+
+		}
+		png_write_row(png_ptr, row);
+	}
+
+	assert(idx == (rows * columns));
+
+	// End write
+	png_write_end(png_ptr, NULL);
+
+	finalise:
+	if (fp != NULL) fclose(fp);
+	if (info_ptr != NULL) png_free_data(png_ptr, info_ptr, PNG_FREE_ALL, -1);
+	if (png_ptr != NULL) png_destroy_write_struct(&png_ptr, (png_infopp)NULL);
+	if (row != NULL) free(row);
+}
+*/
Index: doc/theses/thierry_delisle_PhD/code/relaxed_list.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/relaxed_list.hpp	(revision 92a97685f5aabd9745b94f924132f4ef5f9d6868)
+++ doc/theses/thierry_delisle_PhD/code/relaxed_list.hpp	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
@@ -37,4 +37,35 @@
 };
 
+static inline bool bts(std::atomic_size_t & target, size_t bit ) {
+	//*
+	int result = 0;
+	asm volatile(
+		"LOCK btsq %[bit], %[target]\n\t"
+		:"=@ccc" (result)
+		: [target] "m" (target), [bit] "r" (bit)
+	);
+ 	return result != 0;
+	/*/
+	size_t mask = 1ul << bit;
+	size_t ret = target.fetch_or(mask, std::memory_order_relaxed);
+	return (ret & mask) != 0;
+	//*/
+}
+
+static inline bool btr(std::atomic_size_t & target, size_t bit ) {
+	//*
+	int result = 0;
+	asm volatile(
+		"LOCK btrq %[bit], %[target]\n\t"
+		:"=@ccc" (result)
+		: [target] "m" (target), [bit] "r" (bit)
+	);
+ 	return result != 0;
+	/*/
+	size_t mask = 1ul << bit;
+	size_t ret = target.fetch_and(~mask, std::memory_order_relaxed);
+	return (ret & mask) != 0;
+	//*/
+}
 
 extern bool enable_stats;
@@ -48,4 +79,16 @@
 		size_t attempt = 0;
 		size_t success = 0;
+		size_t mask_attempt = 0;
+	} pop;
+};
+
+struct empty_stat {
+	struct {
+		size_t value = 0;
+		size_t count = 0;
+	} push;
+	struct {
+		size_t value = 0;
+		size_t count = 0;
 	} pop;
 };
@@ -62,19 +105,22 @@
 	static_assert(std::is_same<decltype(node_t::_links), _LinksFields_t<node_t>>::value, "Node must have a links field");
 
-
 public:
 	relaxed_list(unsigned numLists)
-	  	: numNonEmpty{0}
-		, lists(new intrusive_queue_t[numLists])
+	  	: lists(new intrusive_queue_t[numLists])
 		, numLists(numLists)
-	{}
+	{
+		assertf(7 * 8 * 8 >= numLists, "List currently only supports 448 sublists");
+		// assert(sizeof(*this) == 128);
+		std::cout << "Constructing Relaxed List with " << numLists << std::endl;
+
+		#ifndef NO_STATS
+			if(head) this->next = head;
+			head = this;
+		#endif
+	}
 
 	~relaxed_list() {
+		std::cout << "Destroying Relaxed List" << std::endl;
 		lists.reset();
-		#ifndef NO_STATS
-			std::cout << "Difference   : "
-				<< ssize_t(double(intrusive_queue_t::stat::dif.value) / intrusive_queue_t::stat::dif.num  ) << " avg\t"
-				<< intrusive_queue_t::stat::dif.max << "max" << std::endl;
-		#endif
 	}
 
@@ -84,5 +130,5 @@
 		while(true) {
 			// Pick a random list
-			int i = tls.rng.next() % numLists;
+			unsigned i = tls.rng.next() % numLists;
 
 			#ifndef NO_STATS
@@ -93,6 +139,16 @@
 			if( !lists[i].lock.try_lock() ) continue;
 
+			__attribute__((unused)) int num = numNonEmpty;
+
 			// Actually push it
-			lists[i].push(node, numNonEmpty);
+			if(lists[i].push(node)) {
+				numNonEmpty++;
+				size_t qword = i >> 6ull;
+				size_t bit   = i & 63ull;
+				assertf((list_mask[qword] & (1ul << bit)) == 0, "Before set %zu:%zu (%u), %zx & %zx", qword, bit, i, list_mask[qword].load(), (1ul << bit));
+				__attribute__((unused)) bool ret = bts(list_mask[qword], bit);
+				assert(!ret);
+				assertf((list_mask[qword] & (1ul << bit)) != 0, "After set %zu:%zu (%u), %zx & %zx", qword, bit, i, list_mask[qword].load(), (1ul << bit));
+			}
 			assert(numNonEmpty <= (int)numLists);
 
@@ -102,4 +158,6 @@
 			#ifndef NO_STATS
 				tls.pick.push.success++;
+				tls.empty.push.value += num;
+				tls.empty.push.count += 1;
 			#endif
 			return;
@@ -108,47 +166,121 @@
 
 	__attribute__((noinline, hot)) node_t * pop() {
-		while(numNonEmpty != 0) {
-			// Pick two lists at random
-			int i = tls.rng.next() % numLists;
-			int j = tls.rng.next() % numLists;
-
-			#ifndef NO_STATS
-				tls.pick.pop.attempt++;
-			#endif
-
-			// Pick the bet list
-			int w = i;
-			if( __builtin_expect(lists[j].ts() != 0, true) ) {
-				w = (lists[i].ts() < lists[j].ts()) ? i : j;
-			}
-
-			auto & list = lists[w];
-			// If list looks empty retry
-			if( list.ts() == 0 ) continue;
-
-			// If we can't get the lock retry
-			if( !list.lock.try_lock() ) continue;
-
-			// If list is empty, unlock and retry
-			if( list.ts() == 0 ) {
-				list.lock.unlock();
-				continue;
-			}
-
-			// Actually pop the list
-			auto node = list.pop(numNonEmpty);
-			assert(node);
-
-			// Unlock and return
-			list.lock.unlock();
-			assert(numNonEmpty >= 0);
-			#ifndef NO_STATS
-				tls.pick.pop.success++;
-			#endif
-			return node;
-		}
+		#if !defined(NO_BITMASK)
+			// for(int r = 0; r < 10 && numNonEmpty != 0; r++) {
+			// 	// Pick two lists at random
+			// 	unsigned i = tls.rng.next() % numLists;
+			// 	unsigned j = tls.rng.next() % numLists;
+
+			// 	if(auto node = try_pop(i, j)) return node;
+			// }
+			int nnempty;
+			while(0 != (nnempty = numNonEmpty)) {
+				tls.pick.pop.mask_attempt++;
+				unsigned i, j;
+				// if( numLists < 4 || (numLists / nnempty) < 4 ) {
+				// 	// Pick two lists at random
+				// 	i = tls.rng.next() % numLists;
+				// 	j = tls.rng.next() % numLists;
+				// } else
+				{
+					#ifndef NO_STATS
+						// tls.pick.push.mask_attempt++;
+					#endif
+
+					// Pick two lists at random
+					unsigned num = ((numLists - 1) >> 6) + 1;
+
+					unsigned ri = tls.rng.next();
+					unsigned rj = tls.rng.next();
+
+					unsigned wdxi = (ri >> 6u) % num;
+					unsigned wdxj = (rj >> 6u) % num;
+
+					size_t maski = list_mask[wdxi].load(std::memory_order_relaxed);
+					size_t maskj = list_mask[wdxj].load(std::memory_order_relaxed);
+
+					if(maski == 0 && maskj == 0) continue;
+
+					unsigned bi = rand_bit(ri, maski);
+					unsigned bj = rand_bit(rj, maskj);
+
+					assertf(bi < 64, "%zu %u", maski, bi);
+					assertf(bj < 64, "%zu %u", maskj, bj);
+
+					i = bi | (wdxi << 6);
+					j = bj | (wdxj << 6);
+
+					assertf(i < numLists, "%u", wdxi << 6);
+					assertf(j < numLists, "%u", wdxj << 6);
+				}
+
+				if(auto node = try_pop(i, j)) return node;
+			}
+		#else
+			while(numNonEmpty != 0) {
+				// Pick two lists at random
+				int i = tls.rng.next() % numLists;
+				int j = tls.rng.next() % numLists;
+
+				if(auto node = try_pop(i, j)) return node;
+			}
+		#endif
 
 		return nullptr;
     	}
+
+private:
+	node_t * try_pop(unsigned i, unsigned j) {
+		#ifndef NO_STATS
+			tls.pick.pop.attempt++;
+		#endif
+
+		// Pick the best list
+		int w = i;
+		if( __builtin_expect(lists[j].ts() != 0, true) ) {
+			w = (lists[i].ts() < lists[j].ts()) ? i : j;
+		}
+
+		auto & list = lists[w];
+		// If list looks empty retry
+		if( list.ts() == 0 ) return nullptr;
+
+		// If we can't get the lock retry
+		if( !list.lock.try_lock() ) return nullptr;
+
+		__attribute__((unused)) int num = numNonEmpty;
+
+		// If list is empty, unlock and retry
+		if( list.ts() == 0 ) {
+			list.lock.unlock();
+			return nullptr;
+		}
+
+		// Actually pop the list
+		node_t * node;
+		bool emptied;
+		std::tie(node, emptied) = list.pop();
+		assert(node);
+
+		if(emptied) {
+			numNonEmpty--;
+			size_t qword = w >> 6ull;
+			size_t bit   = w & 63ull;
+			assert((list_mask[qword] & (1ul << bit)) != 0);
+			__attribute__((unused)) bool ret = btr(list_mask[qword], bit);
+			assert(ret);
+			assert((list_mask[qword] & (1ul << bit)) == 0);
+		}
+
+		// Unlock and return
+		list.lock.unlock();
+		assert(numNonEmpty >= 0);
+		#ifndef NO_STATS
+			tls.pick.pop.success++;
+			tls.empty.pop.value += num;
+			tls.empty.pop.count += 1;
+		#endif
+		return node;
+	}
 
 private:
@@ -162,10 +294,8 @@
 		struct stat {
 			ssize_t diff = 0;
-
-			static struct Dif {
-				ssize_t value = 0;
-				size_t  num   = 0;
-				ssize_t max   = 0;
-			} dif;
+			size_t  push = 0;
+			size_t  pop  = 0;
+			// size_t value = 0;
+			// size_t count = 0;
 		};
 
@@ -178,7 +308,12 @@
 		sentinel_t before;
 		sentinel_t after;
-		stat s;
-
+		#ifndef NO_STATS
+			stat s;
+		#endif
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winvalid-offsetof"
 		static constexpr auto fields_offset = offsetof( node_t, _links );
+#pragma GCC diagnostic pop
 	public:
 		intrusive_queue_t()
@@ -186,21 +321,15 @@
 			, after {{ head(), nullptr }}
 		{
-			assert((reinterpret_cast<uintptr_t>( head() ) + fields_offset) == reinterpret_cast<uintptr_t>(&before));
-			assert((reinterpret_cast<uintptr_t>( tail() ) + fields_offset) == reinterpret_cast<uintptr_t>(&after ));
-			assert(head()->_links.prev == nullptr);
-			assert(head()->_links.next == tail() );
-			assert(tail()->_links.next == nullptr);
-			assert(tail()->_links.prev == head() );
-			assert(sizeof(*this) == 128);
-			assert((intptr_t(this) % 128) == 0);
-		}
-
-		~intrusive_queue_t() {
-			#ifndef NO_STATS
-				stat::dif.value+= s.diff;
-				stat::dif.num  ++;
-				stat::dif.max  = std::abs(stat::dif.max) > std::abs(s.diff) ? stat::dif.max : s.diff;
-			#endif
-		}
+			/* paranoid */ assert((reinterpret_cast<uintptr_t>( head() ) + fields_offset) == reinterpret_cast<uintptr_t>(&before));
+			/* paranoid */ assert((reinterpret_cast<uintptr_t>( tail() ) + fields_offset) == reinterpret_cast<uintptr_t>(&after ));
+			/* paranoid */ assert(head()->_links.prev == nullptr);
+			/* paranoid */ assert(head()->_links.next == tail() );
+			/* paranoid */ assert(tail()->_links.next == nullptr);
+			/* paranoid */ assert(tail()->_links.prev == head() );
+			/* paranoid */ assert(sizeof(*this) == 128);
+			/* paranoid */ assert((intptr_t(this) % 128) == 0);
+		}
+
+		~intrusive_queue_t() = default;
 
 		inline node_t * head() const {
@@ -220,5 +349,5 @@
 		}
 
-		inline void push(node_t * node, std::atomic_int & nonEmpty) {
+		inline bool push(node_t * node) {
 			assert(lock);
 			assert(node->_links.ts != 0);
@@ -232,14 +361,19 @@
 			prev->_links.next = node;
 			tail->_links.prev = node;
+			#ifndef NO_STATS
+				if(enable_stats) {
+					s.diff++;
+					s.push++;
+				}
+			#endif
 			if(before._links.ts == 0l) {
-				nonEmpty += 1;
 				before._links.ts = node->_links.ts;
-			}
-			#ifndef NO_STATS
-				if(enable_stats) s.diff++;
-			#endif
-		}
-
-		inline node_t * pop(std::atomic_int & nonEmpty) {
+				assert(node->_links.prev == this->head());
+				return true;
+			}
+			return false;
+		}
+
+		inline std::pair<node_t *, bool> pop() {
 			assert(lock);
 			node_t * head = this->head();
@@ -248,12 +382,18 @@
 			node_t * node = head->_links.next;
 			node_t * next = node->_links.next;
-			if(node == tail) return nullptr;
+			if(node == tail) return {nullptr, false};
 
 			head->_links.next = next;
 			next->_links.prev = head;
 
+			#ifndef NO_STATS
+				if(enable_stats) {
+					s.diff--;
+					s.pop ++;
+				}
+			#endif
 			if(next == tail) {
 				before._links.ts = 0l;
-				nonEmpty -= 1;
+				return {node, true};
 			}
 			else {
@@ -261,9 +401,6 @@
 				before._links.ts = next->_links.ts;
 				assert(before._links.ts != 0);
-			}
-			#ifndef NO_STATS
-				if(enable_stats) s.diff--;
-			#endif
-			return node;
+				return {node, false};
+			}
 		}
 
@@ -277,10 +414,13 @@
 
 	static __attribute__((aligned(128))) thread_local struct TLS {
-		Random    rng = { int(rdtscl()) };
-		pick_stat pick;
+		Random     rng = { int(rdtscl()) };
+		pick_stat  pick;
+		empty_stat empty;
 	} tls;
 
+public:
+	std::atomic_int numNonEmpty  = { 0 };  // number of non-empty lists
+	std::atomic_size_t list_mask[7] = { {0}, {0}, {0}, {0}, {0}, {0}, {0} }; // which queues are empty
 private:
-	std::atomic_int numNonEmpty; // number of non-empty lists
     	__attribute__((aligned(64))) std::unique_ptr<intrusive_queue_t []> lists;
 	const unsigned numLists;
@@ -288,3 +428,92 @@
 public:
 	static const constexpr size_t sizeof_queue = sizeof(intrusive_queue_t);
+
+#ifndef NO_STATS
+	static void stats_print(std::ostream & os) {
+		auto it = head;
+		while(it) {
+			it->stats_print_local(os);
+			it = it->next;
+		}
+	}
+
+	static void stats_tls_tally() {
+		global_stats.pick.push.attempt += tls.pick.push.attempt;
+		global_stats.pick.push.success += tls.pick.push.success;
+		global_stats.pick.pop .attempt += tls.pick.pop.attempt;
+		global_stats.pick.pop .success += tls.pick.pop.success;
+		global_stats.pick.pop .mask_attempt += tls.pick.pop.mask_attempt;
+
+		global_stats.qstat.push.value += tls.empty.push.value;
+		global_stats.qstat.push.count += tls.empty.push.count;
+		global_stats.qstat.pop .value += tls.empty.pop .value;
+		global_stats.qstat.pop .count += tls.empty.pop .count;
+	}
+
+private:
+	static struct GlobalStats {
+		struct {
+			struct {
+				std::atomic_size_t attempt = { 0 };
+				std::atomic_size_t success = { 0 };
+			} push;
+			struct {
+				std::atomic_size_t attempt = { 0 };
+				std::atomic_size_t success = { 0 };
+				std::atomic_size_t mask_attempt = { 0 };
+			} pop;
+		} pick;
+		struct {
+			struct {
+				std::atomic_size_t value = { 0 };
+				std::atomic_size_t count = { 0 };
+			} push;
+			struct {
+				std::atomic_size_t value = { 0 };
+				std::atomic_size_t count = { 0 };
+			} pop;
+		} qstat;
+	} global_stats;
+
+	// Link list of all lists for stats
+	__attribute__((aligned(64))) relaxed_list<node_t> * next = nullptr;
+
+	static relaxed_list<node_t> * head;
+
+	void stats_print_local(std::ostream & os ) {
+		std::cout << "----- Relaxed List Stats -----" << std::endl;
+		{
+			ssize_t diff = 0;
+			size_t  num  = 0;
+			ssize_t max  = 0;
+
+			for(size_t i = 0; i < numLists; i++) {
+				const auto & list = lists[i];
+				diff+= list.s.diff;
+				num ++;
+				max  = std::abs(max) > std::abs(list.s.diff) ? max : list.s.diff;
+				os << "Local Q ops   : " << (list.s.push + list.s.pop) << "(" << list.s.push << "i, " << list.s.pop << "o)\n";
+			}
+
+			os << "Difference   : " << ssize_t(double(diff) / num  ) << " avg\t" << max << "max" << std::endl;
+		}
+
+		const auto & global = global_stats;
+
+		double push_sur = (100.0 * double(global.pick.push.success) / global.pick.push.attempt);
+		double pop_sur  = (100.0 * double(global.pick.pop .success) / global.pick.pop .attempt);
+		double mpop_sur = (100.0 * double(global.pick.pop .success) / global.pick.pop .mask_attempt);
+
+		os << "Push   Pick % : " << push_sur << "(" << global.pick.push.success << " / " << global.pick.push.attempt << ")\n";
+		os << "Pop    Pick % : " << pop_sur  << "(" << global.pick.pop .success << " / " << global.pick.pop .attempt << ")\n";
+		os << "TryPop Pick % : " << mpop_sur << "(" << global.pick.pop .success << " / " << global.pick.pop .mask_attempt << ")\n";
+
+		double avgQ_push = double(global.qstat.push.value) / global.qstat.push.count;
+		double avgQ_pop  = double(global.qstat.pop .value) / global.qstat.pop .count;
+		double avgQ      = double(global.qstat.push.value + global.qstat.pop .value) / (global.qstat.push.count + global.qstat.pop .count);
+		os << "Push   Avg Qs : " << avgQ_push << " (" << global.qstat.push.count << "ops)\n";
+		os << "Pop    Avg Qs : " << avgQ_pop  << " (" << global.qstat.pop .count << "ops)\n";
+		os << "Global Avg Qs : " << avgQ      << " (" << (global.qstat.push.count + global.qstat.pop .count) << "ops)\n";
+	}
+#endif
 };
Index: doc/theses/thierry_delisle_PhD/code/relaxed_list_layout.cpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/relaxed_list_layout.cpp	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
+++ doc/theses/thierry_delisle_PhD/code/relaxed_list_layout.cpp	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
@@ -0,0 +1,23 @@
+#define NO_IO
+#define NDEBUG
+#include "relaxed_list.hpp"
+
+struct __attribute__((aligned(64))) Node {
+	static std::atomic_size_t creates;
+	static std::atomic_size_t destroys;
+
+	_LinksFields_t<Node> _links;
+
+	int value;
+	Node(int value): value(value) {
+		creates++;
+	}
+
+	~Node() {
+		destroys++;
+	}
+};
+
+int main() {
+	return sizeof(relaxed_list<Node>) + relaxed_list<Node>::sizeof_queue;
+}
Index: doc/theses/thierry_delisle_PhD/code/scale.sh
===================================================================
--- doc/theses/thierry_delisle_PhD/code/scale.sh	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
+++ doc/theses/thierry_delisle_PhD/code/scale.sh	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
@@ -0,0 +1,7 @@
+#!/bin/bash
+taskset -c 24-31 ./a.out -t  1 -b churn | grep --color -E "(ns|Ops|Running)"
+taskset -c 24-31 ./a.out -t  2 -b churn | grep --color -E "(ns|Ops|Running)"
+taskset -c 24-31 ./a.out -t  4 -b churn | grep --color -E "(ns|Ops|Running)"
+taskset -c 24-31 ./a.out -t  8 -b churn | grep --color -E "(ns|Ops|Running)"
+taskset -c 16-31 ./a.out -t 16 -b churn | grep --color -E "(ns|Ops|Running)"
+taskset -c  0-31 ./a.out -t 32 -b churn | grep --color -E "(ns|Ops|Running)"
Index: doc/theses/thierry_delisle_PhD/code/utils.hpp
===================================================================
--- doc/theses/thierry_delisle_PhD/code/utils.hpp	(revision 92a97685f5aabd9745b94f924132f4ef5f9d6868)
+++ doc/theses/thierry_delisle_PhD/code/utils.hpp	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
@@ -10,4 +10,6 @@
 #include <unistd.h>
 #include <sys/sysinfo.h>
+
+#include <x86intrin.h>
 
 // Barrier from
@@ -56,5 +58,5 @@
 }
 
-void affinity(int tid) {
+static inline void affinity(int tid) {
 	static int cpus = get_nprocs();
 
@@ -70,5 +72,5 @@
 
 static const constexpr std::size_t cache_line_size = 64;
-void check_cache_line_size() {
+static inline void check_cache_line_size() {
 	std::cout << "Checking cache line size" << std::endl;
 	const std::string cache_file = "/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size";
@@ -103,2 +105,40 @@
 	return std::chrono::duration_cast<std::chrono::duration<T, Ratio>>(std::chrono::duration<T>(seconds)).count();
 }
+
+static inline unsigned rand_bit(unsigned rnum, size_t mask) {
+	unsigned bit = mask ? rnum % __builtin_popcountl(mask) : 0;
+#if !defined(__BMI2__)
+	uint64_t v = mask;   // Input value to find position with rank r.
+	unsigned int r = bit + 1;// Input: bit's desired rank [1-64].
+	unsigned int s;      // Output: Resulting position of bit with rank r [1-64]
+	uint64_t a, b, c, d; // Intermediate temporaries for bit count.
+	unsigned int t;      // Bit count temporary.
+
+	// Do a normal parallel bit count for a 64-bit integer,
+	// but store all intermediate steps.
+	a =  v - ((v >> 1) & ~0UL/3);
+	b = (a & ~0UL/5) + ((a >> 2) & ~0UL/5);
+	c = (b + (b >> 4)) & ~0UL/0x11;
+	d = (c + (c >> 8)) & ~0UL/0x101;
+
+
+	t = (d >> 32) + (d >> 48);
+	// Now do branchless select!
+	s  = 64;
+	s -= ((t - r) & 256) >> 3; r -= (t & ((t - r) >> 8));
+	t  = (d >> (s - 16)) & 0xff;
+	s -= ((t - r) & 256) >> 4; r -= (t & ((t - r) >> 8));
+	t  = (c >> (s - 8)) & 0xf;
+	s -= ((t - r) & 256) >> 5; r -= (t & ((t - r) >> 8));
+	t  = (b >> (s - 4)) & 0x7;
+	s -= ((t - r) & 256) >> 6; r -= (t & ((t - r) >> 8));
+	t  = (a >> (s - 2)) & 0x3;
+	s -= ((t - r) & 256) >> 7; r -= (t & ((t - r) >> 8));
+	t  = (v >> (s - 1)) & 0x1;
+	s -= ((t - r) & 256) >> 8;
+	return s - 1;
+#else
+	uint64_t picked = _pdep_u64(1ul << bit, mask);
+	return picked ? __builtin_ctzl(picked) : 0;
+#endif
+}
Index: doc/theses/thierry_delisle_PhD/comp_II/Makefile
===================================================================
--- doc/theses/thierry_delisle_PhD/comp_II/Makefile	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
+++ doc/theses/thierry_delisle_PhD/comp_II/Makefile	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
@@ -0,0 +1,84 @@
+## Define the configuration variables.
+
+Build = build
+Figures = figures
+Macros = ../../../LaTeXmacros
+TeXLIB = .:${Macros}:${Build}:../../../bibliography:
+LaTeX  = TEXINPUTS=${TeXLIB} && export TEXINPUTS && latex -halt-on-error -output-directory=${Build}
+BibTeX = BIBINPUTS=${TeXLIB} && export BIBINPUTS && bibtex
+
+MAKEFLAGS = --no-print-directory --silent #
+VPATH = ${Build} ${Figures}
+
+## Define the text source files.
+
+SOURCES = ${addsuffix .tex, \
+comp_II \
+}
+
+FIGURES = ${addsuffix .tex, \
+}
+
+PICTURES = ${addsuffix .pstex, \
+}
+
+PROGRAMS = ${addsuffix .tex, \
+}
+
+GRAPHS = ${addsuffix .tex, \
+}
+
+## Define the documents that need to be made.
+
+DOCUMENT = comp_II.pdf
+BASE = ${basename ${DOCUMENT}}
+
+# Directives #
+
+.PHONY : all clean					# not file names
+
+all : ${DOCUMENT}
+
+clean :
+	@rm -frv ${DOCUMENT} ${BASE}.ps ${Build}
+
+# File Dependencies #
+
+${DOCUMENT} : ${BASE}.ps
+	ps2pdf $<
+
+${BASE}.ps : ${BASE}.dvi
+	dvips ${Build}/$< -o $@
+
+${BASE}.dvi : Makefile ${GRAPHS} ${PROGRAMS} ${PICTURES} ${FIGURES} ${SOURCES} \
+		${Macros}/common.tex ${Macros}/indexstyle ../../../bibliography/pl.bib \
+		local.bib glossary.tex | ${Build}
+	# Must have *.aux file containing citations for bibtex
+	if [ ! -r ${basename $@}.aux ] ; then ${LaTeX} ${basename $@}.tex ; fi
+	-${BibTeX} ${Build}/${basename $@}
+	# Some citations reference others so run again to resolve these citations
+	${LaTeX} ${basename $@}.tex
+	-${BibTeX} ${Build}/${basename $@}
+	# Make index from *.aux entries and input index at end of document
+	makeglossaries -q -s ${Build}/${basename $@}.ist ${Build}/${basename $@}
+	# Run again to finish citations
+	${LaTeX} ${basename $@}.tex
+
+## Define the default recipes.
+
+${Build}:
+	mkdir -p ${Build}
+
+%.tex : %.fig ${Build}
+	fig2dev -L eepic $< > ${Build}/$@
+
+%.ps : %.fig | ${Build}
+	fig2dev -L ps $< > ${Build}/$@
+
+%.pstex : %.fig | ${Build}
+	fig2dev -L pstex $< > ${Build}/$@
+	fig2dev -L pstex_t -p ${Build}/$@ $< > ${Build}/$@_t
+
+# Local Variables: #
+# compile-command: "make" #
+# End: #
Index: doc/theses/thierry_delisle_PhD/comp_II/comp_II.tex
===================================================================
--- doc/theses/thierry_delisle_PhD/comp_II/comp_II.tex	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
+++ doc/theses/thierry_delisle_PhD/comp_II/comp_II.tex	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
@@ -0,0 +1,191 @@
+\documentclass[11pt,fullpage]{article}
+\usepackage[T1]{fontenc}
+\usepackage[utf8]{inputenc}
+\usepackage{listings}		% for code listings
+\usepackage{xspace}
+\usepackage{xcolor}
+\usepackage{graphicx}
+\usepackage[hidelinks]{hyperref}
+\usepackage{glossaries}
+\usepackage{textcomp}
+\usepackage{geometry}
+
+% cfa macros used in the document
+\input{common}
+\input{glossary}
+
+\CFAStyle				% use default CFA format-style
+
+\title{
+	\Huge \vspace*{1in} The \CFA Scheduler\\
+	\huge \vspace*{0.25in} PhD Comprehensive II Research Proposal
+	\vspace*{1in}
+}
+
+\author{
+	\huge Thierry Delisle \\
+	\Large \vspace*{0.1in} \texttt{tdelisle@uwaterloo.ca} \\
+	\Large Cheriton School of Computer Science \\
+	\Large University of Waterloo
+}
+
+\date{
+	\today
+}
+
+\begin{document}
+\maketitle
+\cleardoublepage
+
+\newcommand{\cit}{\textsuperscript{[Citation Needed]}\xspace}
+\newcommand{\TODO}{~\newline{\large\bf\color{red} TODO :}\xspace}
+
+% ===============================================================================
+% ===============================================================================
+
+\tableofcontents
+
+% ===============================================================================
+% ===============================================================================
+\newpage
+\section{Introduction}
+\subsection{\CFA and the \CFA concurrency package}
+\CFA\cit is a modern, polymorphic, non-object-oriented, backwards-compatible extension of the C programming language. It aims to add high productivity features while maintaining the predictable performance of C. As such concurrency in \CFA\cit aims to offer simple and safe high-level tools while still allowing performant code. Concurrent code is written in the synchronous programming paradigm but uses \glspl{uthrd} in order to achieve the simplicity and maintainability of synchronous programming without sacrificing the efficiency of asynchronous programming. As such the \CFA scheduler is a user-level scheduler that maps \glspl{uthrd} onto \glspl{kthrd}.
+
+The goal of this research is to produce a scheduler that is simple to use and offers acceptable performance in all cases. Here simplicity does not refer to the API but to how much scheduling concerns programmers need to take into account when using the \CFA concurrency package. Therefore, the main goal of this proposal is as follows :
+\begin{quote}
+The \CFA scheduler should be \emph{viable} for any workload.
+\end{quote}
+
+This objective includes producing a scheduling strategy with minimal fairness guarantees, creating an abstraction layer over the operating system to handle kernel-threads spinning unnecessarily and hide blocking I/O operations, and writing sufficient library tools to allow developers to properly use the scheduler.
+
+% ===============================================================================
+% ===============================================================================
+
+\section{Scheduling for \CFA}
+While the \CFA concurrency package doesn't have any particular scheduling needs beyond those of any concurrency package which uses \glspl{uthrd}, it is important that the default \CFA Scheduler be viable in general. Indeed, since the \CFA Scheduler does not target any specific workloads, it is unrealistic to demand that it use the best scheduling strategy in all cases. However, it should offer a viable ``out of the box'' solution for most scheduling problems so that programmers can quickly write performant concurrent code without needing to think about which scheduling strategy is more appropriate for their workload. Indeed, only programmers with exceptionally high performance requirements should need to write their own scheduler. More specifically, two broad types of scheduling strategies should be avoided in order to avoid penalizing certain types of workloads : feedback-based and priority schedulers.
+
+\subsection{Feedback-Based Schedulers}
+Many operating systems use schedulers based on feedback loops in some form: they measure how much CPU a particular thread has used\footnote{Different metrics can be used here but it is not relevant to the discussion.} and schedule threads based on this metric. These strategies are sensible for operating systems but rely on two assumptions about the workload :
+
+\begin{enumerate}
+	\item Threads live long enough to be scheduled many times.
+	\item Cooperation among all threads is not simply infeasible, it is a security risk.
+\end{enumerate}
+
+While these two assumptions generally hold for operating systems, they may not for \CFA programs. In fact, \CFA uses \glspl{uthrd} which have the explicit goal of reducing the cost of threading primitives to allow many smaller threads. This can naturally lead to threads with much shorter lifetimes that are only scheduled a few times. Scheduling strategies based on feedback loops cannot be effective in these cases because they will not have the opportunity to measure the metrics that underlie the algorithm. Note that the problem of feedback loop convergence (reacting too slowly to scheduling events) is not specific to short-lived threads but can also occur with threads that show drastic changes in scheduling events, e.g., threads running for long periods of time and then suddenly blocking and unblocking quickly and repeatedly.
+
+In the context of operating systems, these concerns can be overshadowed by a more pressing concern : security. When multiple users are involved, it is possible that some users are malevolent and try to exploit the scheduling strategy in order to achieve some nefarious objective. Security concerns mean that more precise and robust fairness metrics must be used. In the case of the \CFA scheduler, every thread runs in the same user-space and are controlled from the same user. It is then possible to safely ignore the possibility that threads are malevolent and assume that all threads will ignore or cooperate with each other. This allows for a much simpler fairness metric and in this proposal ``fairness'' will be considered as equal opportunities to run once scheduled.
+
+Since feedback is not necessarily feasible within the lifetime of all threads and a simple fairness metric can be used, the scheduling strategy proposed for the \CFA runtime does not use per-thread feedback. Feedback loops in general are not rejected for secondary concerns like idle sleep, but no feedback loop is used to decide which thread to run next.
+
+\subsection{Priority Schedulers}
+Another broad category of schedulers are priority schedulers. In these scheduling strategies threads have priorities and the runtime schedules the threads with the highest priority before scheduling other threads. Threads with equal priority are scheduled using a secondary strategy, often something simple like round-robin or FIFO. These priorities mean that, as long as there is a thread with a higher priority that desires to run, a thread with a lower priority will not run. This possible starvation of threads can dramatically increase programming complexity since starving threads and priority inversion (prioritising a lower priority thread) can both lead to serious problems, leaving programmers between a rock and a hard place.
+
+An important observation to make is that threads do not need to have explicit priorities for problems to be possible. Indeed, any system with multiple ready-queues that attempts to exhaust one queue before accessing the other queues could encounter starvation problems. A popular scheduling strategy that suffers from implicit priorities is work-stealing. Work-stealing is generally presented as follows :
+
+\begin{itemize}
+	\item Each processor has a list of threads.
+\end{itemize}
+\begin{enumerate}
+	\item Run threads from ``this'' processor's list.
+	\item If ``this'' processor's list is empty, run threads from some other processor's list.
+\end{enumerate}
+
+In a loaded system\footnote{A loaded system is a system where threads are being run at the same rate they are scheduled}, if a thread does not yield or block for an extended period of time, threads on the same processor list will starve if no other processors can exhaust their list.
+
+Since priorities can be complex to handle for programmers, the scheduling strategy proposed for the \CFA runtime does not use a strategy with either implicit or explicit thread priorities.
+
+\subsection{Schedulers without feedback or priorities}
+I claim that the ideal default scheduler for the \CFA runtime is a scheduler that offers good scalability and a simple fairness guarantee that is easy for programmers to reason about. The simplest fairness guarantee is to guarantee FIFO ordering, i.e., threads scheduled first will run first. However, enforcing FIFO ordering generally conflicts with scalability across multiple processors because of the additional synchronization. Thankfully, strict FIFO is not needed for scheduling. Since concurrency is inherently non-deterministic, fairness concerns in scheduling are only a problem if a thread repeatedly runs before another thread can run\footnote{This is because the non-determinism means that programmers must already handle ordering problems in order to produce correct code and already must rely on weak guarantees, for example that a specific thread will \emph{eventually} run.}. This need for unfairness to persist before problems occur means that the FIFO fairness guarantee can be significantly relaxed without causing problems. For this proposal, the target guarantee is that the \CFA scheduler guarantees \emph{probable} FIFO ordering, which is defined as follows :
+\begin{itemize}
+	\item Given two threads $X$ and $Y$, the odds that thread $X$ runs $N$ times \emph{after} thread $Y$ is scheduled but \emph{before} it is run, decreases exponentially with regards to $N$.
+\end{itemize}
+
+While this is not a strong guarantee, the probability that problems persist for long period of times decreases exponentially, making persisting problems virtually impossible.
+
+\subsection{Real-Time}
+While the objective of this proposed scheduler is similar to the objective of real-time scheduling, this proposal is not a proposal for real-time scheduler and as such makes no attempt to offer either soft or hard guarantees on scheduling delays.
+
+% ===============================================================================
+% ===============================================================================
+\section{Proposal}
+
+\subsection{Ready-Queue}
+Using Trevor's paper\cit as a basis, it is simple to build a relaxed FIFO list that is fast and scalable for loaded or overloaded systems. The described queue uses an array of underlying strictly FIFO queues. Pushing new data is done by selecting one of these underlying queues at random, recording a timestamp for the push and pushing to the selected queue. Popping is done by selecting two queues at random and popping from the queue for which the head has the oldest timestamp. In loaded or overloaded systems, it is highly likely that the queues are far from empty, i.e., several tasks are on each of the underlying queues. This means that selecting a queue at random to pop from is highly likely to yield a queue that is not empty.
+
+When the ready queue is ``more empty'', i.e., several of the inner queues are empty, selecting a random queue for popping is less likely to yield a valid selection and more attempts need to be made, resulting in a performance degradation. In cases with few elements on the ready queue and few processors running, performance can be improved by adding information to help processors find which inner queues are used. Preliminary performance tests indicate that with few processors, a bitmask can be used to identify which inner queues are currently in use. This is especially effective in the single-thread case, where the bitmask will always be up-to-date. Furthermore, modern x86 CPUs have a BMI2 extension which allows using the bitmask with very little overhead over directly accessing the ready-queue, offering decent performance even in cases with many empty inner queues. This technique does not solve the problem completely, it randomly attempts to find a block of 64 queues where at least one is used, instead of attempting to find a used queue. For systems with a large number of cores this does not completely solve the problem, but it is a fixed improvement. The size of the blocks is limited by the maximum size atomic instructions can operate on, therefore atomic instructions on large words would increase the 64 queues per block limit.
+
+\TODO double check the next sentence
+Preliminary results indicate that the bitmask approach with the BMI2 extension can lead to multi-threaded performance that is contention agnostic in the worst case.
+This result suggests that the contention penalty and the increased performance from additional threads cancel each other exactly. This may indicate that a relatively small reduction in contention may tip the performance into positive scaling even for the worst case. It can be noted that in cases of high-contention, the use of the bitmask to find queues that are not empty is much less reliable. Indeed, if contention on the bitmask is high, it means it probably changes significantly between the moment it is read and the actual operation on the queues it represents. Furthermore, the objective of the bitmask is to avoid probing queues that are empty. Therefore, in cases where the bitmask is highly contended, it may be preferable to probe queues randomly, either until contention decreases or until a prior prefetch of the bitmask completes. Ideally, the scheduler would be able to observe that the bitmask is highly contended and adjust its behaviour appropriately. However, I am not aware of any mechanism to query whether a cacheline is in cache or to run other instructions until a cacheline is fetched without blocking on the cacheline. As such, an alternative that may have a similar impact would be for each thread to have its own bitmask, which would be updated both after each scheduler action and after a certain number of failed probes. If the bitmask has little contention, the local bitmask will be mostly up-to-date and several threads won't need to contend as much on the global bitmask. If the bitmask has significant contention, then fetching it becomes more expensive and threads may as well probe randomly. This solution claims that probing randomly or against an out-of-date bitmask is equivalent.
+
+In cases where this is insufficient, another approach is to use a hierarchical data structure. Creating a tree of nodes to reduce contention has been shown to work in similar cases\cit(SNZI: Scalable NonZero Indicators)\footnote{This particular paper seems to be patented in the US. How does that affect \CFA? Can I use it in my work?}. However, this approach may lead to poorer single-threaded performance due to the inherent pointer chasing, as such, it was not considered as the first approach but as a fallback in case the bitmask approach does not satisfy the performance goals.
+
+Part of this performance relies on contention being low when there are few threads on the ready-queue. However, this can be assumed reliably if the system handles putting idle processors to sleep, which is addressed in section \ref{sleep}.
+
+\paragraph{Objectives and Existing Work}
+How much scalability is actually needed is highly debatable; libfibre\cit has compared favorably to other schedulers in webserver tests\cit and uses a single atomic counter in its scheduling algorithm similarly to the proposed bitmask. As such the single atomic instruction on a shared cacheline may be sufficiently performant.
+
+I have built a prototype of this ready-queue (including the bitmask and BMI2 usage, but not the sharded bitmask) and ran performance experiments on it but it is difficult to compare this prototype to a thread scheduler as the prototype is used as a data-queue. I have also integrated this prototype into the \CFA runtime, but have not yet created performance experiments to compare results. I believe that the bitmask approach is currently one of the larger risks of the proposal, early tests lead me to believe it may work but it is not clear that the contention problem can be overcome. The worst-case scenario is a case where the number of processors and the number of ready threads are similar, yet scheduling events are very frequent. Fewer threads should lead to the Idle Sleep mechanism reducing contention while having many threads ready leads to optimal performance. It is difficult to evaluate the likelihood of this worst-case scenario in real workloads. I believe frequent scheduling events suggest a more ``bursty'' workload where new work is finely divided among many threads which race to completion. This type of workload would only see a peak of contention close to the end of the work, but no sustained contention. Very fine-grained pipelines are less ``bursty'', these may lead to more sustained contention. However, they could also easily benefit from a direct hand-off strategy which would circumvent the problem entirely.
+
+\subsection{Dynamic Resizing}
+The \CFA runtime system currently handles dynamically adding and removing processors from clusters at any time. Since this is part of the existing design, the proposed scheduler must also support this behaviour. However, dynamically resizing the clusters is considered a rare event associated with setup, teardown and major configuration changes. This assumption is made both in the design of the proposed scheduler as well as in the original design of the \CFA runtime system. As such, the proposed scheduler must honor the correctness of this behaviour but does not have any performance objectives with regards to resizing a cluster. How long adding or removing processors takes and how much this disrupts the performance of other threads is considered a secondary concern since it should be amortized over long periods of time. This description effectively matches the description of a Reader-Writer lock, infrequent but invasive updates among frequent (mostly) read operations. In the case of the Ready-Queue described above, read operations are operations that push or pop from the ready-queue but do not invalidate any references to the ready queue data structures. Writes on the other hand would add or remove inner queues, invalidating references to the array of inner queues in the process. Therefore, the current proposed approach to this problem is to add a per-cluster Reader Writer lock around the ready queue to prevent restructuring of the ready-queue data structure while threads are being pushed or popped.
+
+There are possible alternatives to the Reader Writer lock solution. This problem is effectively a memory reclamation problem and as such there is a large body of research on the subject. However, the RWlock solution is simple and can be leveraged to solve other problems (e.g. processor ordering and memory reclamation of threads) which makes it an attractive solution.
+
+\paragraph{Objectives and Existing Work}
+The lock must offer scalability and performance on par with the actual ready-queue in order not to introduce a new bottleneck. I have already built a lock that fits the desired requirements and preliminary testing shows scalability and performance that exceed the target. As such, I do not consider this lock to be a risk on this project.
+
+\subsection{Idle Sleep} \label{sleep}
+As mentioned above, idle sleep is the process of putting processors to sleep while they do not have threads to execute. In this context processors are kernel-threads and sleeping refers to asking the kernel to block a thread. This can be achieved with either thread synchronization operations like pthread\_cond\_wait or using signal operations like sigsuspend.
+
+Support for idle sleep broadly involves calling the operating system to block the kernel thread but also handling the race between the sleeping and the waking up, and handling which kernel thread should sleep or wake-up.
+
+When a processor decides to sleep, there is a race that occurs between it signalling that it will go to sleep (so other processors can find sleeping processors) and actually blocking the kernel thread. This is equivalent to the classic problem of missing signals when using condition variables, the ``sleepy'' processor indicates that it will sleep but has not yet gone to sleep, if another processor attempts to wake it up, the waking-up operation may claim nothing needs to be done and the signal will have been missed. In cases where threads are scheduled from processors on the current cluster, losing signals is not necessarily critical, because at least some processors on the cluster are awake. Individual processors always finish scheduling threads before looking for new work, which means that the last processor to go to sleep cannot miss threads scheduled from inside the cluster (if they do, that demonstrates the ready-queue is not linearizable). However, this guarantee does not hold if threads are scheduled from outside the cluster, either due to an external event like timers and I/O, or due to a thread migrating from a different cluster. In this case, missed signals can lead to the cluster deadlocking where it should not\footnote{Clusters ``should'' never deadlock, but for this proposal, cases where \CFA users \emph{actually} wrote \CFA code that leads to a deadlock it is considered as a deadlock that ``should'' happen. }. Therefore, it is important that the scheduling of threads include a mechanism where signals \emph{cannot} be missed. For performance reasons, it can be advantageous to have a secondary mechanism that allows signals to be missed in cases where it cannot lead to a deadlock.
+To be safe, this process must include a ``handshake'' where it is guaranteed that either~: the sleepy processor notices that a thread was scheduled after it signalled its intent to block or code scheduling threads will see the intent to sleep before scheduling and be able to wake-up the processor. This matter is complicated by the fact that pthread offers few tools to implement this solution and offers no guarantee of ordering of threads waking up for most of these tools.
+
+Another issue is trying to avoid kernel sleeping and waking frequently. A possible partial solution is to order the processors so that the one which most recently went to sleep is woken up. This allows other sleeping processors to reach deeper sleep states (when these are available) while keeping ``hot'' processors warmer. Note that while this generally means organising the processors in a stack, I believe that the unique index provided by the ReaderWriter lock can be reused to strictly order the waking order of processors, causing a LIFO-like waking order. While a strict LIFO stack is probably better, using the processor index could prove useful and offer a sufficiently LIFO ordering.
+
+Finally, another important aspect of Idle Sleep is when should processors make the decision to sleep and when it is appropriate for sleeping processors to be woken up. Processors that are unnecessarily awake lead to unnecessary contention and power consumption, while too many sleeping processors can lead to sub-optimal throughput. Furthermore, transitions from sleeping to awake and vice-versa also add unnecessary latency. There is already a wealth of research on the subject and I do not plan to implement a novel idea for the Idle Sleep heuristic in this project.
+
+\subsection{Asynchronous I/O}
+The final aspect of this proposal is asynchronous I/O. Without it, user threads that execute I/O operations will block the underlying kernel thread. This leads to poor throughput; it would be preferable to block the user-thread and reuse the underlying kernel-thread to run other ready threads. This requires intercepting the user-threads' calls to I/O operations, redirecting them to an asynchronous I/O interface and handling the multiplexing between the synchronous and asynchronous API. As such, these are the three components that need to be implemented to support asynchronous I/O : an OS abstraction layer over the asynchronous interface, an event-engine to (de)multiplex the operations and a synchronous interface for users to use. None of these components currently exist in \CFA and I will need to build all three for this project.
+
+\paragraph{OS Abstraction}
+One of the fundamental parts of this proposal is converting blocking I/O operations into non-blocking ones. This relies on having an underlying asynchronous I/O interface to which to direct the I/O operations. While there exist many different APIs for asynchronous I/O, it is not part of this proposal to create a novel API, simply to use an existing one that is sufficient. uC++ uses \texttt{select} as its interface, which handles pipes and sockets. It entails significant complexity and has performance problems which make it a less interesting alternative. Another interface which is becoming popular recently\cit is \texttt{epoll}. However, epoll also does not handle the file system and seems to have problems with Linux pipes and \texttt{TTY}s\cit. A very recent alternative that must still be investigated is \texttt{io\_uring}. It claims to address some of the issues with \texttt{epoll} but is too recent to be confident that it does. Finally, a popular cross-platform alternative is \texttt{libuv}, which offers asynchronous sockets and asynchronous file system operations (among other features). However, as a full-featured library it includes much more than what is needed and could conflict with other features of \CFA unless significant efforts are made to merge them together.
+
+\paragraph{Event-Engine}
+Laying on top of the asynchronous interface layer is the event-engine. This engine is responsible for multiplexing (batching) the synchronous I/O requests into an asynchronous I/O request and demultiplexing the results onto appropriate blocked threads. This can be straightforward for the simple cases, but can become quite complex. Decisions that will need to be made include : whether to poll from a separate kernel thread or a regularly scheduled user thread, what should be the ordering used when results satisfy many requests, how to handle threads waiting for multiple operations, etc.
+
+\paragraph{Interface}
+Finally, for these components to be available, it is necessary to expose them through a synchronous interface. This can be a novel interface but it is preferable to attempt to intercept the existing POSIX interface in order to be compatible with existing code. This will allow C programs written using this interface to be transparently converted to \CFA with minimal effort. Where this is not applicable, a novel interface will be created to fill the gaps.
+
+
+% ===============================================================================
+% ===============================================================================
+\section{Discussion}
+
+
+% ===============================================================================
+% ===============================================================================
+\section{Timeline}
+
+
+\cleardoublepage
+
+% B I B L I O G R A P H Y
+% -----------------------------
+\addcontentsline{toc}{chapter}{Bibliography}
+\bibliographystyle{plain}
+\bibliography{pl,local}
+\cleardoublepage
+\phantomsection		% allows hyperref to link to the correct page
+
+% G L O S S A R Y
+% -----------------------------
+\addcontentsline{toc}{chapter}{Glossary}
+\printglossary
+\cleardoublepage
+\phantomsection		% allows hyperref to link to the correct page
+
+\end{document}
Index: doc/theses/thierry_delisle_PhD/comp_II/comp_II_too_big.tex
===================================================================
--- doc/theses/thierry_delisle_PhD/comp_II/comp_II_too_big.tex	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
+++ doc/theses/thierry_delisle_PhD/comp_II/comp_II_too_big.tex	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
@@ -0,0 +1,316 @@
+\documentclass[11pt,fullpage]{article}
+\usepackage[T1]{fontenc}
+\usepackage[utf8]{inputenc}
+\usepackage{listings}		% for code listings
+\usepackage{xspace}
+\usepackage{xcolor}
+\usepackage{graphicx}
+\usepackage[hidelinks]{hyperref}
+\usepackage{glossaries}
+\usepackage{textcomp}
+\usepackage{geometry}
+
+% cfa macros used in the document
+\input{common}
+\input{glossary}
+
+\CFAStyle				% use default CFA format-style
+
+\title{
+	\Huge \vspace*{1in} The \CFA Scheduler\\
+	\huge \vspace*{0.25in} PhD Comprehensive II Research Proposal
+	\vspace*{1in}
+}
+
+\author{
+	\huge Thierry Delisle \\
+	\Large \vspace*{0.1in} \texttt{tdelisle@uwaterloo.ca} \\
+	\Large Cheriton School of Computer Science \\
+	\Large University of Waterloo
+}
+
+\date{
+	\today
+}
+
+\begin{document}
+\maketitle
+\cleardoublepage
+
+\newcommand{\cit}{\textsuperscript{[Citation Needed]}\xspace}
+\newcommand{\TODO}{\newline{\large\bf\color{red} TODO :}\xspace}
+
+% ===============================================================================
+% ===============================================================================
+
+\tableofcontents
+
+% ===============================================================================
+% ===============================================================================
+\section{Introduction}
+\subsection{\CFA and the \CFA concurrency package}
+\CFA\cit is a modern, polymorphic, non-object-oriented, backwards-compatible extension of the C programming language. It aims to add high-productivity features while maintaining the predictable performance of C. Concurrent code in \CFA\cit is written in the synchronous programming paradigm but uses \glspl{uthrd} in order to achieve the simplicity and maintainability of synchronous programming without sacrificing the efficiency of asynchronous programming. As such, the \CFA scheduler is a user-level scheduler that maps \glspl{uthrd} onto \glspl{kthrd}.
+
+\subsection{Scheduling for \CFA}
+While the \CFA concurrency package doesn't have any particular scheduling needs beyond those of any concurrency package which uses \glspl{uthrd}, it is important that the default \CFA Scheduler be viable in general. Indeed, since the \CFA Scheduler does not target any specific workloads, it is unrealistic to demand that it use the best scheduling strategy in all cases. However, it should offer a viable ``out of the box'' solution for most scheduling problems so that programmers can quickly write performant concurrent code without needing to think about which scheduling strategy is more appropriate for their workload. Indeed, only programmers with exceptionally high performance requirements should need to write their own scheduler.
+
+As detailed in Section~\ref{metrics}, schedulers can be evaluated according to multiple metrics. It is therefore important to set objective goals for scheduling according to a high-level direction. As such, the design goal for the scheduling strategy can be phrased as follows:
+\begin{quote}
+The \CFA scheduling strategy should be \emph{viable} for any workload.
+\end{quote}
+
+% ===============================================================================
+% ===============================================================================
+\section{Scheduling terms}
+Before going into details about scheduling, it is important to define the terms used in this document, especially since many of these terms are often overloaded and may have subtly different meanings. All scheduling terms used are defined in the Glossary but the following terms are worth explaining now. \TODO fix the glossary ref
+
+\paragraph{\Gls{proc}:} Designates the abstract worker on which the work will be scheduled. The nature of the \gls{proc} can vary depending on the level at which the mapping is done. For example, OS scheduling maps \glspl{kthrd} onto \glspl{hthrd} and therefore in this context the \gls{proc} refers to the \gls{hthrd}. However, in the context of user space scheduling, the scheduler maps \glspl{uthrd}, \glspl{fiber} or \glspl{job} onto \glspl{kthrd}, at this level the \gls{kthrd} becomes the \gls{proc}.
+
+\paragraph{\Gls{at}:} This document uses the term \gls{at} to designate the units of work that are to be scheduled. Like \glspl{proc}, the nature of a \gls{at} varies depending on the level at which the scheduler operates. The \glspl{at} in OS schedulers are \glspl{kthrd}, while the \glspl{at} in user-space scheduling can be \glspl{uthrd}, \glspl{fiber} or \glspl{job}. The term \gls{at} was chosen specifically to avoid collisions with specific types of \gls{at}, \eg thread, \glspl{uthrd} and \glspl{fiber}.
+
+\paragraph{\Gls{Q}:} A \gls{Q} is a list of \glspl{at} that have been \glslink{at}{scheduled} but have yet to be \glslink{atrun}{run}. The number and nature of the \gls{Q} is scheduler specific and is generally a significant portion of the scheduler design.
+
+\paragraph{\Gls{atsched}:} Designates the act of signalling to the scheduler that some \gls{at} is ready to be executed and should eventually be assigned to a \gls{proc}.
+
+\paragraph{\Gls{atrun}:} Designates the act of actually running a \gls{at} that was scheduled. That is mapping the \gls{at} onto a specific \gls{proc} and having the \gls{proc} execute its code.
+
+\paragraph{\Gls{atblock}:} A blocked \gls{at} is an \gls{at} that exists outside of any \gls{Q}. Unless some external entity unblocks it (by \glslink{atsched}{scheduling} it), it will never run and will stay in this suspended state. Not all schedulers support blocking. \eg \gls{job} schedulers do not, a new \gls{job} must be created if the current one would need to wait for an external event.
+
+\paragraph{\Gls{atmig}:} A \gls{at} is generally considered to have migrated when it is \glslink{atrun}{run} from a different \gls{proc} than the previous time it \glslink{atrun}{ran}. This concept is relevant because migration can often come at a performance cost which is mostly due to caching. However, some amount of migration is generally required to achieve load balancing. In the context of \glspl{job}, since \glspl{job} only run once, migration is defined as being \glslink{atrun}{run} from a different \gls{proc} than the one it was created on.
+
+\paragraph{\Gls{atpass}:} A \gls{at} has overtaken another \gls{at} when it was \glslink{atsched}{scheduled} later but \glslink{atrun}{run} before the other \gls{at}. Overtaking can have performance benefits but has obvious fairness concerns.
+
+% =================================================================================================
+% =================================================================================================
+\section{Previous Work}
+
+\subsection{Publications}
+\subsubsection{Work Stealing}
+A popular scheduling algorithm is \emph{Work Stealing}, which was used by \cite{burton1981executing, halstead1984implementation} to divide work among threads to be done in parallel. This specific flavor of work stealing maps units of work to be executed once\footnote{This single execution is what fundamentally distinguishes \emph{Jobs} from \emph{Threads}.} onto kernel-threads.
+
+There are countless examples of work-stealing variations \cit, all of which share the same basic idea: each worker has its own set of work units, referred to as a work-queue in this document, and once it runs out, it ``steals'' from another worker. Many variations of the algorithm exist but they generally have the following characteristics:
+
+\begin{itemize}
+	\item There exists a one-to-one mapping between \glspl{proc} and \glspl{Q}, \eg every thread has exactly one queue of jobs to execute.
+	\item Once added to a \gls{Q}, \glspl{at} can \glslink{atmig}{migrate} to another \gls{proc} only as a result of the \gls{proc}'s \gls{Q} being empty.
+\end{itemize}
+
+Where \glspl{at} are initially enqueued and what to do with unblocking \glspl{at} (which must be requeued) is generally where the variations occur in the algorithms.
+
+Distributing the \glspl{at} improves fairness\cit, while enqueuing new and recurring \glspl{at} on the same work-queue as their creator and previous \glspl{atrun}, respectively, improves locality\cit.
+
+
+It is worth pointing out that while both strategies can be very effective for certain workloads\cit
+
+\subsubsection{Other-Schedulers}
+
+\subsubsection{Theoretical Bounds}
+
+
+\subsection{In practice}
+In practice many implementations seem to have converged towards schedulers which use a one-to-one mapping between \glspl{proc} and \glspl{Q}, often with one or several lower-priority shared \glspl{Q}. This is generally perceived as an improvement over schedulers with a single \gls{Q} which can lead to contention problems. As mentioned further into this document, this is due to the implicit assumption that the \gls{Q} data structure cannot scale to many cores.
+
+Schedulers with multiple \glspl{Q} generally achieve better throughput but can also introduce fairness related problems. The underlying problem is that these \glspl{Q} lead to priorities based on placement rather than per \gls{at}.
+
+\paragraph{Indefinite Starvation}
+A scheduler can starve a \gls{at} indefinitely. In practice, schedulers without support for priorities can starve \glspl{at} if they do not have preemption and can fall in a stable state where a \gls{at} never \glslink{atblock}{blocks} and other \glspl{at} can get stuck behind it. If these \glspl{at} are never stolen (taken from a \gls{proc} which isn't the one mapped to that \gls{Q}) then they will be indefinitely starved.
+
+\paragraph{Poor load-balancing}
+If the scheduler reaches a stable state where per-\gls{proc} \glspl{Q} have significantly different average lengths (and are never-empty), this can lead to significant unfairness. Indeed, if load-balancing only occurs when a \gls{proc} runs out of work, any stable state where no \gls{proc} runs out of work means that there is no longer any load-balancing while in that state.
+
+\paragraph{Aggressive ``Nice'' \glspl{at}}
+Certain workloads can have ``Nice'' \glspl{at}, that is \glspl{at} which run in the background. These \glspl{at} generally have a low amount of work and can run when the system isn't too busy. In systems without priorities, there are several techniques to implement background \glspl{at}, but not every technique may work with every scheduler.
+
+One approach is for the background task to yield every time it makes some small progress. If the yielding \gls{at} is put on a higher-priority \gls{Q} than work-stealing, this can cause unfairness and can even cause the background task to fully utilize a \gls{hthrd}.
+
+A similar approach is to sleep for a short duration instead. However, depending on the details of the sleep, this can also cause problems. It is more likely that sleeping will cause a \gls{proc} to steal because the \gls{at} probably transitions out of the ready state. However, not all systems support fine-grain sleeping. If the sleeping is too coarse, relying on sleep can cause latency issues.
+
+Finally, this can be solved with explicit synchronization, however not all background tasks can be implemented this way since they don't necessarily have to wait for any resource.
+
+\subsubsection*{Linux's CFS}
+\subsubsection*{FreeBSD scheduler}
+\subsubsection*{Windows OS Scheduler}
+Windows seems to use priority-based scheduling, where the priorities can vary dynamically within some limit. Load-balancing across processors is handled using a work-sharing approach and has different rules based on whether or not there are idle processors.
+
+\subsubsection*{Windows User-Mode Scheduler}
+Windows seems to have M:N scheduling support but does not provide a default user-mode scheduler.
+
+\subsubsection*{Go}
+Go's scheduler uses a work-stealing algorithm without preemption that has a global runqueue (\emph{GRQ}) and each processor (\emph{P}) has a fixed-size runqueue (\emph{LRQ}).
+
+The algorithm is as follows :
+\begin{enumerate}
+	\item One time out of 61, directly pick 1 element from the \emph{GRQ}.
+	\item If there is a local next pick it.
+	\item Else pick an item from the \emph{LRQ}.
+	\item If it was empty steal (len(\emph{GRQ}) / \#of\emph{P}) + 1 items (max 256) from the \emph{GRQ}.
+	\item If it was empty steal \emph{half} the \emph{LRQ} of an other \emph{P}.
+\end{enumerate}
+
+Ignoring concerns of power consumption, this structure can lead to odd behaviour. Obviously, the fact that it is non-preemptable means that given a set of goroutines where $N$ of these never block (\eg CPU-bound producers) running on $P$ Processors (\emph{P}'s using the Go naming), if $N \geq P$ then not all goroutines are guaranteed to ever make progress. However, this can also be the case even if $N < P$. Indeed, if $P - N$ processors can find a sustained amount of work without needing to steal, then indefinite starvation can still occur.
+
+Excluding cases due to the lack of preemption still leads to odd behavior. The fact that the LRQs have a fixed size can also affect the fairness of scheduling in drastic ways. The separation between \emph{LRQ} and \emph{GRQ} can lead to significant unfairness both in homogeneous workloads and more heterogeneous workloads.
+
+\subsubsection*{Erlang}
+Erlang uses a work-stealing scheduler with the addition that ``underloaded schedulers will also steal jobs from heavily overloaded schedulers in their migration paths''. Depending on the specifics of this heuristic
+
+\subsubsection*{Haskell}
+
+\subsubsection*{D}
+D does not appear to have an M:N threading model.
+\subsubsection*{Intel\textregistered ~Threading Building Blocks}
+\url{https://software.intel.com/en-us/node/506295}
+Intel's concurrency and parallelism library uses a more complicated scheduler which has multiple \glspl{Q} with various priorities. However, these \glspl{Q} have a strict priority ordering meaning it is subject to tasks indefinitely starving lower priority tasks if programmers are not careful.
+\TODO test it
+
+\subsubsection*{Quasar}
+
+\subsubsection*{Grand Central Dispatch}
+
+\subsubsection*{LibFiber}
+
+\subsubsection*{Fiber Tasking Lib}
+Advertised at GDC and also uses work-stealing.
+
+
+\subsection{Scheduling Needs}
+Things to look at
+
+libuv : polling loop used for async I/O
+
+julia : uses libuv and has multi-threading
+
+
+
+Direction to look at :
+Static web-servers : mostly-I/O bound
+	- single or few thread, event driven
+	- thread per connections
+
+	examples:
+		- memcached
+		- apache
+		- nodejs stuff
+
+
+HPC workloads : compute bound
+
+
+% ===============================================================================
+% ===============================================================================
+
+\section{Overview of Scheduling for \CFA}
+
+\subsection{Scheduling : Core goals}
+
+\subsection{Requirements of the Scheduler Context}
+
+\subsection{Integration with I/O}
+
+\subsection{Blocking in Style}
+
+
+% ===============================================================================
+% ===============================================================================
+\section{Metrics of Scheduling} \label{metrics}
+Before starting to look into the design of the best scheduler for \CFA, it is important to take some time to detail what metrics to pick for scheduling.
+
+Here are a few metrics that should be considered.
+
+\paragraph{Throughput} is a fairly straightforward metric, it can be measured either in terms of how much time was spent to \glslink{atcomplet}{run to completion} a fixed number of \glspl{at} or how many \glspl{at} were \glslink{atrun}{run} in a fixed amount of time.
+
+These two definitions are virtually interchangeable. However, since scheduling can affect application performance getting valid empirical measures for throughput can be difficult.
+
+\paragraph{Latency} measures how long an individual \gls{at} waited between when it was \glslink{atsched}{scheduled} and when it was \glslink{atrun}{run}.
+
+\paragraph{Fairness}
+
+\paragraph{Ressource Utilization}
+
+\paragraph{Application Performance}
+
+\section{The Core : Multi-Lane Scheduling}
+\subsection{Objectives}
+While work-stealing works well for both trivial cases, \ie all new \glspl{at} distributed evenly or all on a single work-queue, it handles in-between cases poorly. As mentioned above, the goal is therefore to create a scheduler that is \emph{viable} for any workload. More concretely, this means a scheduler that has good scalability and offers a guarantee of eventual progress. For the purpose of this document, eventual progress is defined as follows:
+
+\begin{itemize}
+	\item Any \Gls{at} that is \glslink{atsched}{scheduled} should eventually \glslink{atrun}{run}, regardless\footnote{In the context of guaranteeing eventual progress, we consider only normal program execution. Depending on the chosen semantics, normal system shutdown can also prevent \glspl{at} from eventually running without considering the guarantee violated.} of any other \glspl{at} being scheduled (prior or otherwise).
+\end{itemize}
+
+Eventual progress is not guaranteed by work-stealing or work-sharing schedulers in every context. Indeed, as mentioned in \cit, when the system is \glslink{load}{loaded} neither work-stealing nor work-sharing guarantee eventual progress. While these aberrant cases can be fixed with \gls{preemption}, they still show a fundamental fairness problem in the algorithm. We can offer a stricter guarantee of eventual progress by limiting the amount of \gls{at} \emph{\glslink{atpass}{overtaking}}. Indeed, cases where eventual progress is lost are cases that show \emph{unbounded} \glslink{atpass}{overtaking} and as such, a scheduler that limits \glslink{atpass}{overtaking} in general guarantees eventual progress.
+
+We can then define the first concrete goal of the \CFA scheduler as :
+\begin{itemize}
+	\item The odds that a \gls{at} is overtaken by $N$ other \glspl{at} decreases rapidly when $N$ increases.
+\end{itemize}
+
+As a second, more fuzzy, objective, the Cforall scheduler should also perform no worse than most existing schedulers for any workload.
+
+\subsection{Ideal description}
+The ideal scheduler should behave similarly to a multi-lane highway. If all lanes are equally fast, lane changes are to be avoided because they induce traffic jams. However, if some lanes are faster than others, then lane changes will help balance the traffic.
+
+Similarly, a task should migrate in two cases:
+\begin{itemize}
+	\item When a worker just emptied its work-queue. (like work-stealing)
+	\item \emph{When a work-unit was overtaken too many times by work units in a different lane}.
+\end{itemize}
+
+
+\subsection{Practical terms}
+In practice, the highway lane analogy has to be adjusted slightly to make it practical. First, the \glspl{at} should always respect the order within a given lane. This means only the task at the front can migrate. This both enforces a stronger FIFO order and means that the scheduler can ignore all \glspl{at} that aren't at the front, simplifying processing. Furthermore, migrating \glspl{at} is only useful when at least one worker is available, \ie looking for a task to run, because any migration made at any other time will only take effect at that moment.
+
+\subsection{Existing Work}
+
+\subsection{Cforall Today}
+
+
+
+% ===============================================================================
+% ===============================================================================
+\section{The Context : Scheduling in \CFA}
+\subsection{Dynamic Clusters}
+
+\subsection{Idle Sleep}
+
+\subsection{Locality}
+
+% ===============================================================================
+% ===============================================================================
+\section{Asynchronous IO}
+\subsection{Cooperation with the OS}
+
+\subsection{Framework Integration}
+
+% ===============================================================================
+% ===============================================================================
+\section{Blocking in Style}
+\subsection{Monitors and Baton-Passing}
+
+\subsection{Futures and Promises}
+
+
+% ===============================================================================
+% ===============================================================================
+\section{Current Work}
+
+\section{Conclusion}
+
+
+\cleardoublepage
+
+% B I B L I O G R A P H Y
+% -----------------------------
+\addcontentsline{toc}{chapter}{Bibliography}
+\bibliographystyle{plain}
+\bibliography{pl,local}
+\cleardoublepage
+\phantomsection		% allows hyperref to link to the correct page
+
+% G L O S S A R Y
+% -----------------------------
+\addcontentsline{toc}{chapter}{Glossary}
+\printglossary
+\cleardoublepage
+\phantomsection		% allows hyperref to link to the correct page
+
+\end{document}
Index: doc/theses/thierry_delisle_PhD/comp_II/glossary.tex
===================================================================
--- doc/theses/thierry_delisle_PhD/comp_II/glossary.tex	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
+++ doc/theses/thierry_delisle_PhD/comp_II/glossary.tex	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
@@ -0,0 +1,137 @@
+\makeglossaries
+
+\longnewglossaryentry{hthrd}
+{name={hardware thread}}
+{
+Threads representing the underlying hardware directly.
+
+\textit{Synonyms : Processors, Cores.}
+}
+
+\longnewglossaryentry{uthrd}
+{name={user-level thread}}
+{
+Threads created and managed inside user-space. Each thread has its own stack and its own thread of execution. User-level threads are invisible to the underlying operating system.
+
+\textit{Synonyms : User threads, Lightweight threads, Green threads, Virtual threads, Tasks.}
+}
+
+\longnewglossaryentry{kthrd}
+{name={kernel-level thread}}
+{
+Threads created and managed inside kernel-space. Each thread has its own stack and its own thread of execution. Kernel-level threads are owned, managed and scheduled by the underlying operating system.
+
+\textit{Synonyms : OS threads, Hardware threads, Physical threads.}
+}
+
+\longnewglossaryentry{fiber}
+{name={fiber}}
+{
+Fibers are non-preemptive user-level threads. They share most of the characteristics of user-level threads except that they cannot be preempted by another fiber.
+
+\textit{Synonyms : Tasks.}
+}
+
+\longnewglossaryentry{job}
+{name={job}}
+{
+Unit of work, often sent to a thread pool or worker pool to be executed. Has neither its own stack nor its own thread of execution.
+
+\textit{Synonyms : Tasks.}
+}
+
+\longnewglossaryentry{pool}
+{name={thread-pool}}
+{
+Group of homogeneous threads that loop executing units of work one after another.
+
+\textit{Synonyms : }
+}
+
+\longnewglossaryentry{preemption}
+{name={preemption}}
+{
+Involuntary context switch imposed on threads at a given rate.
+
+\textit{Synonyms : None.}
+}
+
+\longnewglossaryentry{proc}
+{name={virtual processor}}
+{
+
+}
+
+\longnewglossaryentry{Q}
+{name={work-queue}}
+{
+
+}
+
+\longnewglossaryentry{at}
+{name={fred}}
+{
+Abstract object representing a unit of work. Systems will offer one or more concrete implementations of this concept (\eg \gls{kthrd}, \gls{job}), however, most of the concepts of scheduling are independent of the particular implementations of the work representation. For this reason, this document uses the term \Gls{at} to mean any representation and not one in particular.
+}
+
+\longnewglossaryentry{atsched}
+{name={Scheduling a \gls{at}}}
+{
+Scheduling an \gls{at} refers to the act of notifying the scheduler that a task is ready to be run. When representing the scheduler as a queue of tasks, scheduling is the act of pushing a task onto the end of the queue. This does not necessarily mean the task will ever be allocated CPU time (\gls{atrun}), for example, if the system terminates abruptly, scheduled \glspl{at} will probably never run.
+
+\textit{Synonyms : None.}
+}
+
+\longnewglossaryentry{atrun}
+{name={Running a \gls{at}}}
+{
+Running an \gls{at} refers to the act of allocating CPU time to a task that is ready to run. When representing the scheduler as a queue of tasks, running is the act of popping a task from the front of the queue and putting it onto a \gls{proc}. The \gls{at} can then accomplish some or all of the work it is programmed to do.
+
+\textit{Synonyms : None.}
+}
+
+\longnewglossaryentry{atmig}
+{name={migration of \gls{at}}}
+{
+Migration refers to the idea of an \gls{at} running on a different worker/processor than the last time it was run. It is generally preferable to minimise migration as it incurs cost but any load balancing among workers requires some amount of migration.
+
+\textit{Synonyms : None.}
+}
+
+\longnewglossaryentry{atpass}
+{name={overtaking \gls{at}}}
+{
+When representing the scheduler as a queue of \glspl{at}, overtaking is the act of breaking the FIFO-ness of the queue by moving a \gls{at} in front of some other \gls{at} when it arrived after. This remains true for schedulers that do not use a FIFO queue, where the \glspl{at} are \glslink{atsched}{scheduled} and \glslink{atrun}{run} in a different order. A \gls{at} is said to \emph{overtake} another if it is run \emph{before} but was \emph{scheduled} after the other \gls{at}.
+
+\textit{Synonyms : None.}
+}
+
+\longnewglossaryentry{atblock}
+{name={Blocking an \gls{at}}}
+{
+Blocking an abstract task refers to the act of taking a task that is running on a CPU off the CPU. Unless no other task is ready, this action is generally immediately followed by running another task.
+
+\textit{Synonyms : None.}
+}
+
+\longnewglossaryentry{atcomplet}
+{name={Running to completion}}
+{
+Running to completion refers to the entire sequence of : being scheduled, running and blocking, for a given task.
+
+See also \gls{atsched}, \gls{atrun}, \gls{atblock}
+
+\textit{Synonyms : None.}
+}
+
+\longnewglossaryentry{load}
+{name={System Load}}
+{
+The load refers to the rate at which \glspl{at} are \glslink{atsched}{scheduled} versus the rate at which they are \glslink{atrun}{run}. When \glspl{at} are being scheduled faster than they are run, the system is considered \emph{overloaded}. When \glspl{at} are being run faster than they are scheduled, the system is considered \emph{underloaded}. Correspondingly, if both rates are equal, the system is considered \emph{loaded}. Note that the system is considered loaded only if the rate at which \glspl{at} are scheduled/run is non-zero, otherwise the system is empty, it has no load.
+}
+
+
+\newacronym{tls}{TLS}{Thread Local Storage}
+\newacronym{api}{API}{Application Program Interface}
+\newacronym{raii}{RAII}{Resource Acquisition Is Initialization}
+\newacronym{numa}{NUMA}{Non-Uniform Memory Access}
Index: doc/theses/thierry_delisle_PhD/comp_II/local.bib
===================================================================
--- doc/theses/thierry_delisle_PhD/comp_II/local.bib	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
+++ doc/theses/thierry_delisle_PhD/comp_II/local.bib	(revision 3966d9a34af5c01e9d3ce15fa58f8f8a2875251b)
@@ -0,0 +1,222 @@
+
+% Survey starting point: "Scheduling Multithreaded Computations by Work Stealing" (blumofe1999scheduling, below)
+
+% ===============================================================================
+% Work stealing examples
+% ===============================================================================
+
+% Cilk [4],
+% PLDI '98 paper, published in ACM SIGPLAN Notices 33(5).
+@article{frigo1998implementation,
+  title     = {The Implementation of the {Cilk-5} Multithreaded Language},
+  author    = {Frigo, Matteo and Leiserson, Charles E. and Randall, Keith H.},
+  journal   = {ACM SIGPLAN Notices},
+  volume    = {33},
+  number    = {5},
+  pages     = {212--223},
+  year      = {1998},
+  publisher = {ACM}
+}
+
+% Cilk-NOW
+% The paper has exactly two authors; the scraped "and others" was spurious.
+@inproceedings{blumofe1997adaptive,
+  title        = {Adaptive and Reliable Parallel Computing on Networks of Workstations},
+  author       = {Blumofe, Robert D. and Lisiecki, Philip A.},
+  booktitle    = {{USENIX} 1997 Annual Technical Conference on {UNIX} and Advanced Computing Systems},
+  pages        = {133--147},
+  year         = {1997},
+  organization = {USENIX Association}
+}
+
+% Intel Threading Building Blocks (TBB)
+% Documentation, not a journal article: use @manual with a corporate author
+% (double braces keep "Intel Corporation" as one indivisible name) and a
+% proper url field instead of a URL stuffed into `journal`.
+@manual{intelTBB,
+  title        = {Intel Threading Building Blocks Documentation},
+  organization = {{Intel Corporation}},
+  month        = sep,
+  year         = {2013},
+  url          = {https://www.threadingbuildingblocks.org/docs/help/index.htm}
+}
+
+% Java's Fork-join Framework
+% Brace {Java} so sentence-casing styles do not lowercase it.
+@inproceedings{lea2000java,
+  title        = {A {Java} Fork/Join Framework},
+  author       = {Lea, Doug},
+  booktitle    = {Proceedings of the {ACM} 2000 Conference on {Java} Grande},
+  pages        = {36--43},
+  year         = {2000},
+  organization = {ACM}
+}
+
+% Microsoft Task Parallel Library
+% OOPSLA '09 paper, published in ACM SIGPLAN Notices 44(10);
+% journal name recased from the scraped "Acm Sigplan Notices".
+@article{leijen2009design,
+  title     = {The Design of a Task Parallel Library},
+  author    = {Leijen, Daan and Schulte, Wolfram and Burckhardt, Sebastian},
+  journal   = {ACM SIGPLAN Notices},
+  volume    = {44},
+  number    = {10},
+  pages     = {227--242},
+  year      = {2009},
+  publisher = {ACM}
+}
+
+% StackThreads/MP
+% Was @book with orphan volume/number fields (standard book styles ignore
+% `number`); this is a PPoPP '99 paper published as ACM SIGPLAN Notices 34(8),
+% so @article is the correct type. Pages not recorded here -- TODO confirm.
+@article{taura1999stackthreads,
+  title     = {{StackThreads/MP}: Integrating Futures into Calling Standards},
+  author    = {Taura, Kenjiro and Tabata, Kunio and Yonezawa, Akinori},
+  journal   = {ACM SIGPLAN Notices},
+  volume    = {34},
+  number    = {8},
+  year      = {1999},
+  publisher = {ACM}
+}
+
+% Dropped scraper junk `publisher={Citeseer}` and expanded initials-only
+% author names (Peter Mysliwietz, Burkhard Monien).
+@article{feldmann1993game,
+  title   = {Game Tree Search on Massively Parallel Systems},
+  author  = {Feldmann, Rainer and Mysliwietz, Peter and Monien, Burkhard},
+  journal = {Advances in Computer Chess},
+  volume  = {7},
+  year    = {1993}
+}
+
+% Replaced the Unicode em-dash (unsafe in classic 8-bit BibTeX) with ---,
+% braced the {DIB} acronym, and dropped the "(JACM)"-style acronym suffix
+% from the journal name.
+@article{finkel1987dib,
+  title     = {{DIB}---A Distributed Implementation of Backtracking},
+  author    = {Finkel, Raphael and Manber, Udi},
+  journal   = {ACM Transactions on Programming Languages and Systems},
+  volume    = {9},
+  number    = {2},
+  pages     = {235--256},
+  year      = {1987},
+  publisher = {ACM}
+}
+
+% Brace {MIMD} so sentence-casing styles do not lowercase it.
+@phdthesis{halbherr1994mimd,
+  title  = {{MIMD}-Style Parallel Programming Based on Continuation-Passing Threads},
+  author = {Halbherr, Michael Roland Sven},
+  year   = {1994},
+  school = {ETH Zurich}
+}
+
+% Brace {MIMD} so sentence-casing styles do not lowercase it.
+@phdthesis{kuszmaul1994synchronized,
+  title  = {Synchronized {MIMD} Computing},
+  author = {Kuszmaul, Bradley Clair},
+  year   = {1994},
+  school = {Massachusetts Institute of Technology}
+}
+
+% Journal name recased ("transactions" -> "Transactions"); Halstead's "Jr"
+% suffix entered in the unambiguous "Last, Jr., First" form.
+@article{mohr1991lazy,
+  title     = {Lazy Task Creation: A Technique for Increasing the Granularity of Parallel Programs},
+  author    = {Mohr, Eric and Kranz, David A. and Halstead, Jr., Robert H.},
+  journal   = {IEEE Transactions on Parallel and Distributed Systems},
+  volume    = {2},
+  number    = {3},
+  pages     = {264--280},
+  year      = {1991},
+  publisher = {IEEE}
+}
+
+% Brace {WorkCrews} to protect the camel-case name from sentence casing.
+@article{vandevoorde1988workcrews,
+  title     = {{WorkCrews}: An Abstraction for Controlling Parallelism},
+  author    = {Vandevoorde, Mark T. and Roberts, Eric S.},
+  journal   = {International Journal of Parallel Programming},
+  volume    = {17},
+  number    = {4},
+  pages     = {347--366},
+  year      = {1988},
+  publisher = {Springer}
+}
+
+
+% ===============================================================================
+% Work stealing foundations (copy-paste header fixed: the section above has the examples)
+% ===============================================================================
+
+%----------
+% discussion about best case and worst case for full computation
+% cited by 1720
+% Dropped the redundant "(JACM)" acronym the scraper appended to the journal name.
+@article{blumofe1999scheduling,
+  title     = {Scheduling Multithreaded Computations by Work Stealing},
+  author    = {Blumofe, Robert D. and Leiserson, Charles E.},
+  journal   = {Journal of the ACM},
+  volume    = {46},
+  number    = {5},
+  pages     = {720--748},
+  year      = {1999},
+  publisher = {ACM}
+}
+
+%----------
+% executing process trees in parallel
+% mostly used with lazy evaluation and dependency graphs
+% cited by 267
+% cited by blumofe1999scheduling as first 'work-stealing idea'
+% Added periods to the authors' initials (F. Warren Burton, M. Ronan Sleep).
+@inproceedings{burton1981executing,
+  title        = {Executing Functional Programs on a Virtual Tree of Processors},
+  author       = {Burton, F. Warren and Sleep, M. Ronan},
+  booktitle    = {Proceedings of the 1981 Conference on Functional Programming Languages and Computer Architecture},
+  pages        = {187--194},
+  year         = {1981},
+  organization = {ACM}
+}
+
+
+%----------
+% work-stealing used for multilisp.
+% "unfair scheduling policy"
+% cited by 285
+% cited by blumofe1999scheduling as first 'work-stealing idea' implementation
+% Braced {Multilisp}/{Lisp}/{LISP} proper nouns; suffix entered as "Last, Jr., First".
+@inproceedings{halstead1984implementation,
+  title        = {Implementation of {Multilisp}: {Lisp} on a Multiprocessor},
+  author       = {Halstead, Jr., Robert H.},
+  booktitle    = {Proceedings of the 1984 {ACM} Symposium on {LISP} and Functional Programming},
+  pages        = {9--17},
+  year         = {1984},
+  organization = {ACM}
+}
+
+
+% ===============================================================================
+% HPC schedulers with bounds
+% ===============================================================================
+
+%----------
+% discussion about bounds
+% cited by blumofe1999scheduling
+% mentioned as not practical
+% CAN'T find it
+% Added the missing volume: IEEE TSE 22(10), 1996 -- TODO confirm against the DL.
+@article{burton1996guaranteeing,
+  title     = {Guaranteeing Good Memory Bounds for Parallel Programs},
+  author    = {Burton, F. Warren},
+  journal   = {IEEE Transactions on Software Engineering},
+  volume    = {22},
+  number    = {10},
+  pages     = {762--773},
+  year      = {1996},
+  publisher = {IEEE}
+}
+
+%----------
+% discussion about bounds
+% cited by blumofe1999scheduling
+% mentioned as not practical
+% - from abstract
+% talks about implicit concurrency
+% does not work online or for something other than HPC
+% SPAA '95.
+@inproceedings{blelloch1995provably,
+  author       = {Blelloch, Guy E and Gibbons, Phillip B and Matias, Yossi},
+  title        = {Provably efficient scheduling for languages with fine-grained parallelism},
+  booktitle    = {Proceedings of the seventh annual ACM symposium on Parallel algorithms and architectures},
+  year         = {1995},
+  pages        = {1--12},
+  organization = {ACM}
+}
+
+%----------
+% discussion about bounds
+% cited by blumofe1999scheduling
+% mentioned as not practical
+% - from abstract
+% talks about bound on total time and space
+% no concept of fairness or latency among tasks
+% SPAA '97.
+@inproceedings{blelloch1997space,
+  author       = {Blelloch, Guy E and Gibbons, Phillip B and Matias, Yossi and Narlikar, Girija J},
+  title        = {Space-efficient scheduling of parallelism with synchronization variables},
+  booktitle    = {Proceedings of the ninth annual ACM symposium on Parallel algorithms and architectures},
+  year         = {1997},
+  pages        = {12--23},
+  organization = {ACM}
+}
