Index: doc/proposals/concurrency/.gitignore
===================================================================
--- doc/proposals/concurrency/.gitignore	(revision fb31cb899c2ed0144f33897963468ed96bf3edf3)
+++ doc/proposals/concurrency/.gitignore	(revision 64b272aedd6db65f570acb3d386065b1438f00f8)
@@ -16,4 +16,6 @@
 build/*.out
 build/*.ps
+build/*.pstex
+build/*.pstex_t
 build/*.tex
 build/*.toc
Index: doc/proposals/concurrency/Makefile
===================================================================
--- doc/proposals/concurrency/Makefile	(revision fb31cb899c2ed0144f33897963468ed96bf3edf3)
+++ doc/proposals/concurrency/Makefile	(revision 64b272aedd6db65f570acb3d386065b1438f00f8)
@@ -13,9 +13,10 @@
 annex/glossary \
 text/intro \
+text/basics \
 text/cforall \
-text/basics \
 text/concurrency \
 text/internals \
 text/parallelism \
+text/results \
 text/together \
 text/future \
@@ -29,6 +30,7 @@
 }}
 
-PICTURES = ${addsuffix .pstex, \
-}
+PICTURES = ${addprefix build/, ${addsuffix .pstex, \
+	system \
+}}
 
 PROGRAMS = ${addsuffix .tex, \
@@ -67,4 +69,5 @@
 	build/*.out     \
 	build/*.ps      \
+	build/*.pstex   \
 	build/*.pstex_t \
 	build/*.tex     \
Index: doc/proposals/concurrency/figures/system.fig
===================================================================
--- doc/proposals/concurrency/figures/system.fig	(revision 64b272aedd6db65f570acb3d386065b1438f00f8)
+++ doc/proposals/concurrency/figures/system.fig	(revision 64b272aedd6db65f570acb3d386065b1438f00f8)
@@ -0,0 +1,164 @@
+#FIG 3.2  Produced by xfig version 3.2.5c
+Landscape
+Center
+Inches
+Letter  
+100.00
+Single
+-2
+1200 2
+6 5175 2700 6150 3737
+3 2 0 4 0 7 49 -1 -1 0.000 1 0 0 10
+	 5475 2702 5625 2777 5325 2852 5625 2927 5325 3002 5625 3077
+	 5325 3152 5625 3227 5325 3302 5475 3377
+	 0.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000
+	 -1.000 0.000
+4 0 0 50 -1 0 11 0.0000 2 120 885 5175 3737 Processor N\001
+4 0 0 50 -1 0 11 0.0000 2 120 975 5175 3527 PThread N+2\001
+-6
+6 3300 2700 4140 3737
+3 2 0 4 0 7 49 -1 -1 0.000 1 0 0 10
+	 3600 2702 3750 2777 3450 2852 3750 2927 3450 3002 3750 3077
+	 3450 3152 3750 3227 3450 3302 3600 3377
+	 0.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000
+	 -1.000 0.000
+4 0 0 50 -1 0 11 0.0000 2 120 840 3300 3737 Processor 0\001
+4 0 0 50 -1 0 11 0.0000 2 120 735 3300 3527 PThread 2\001
+-6
+6 600 2700 1725 3737
+3 2 0 4 0 7 49 -1 -1 0.000 1 0 0 10
+	 900 2702 1050 2777 750 2852 1050 2927 750 3002 1050 3077
+	 750 3152 1050 3227 750 3302 900 3377
+	 0.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000
+	 -1.000 0.000
+4 0 0 50 -1 0 11 0.0000 2 120 1125 600 3737 Main Processor\001
+4 0 0 50 -1 0 11 0.0000 2 120 735 600 3527 PThread 0\001
+-6
+6 2100 2700 2835 3737
+3 2 0 4 0 7 49 -1 -1 0.000 1 0 0 10
+	 2400 2702 2550 2777 2250 2852 2550 2927 2250 3002 2550 3077
+	 2250 3152 2550 3227 2250 3302 2400 3377
+	 0.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000
+	 -1.000 0.000
+4 0 0 50 -1 0 11 0.0000 2 120 450 2100 3737 Alarm\001
+4 0 0 50 -1 0 11 0.0000 2 120 735 2100 3527 PThread 1\001
+-6
+6 600 6301 1290 7367
+3 2 0 2 0 7 49 -1 -1 0.000 1 0 0 10
+	 900 6302 1050 6377 750 6452 1050 6527 750 6602 1050 6677
+	 750 6752 1050 6827 750 6902 900 6977
+	 0.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000
+	 -1.000 0.000
+4 0 0 50 -1 0 11 0.0000 2 150 690 600 7337 int main()\001
+4 0 0 50 -1 0 11 0.0000 2 120 570 600 7127 thread 0\001
+-6
+6 1635 6300 2205 7336
+3 2 0 2 0 7 49 -1 -1 0.000 1 0 0 10
+	 1935 6301 2085 6376 1785 6451 2085 6526 1785 6601 2085 6676
+	 1785 6751 2085 6826 1785 6901 1935 6976
+	 0.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000
+	 -1.000 0.000
+4 0 0 50 -1 0 11 0.0000 2 120 570 1635 7126 thread 1\001
+-6
+6 2475 6300 3045 7336
+3 2 0 2 0 7 49 -1 -1 0.000 1 0 0 10
+	 2775 6301 2925 6376 2625 6451 2925 6526 2625 6601 2925 6676
+	 2625 6751 2925 6826 2625 6901 2775 6976
+	 0.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000
+	 -1.000 0.000
+4 0 0 50 -1 0 11 0.0000 2 120 570 2475 7126 thread 2\001
+-6
+6 3300 6300 3870 7336
+3 2 0 2 0 7 49 -1 -1 0.000 1 0 0 10
+	 3600 6301 3750 6376 3450 6451 3750 6526 3450 6601 3750 6676
+	 3450 6751 3750 6826 3450 6901 3600 6976
+	 0.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000
+	 -1.000 0.000
+4 0 0 50 -1 0 11 0.0000 2 120 570 3300 7126 thread 3\001
+-6
+6 5325 6300 5970 7336
+3 2 0 2 0 7 49 -1 -1 0.000 1 0 0 10
+	 5625 6301 5775 6376 5475 6451 5775 6526 5475 6601 5775 6676
+	 5475 6751 5775 6826 5475 6901 5625 6976
+	 0.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000
+	 -1.000 0.000
+4 0 0 50 -1 0 11 0.0000 2 120 645 5325 7126 thread M\001
+-6
+6 4125 6300 4695 7336
+3 2 0 2 0 7 49 -1 -1 0.000 1 0 0 10
+	 4425 6301 4575 6376 4275 6451 4575 6526 4275 6601 4575 6676
+	 4275 6751 4575 6826 4275 6901 4425 6976
+	 0.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000
+	 -1.000 0.000
+4 0 0 50 -1 0 11 0.0000 2 120 570 4125 7126 thread 4\001
+-6
+1 2 0 1 0 7 50 -1 -1 0.000 1 3.1416 3150 5250 750 450 2400 4800 3900 5700
+2 1 0 1 0 7 50 -1 -1 0.000 0 1 -1 0 0 2
+	 1200 3900 2475 5025
+2 1 0 1 0 7 50 -1 -1 0.000 0 1 -1 0 0 2
+	 3600 3900 3450 4800
+2 1 0 1 0 7 50 -1 -1 0.000 0 1 -1 0 0 2
+	 5550 3900 3825 5025
+2 2 0 1 0 7 50 -1 -1 0.000 0 1 -1 0 0 5
+	 7125 3525 7575 3525 7575 3975 7125 3975 7125 3525
+2 2 0 1 0 7 50 -1 -1 0.000 0 1 -1 0 0 5
+	 6975 2175 9525 2175 9525 6000 6975 6000 6975 2175
+2 1 0 1 0 7 50 -1 -1 0.000 0 1 -1 0 0 2
+	 900 6225 2400 5400
+2 1 0 1 0 7 50 -1 -1 0.000 0 1 -1 0 0 2
+	 2100 6225 2625 5550
+2 1 0 1 0 7 50 -1 -1 0.000 0 1 -1 0 0 2
+	 2850 6225 3000 5700
+2 1 0 1 0 7 50 -1 -1 0.000 0 1 -1 0 0 2
+	 3600 6225 3375 5700
+2 1 0 1 0 7 50 -1 -1 0.000 0 1 -1 0 0 2
+	 4350 6300 3675 5625
+2 1 0 1 0 7 50 -1 -1 0.000 0 1 -1 0 0 2
+	 5625 6225 3900 5400
+2 2 0 1 0 7 50 -1 -1 0.000 0 1 -1 0 0 5
+	 525 975 1275 975 1275 2625 525 2625 525 975
+2 2 0 1 0 7 50 -1 45 0.000 0 1 -1 0 0 5
+	 3225 975 3975 975 3975 2625 3225 2625 3225 975
+2 2 0 1 0 7 50 -1 45 0.000 0 1 -1 0 0 5
+	 5100 975 5850 975 5850 2625 5100 2625 5100 975
+2 2 0 1 0 7 50 -1 18 0.000 0 1 -1 0 0 5
+	 7125 2325 7575 2325 7575 2775 7125 2775 7125 2325
+2 2 0 1 0 7 50 -1 45 0.000 0 1 -1 0 0 5
+	 7125 2925 7575 2925 7575 3375 7125 3375 7125 2925
+2 2 0 1 0 7 50 -1 45 0.000 0 1 -1 0 0 5
+	 525 7425 1275 7425 1275 9075 525 9075 525 7425
+2 2 0 1 0 7 50 -1 -1 0.000 0 1 -1 0 0 5
+	 1575 7425 2325 7425 2325 9075 1575 9075 1575 7425
+2 2 0 1 0 7 50 -1 -1 0.000 0 1 -1 0 0 5
+	 2400 7425 3150 7425 3150 9075 2400 9075 2400 7425
+2 2 0 1 0 7 50 -1 -1 0.000 0 1 -1 0 0 5
+	 3225 7425 3975 7425 3975 9075 3225 9075 3225 7425
+2 2 0 1 0 7 50 -1 -1 0.000 0 1 -1 0 0 5
+	 4050 7425 4800 7425 4800 9075 4050 9075 4050 7425
+2 2 0 1 0 7 50 -1 -1 0.000 0 1 -1 0 0 5
+	 5250 7425 6000 7425 6000 9075 5250 9075 5250 7425
+2 1 1 8 0 7 50 -1 -1 4.000 0 0 -1 1 0 2
+	1 1 2.00 180.00 75.00
+	 2400 3900 2775 4800
+2 2 0 1 0 7 50 -1 18 0.000 0 1 -1 0 0 5
+	 2025 2625 2775 2625 2775 975 2025 975 2025 2625
+3 2 0 2 0 7 49 -1 -1 0.000 1 0 0 10
+	 7350 5025 7500 5100 7200 5175 7500 5250 7200 5325 7500 5400
+	 7200 5475 7500 5550 7200 5625 7350 5700
+	 0.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000
+	 -1.000 0.000
+3 2 0 4 0 7 49 -1 -1 0.000 1 0 0 10
+	 7350 4125 7500 4200 7200 4275 7500 4350 7200 4425 7500 4500
+	 7200 4575 7500 4650 7200 4725 7350 4800
+	 0.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000 -1.000
+	 -1.000 0.000
+4 0 0 50 -1 0 18 0.0000 2 30 225 4500 3150 ...\001
+4 0 0 50 -1 0 18 0.0000 2 30 225 3750 4500 ...\001
+4 0 0 50 -1 0 11 0.0000 2 120 705 2775 5325 Scheduler\001
+4 0 0 50 -1 0 11 0.0000 2 120 945 7725 2625 Pthread stack\001
+4 0 0 50 -1 0 11 0.0000 2 150 1530 7725 3225 Pthread stack (stolen)\001
+4 0 0 50 -1 0 11 0.0000 2 120 540 7725 4500 Pthread\001
+4 0 0 50 -1 0 11 0.0000 2 150 1065 7725 5400 $\\CFA$ thread\001
+4 0 0 50 -1 0 18 0.0000 2 30 225 4950 6600 ...\001
+4 0 0 50 -1 0 18 0.0000 2 30 225 4200 5850 ...\001
+4 0 0 50 -1 0 11 0.0000 2 150 990 7725 3825 $\\CFA$ stack\001
Index: doc/proposals/concurrency/text/cforall.tex
===================================================================
--- doc/proposals/concurrency/text/cforall.tex	(revision fb31cb899c2ed0144f33897963468ed96bf3edf3)
+++ doc/proposals/concurrency/text/cforall.tex	(revision 64b272aedd6db65f570acb3d386065b1438f00f8)
@@ -1,11 +1,11 @@
 % ======================================================================
 % ======================================================================
-\chapter{Cforall crash course}
+\chapter{Cforall Overview}
 % ======================================================================
 % ======================================================================
 
-This thesis presents the design for a set of concurrency features in \CFA. Since it is a new dialect of C, the following is a quick introduction to the language, specifically tailored to the features needed to support concurrency.
+The following is a quick introduction to the \CFA language, specifically tailored to the features needed to support concurrency.
 
-\CFA is a extension of ISO-C and therefore supports all of the same paradigms as C. It is a non-object oriented system language, meaning most of the major abstractions have either no runtime overhead or can be opt-out easily. Like C, the basics of \CFA revolve around structures and routines, which are thin abstractions over machine code. The vast majority of the code produced by the \CFA translator respects memory-layouts and calling-conventions laid out by C. Interestingly, while \CFA is not an object-oriented language, lacking the concept of a received (e.g.: this), it does have some notion of objects\footnote{C defines the term objects as : [Where to I get the C11 reference manual?]}, most importantly construction and destruction of objects. Most of the following pieces of code can be found on the \CFA website \cite{www-cfa}
+\CFA is an extension of ISO-C and therefore supports all of the same paradigms as C. It is a non-object oriented system language, meaning most of the major abstractions have either no runtime overhead or can be opted out of easily. Like C, the basics of \CFA revolve around structures and routines, which are thin abstractions over machine code. The vast majority of the code produced by the \CFA translator respects memory-layouts and calling-conventions laid out by C. Interestingly, while \CFA is not an object-oriented language, lacking the concept of a receiver (e.g., this), it does have some notion of objects\footnote{C defines the term objects as : [Where do I get the C11 reference manual?]}, most importantly construction and destruction of objects. Most of the following code examples can be found on the \CFA website \cite{www-cfa}.
 
 \section{References}
Index: doc/proposals/concurrency/text/concurrency.tex
===================================================================
--- doc/proposals/concurrency/text/concurrency.tex	(revision fb31cb899c2ed0144f33897963468ed96bf3edf3)
+++ doc/proposals/concurrency/text/concurrency.tex	(revision 64b272aedd6db65f570acb3d386065b1438f00f8)
@@ -221,5 +221,5 @@
 % ======================================================================
 % ======================================================================
-\section{Internal scheduling} \label{insched}
+\section{Internal scheduling} \label{intsched}
 % ======================================================================
 % ======================================================================
@@ -973,2 +973,29 @@
 \label{lst:waitfor2}
 \end{figure}
+
+% ======================================================================
+% ======================================================================
+\subsection{Waiting for the destructor}
+% ======================================================================
+% ======================================================================
+An important exception for the \code{waitfor} statement is destructor semantics. Indeed, the \code{waitfor} statement can accept any \code{mutex} routine, which includes the destructor. However, with the semantics discussed until now, waiting for the destructor does not make any sense since using an object after its destructor is called is undefined behaviour. The simplest approach to fix this hole in the semantics would be disallowing \code{waitfor} on the destructor. However, a more expressive approach is to flip the ordering of execution when waiting for the destructor, meaning that waiting for the destructor allows the destructor to run after the current \code{mutex} routine, similarly to how a condition is signalled.
+\begin{figure}
+\begin{cfacode}
+monitor Executer {};
+struct  Action;
+
+void ^?{}   (Executer & mutex this);
+void execute(Executer & mutex this, const Action & );
+void run    (Executer & mutex this) {
+	while(true) {
+		   waitfor(execute, this);
+		or waitfor(^?{}   , this) {
+			break;
+		}
+	}
+}
+\end{cfacode}
+\caption{Example of an executor which executes actions in series until the destructor is called.}
+\label{lst:dtor-order}
+\end{figure}
+For example, listing \ref{lst:dtor-order} shows an example of an executor with an infinite loop, which waits for the destructor to break out of this loop.
Index: doc/proposals/concurrency/text/future.tex
===================================================================
--- doc/proposals/concurrency/text/future.tex	(revision fb31cb899c2ed0144f33897963468ed96bf3edf3)
+++ doc/proposals/concurrency/text/future.tex	(revision 64b272aedd6db65f570acb3d386065b1438f00f8)
@@ -5,82 +5,107 @@
 % ======================================================================
 
-Concurrency and parallelism is still a very active field that strongly benefits from hardware advances. As such certain features that aren't necessarily mature enough in their current state could become relevant in the lifetime of \CFA.
-\section{Non-Blocking IO}
+\section{Flexible Scheduling} \label{futur:sched}
 
 
-\section{Other concurrency tools}
+\section{Non-Blocking IO} \label{futur:nbio}
+While most of the parallelism tools
+However, many modern workloads are not bound on computation but on IO operations, a common case being webservers and XaaS (anything as a service). These types of workloads often require significant engineering around amortising costs of blocking IO operations. While improving throughput of these operations is outside what \CFA can do as a language, it can help users to make better use of the CPU time otherwise spent waiting on IO operations. The current trend is to use asynchronous programming using tools like callbacks and/or futures and promises\cit. However, while these are valid solutions, they lead to code that is harder to read and maintain because it is much less linear.
 
 
-\section{Implicit threading}
-% Finally, simpler applications can benefit greatly from having implicit parallelism. That is, parallelism that does not rely on the user to write concurrency. This type of parallelism can be achieved both at the language level and at the system level.
-%
-% \begin{center}
-% \begin{tabular}[t]{|c|c|c|}
-% Sequential & System Parallel & Language Parallel \\
-% \begin{lstlisting}
-% void big_sum(int* a, int* b,
-% 		 int* out,
-% 		 size_t length)
-% {
-% 	for(int i = 0; i < length; ++i ) {
-% 		out[i] = a[i] + b[i];
-% 	}
-% }
-%
-%
-%
-%
-%
-% int* a[10000];
-% int* b[10000];
-% int* c[10000];
-% //... fill in a and b ...
-% big_sum(a, b, c, 10000);
-% \end{lstlisting} &\begin{lstlisting}
-% void big_sum(int* a, int* b,
-% 		 int* out,
-% 		 size_t length)
-% {
-% 	range ar(a, a + length);
-% 	range br(b, b + length);
-% 	range or(out, out + length);
-% 	parfor( ai, bi, oi,
-% 	[](int* ai, int* bi, int* oi) {
-% 		oi = ai + bi;
-% 	});
-% }
-%
-% int* a[10000];
-% int* b[10000];
-% int* c[10000];
-% //... fill in a and b ...
-% big_sum(a, b, c, 10000);
-% \end{lstlisting}&\begin{lstlisting}
-% void big_sum(int* a, int* b,
-% 		 int* out,
-% 		 size_t length)
-% {
-% 	for (ai, bi, oi) in (a, b, out) {
-% 		oi = ai + bi;
-% 	}
-% }
-%
-%
-%
-%
-%
-% int* a[10000];
-% int* b[10000];
-% int* c[10000];
-% //... fill in a and b ...
-% big_sum(a, b, c, 10000);
-% \end{lstlisting}
-% \end{tabular}
-% \end{center}
-%
+
+\section{Other concurrency tools} \label{futur:tools}
 
 
-\section{Multiple Paradigms}
+\section{Implicit threading} \label{futur:implcit}
+Simpler applications can benefit greatly from having implicit parallelism. That is, parallelism that does not rely on the user to write concurrency. This type of parallelism can be achieved both at the language level and at the library level. The canonical example of implicit parallelism is parallel for loops, which are the simplest example of a divide and conquer algorithm\cit. Listing \ref{lst:parfor} shows three different code examples that accomplish pointwise sums of large arrays. Note that none of these examples explicitly declare any concurrency or parallelism objects.
+
+\begin{figure}
+\begin{center}
+\begin{tabular}[t]{|c|c|c|}
+Sequential & Library Parallel & Language Parallel \\
+\begin{cfacode}[tabsize=3]
+void big_sum(
+	int* a, int* b,
+	int* o,
+	size_t len)
+{
+	for(
+		int i = 0;
+		i < len;
+		++i )
+	{
+		o[i]=a[i]+b[i];
+	}
+}
 
 
-\section{Transactions}
+
+
+
+int* a[10000];
+int* b[10000];
+int* c[10000];
+//... fill in a & b
+big_sum(a,b,c,10000);
+\end{cfacode} &\begin{cfacode}[tabsize=3]
+void big_sum(
+	int* a, int* b,
+	int* o,
+	size_t len)
+{
+	range ar(a, a+len);
+	range br(b, b+len);
+	range or(o, o+len);
+	parfor( ai, bi, oi,
+	[](	int* ai,
+		int* bi,
+		int* oi)
+	{
+		oi=ai+bi;
+	});
+}
+
+
+int* a[10000];
+int* b[10000];
+int* c[10000];
+//... fill in a & b
+big_sum(a,b,c,10000);
+\end{cfacode}&\begin{cfacode}[tabsize=3]
+void big_sum(
+	int* a, int* b,
+	int* o,
+	size_t len)
+{
+	parfor (ai,bi,oi)
+	    in (a, b, o )
+	{
+		oi = ai + bi;
+	}
+}
+
+
+
+
+
+
+
+int* a[10000];
+int* b[10000];
+int* c[10000];
+//... fill in a & b
+big_sum(a,b,c,10000);
+\end{cfacode}
+\end{tabular}
+\end{center}
+\caption{For loop to sum numbers: Sequential, using library parallelism and language parallelism.}
+\label{lst:parfor}
+\end{figure}
+
+Implicit parallelism is a general solution and therefore is
+
+\section{Multiple Paradigms} \label{futur:paradigms}
+
+
+\section{Transactions} \label{futur:transaction}
+Concurrency and parallelism is still a very active field that strongly benefits from hardware advances. As such certain features that aren't necessarily mature enough in their current state could become relevant in the lifetime of \CFA.
Index: doc/proposals/concurrency/text/internals.tex
===================================================================
--- doc/proposals/concurrency/text/internals.tex	(revision fb31cb899c2ed0144f33897963468ed96bf3edf3)
+++ doc/proposals/concurrency/text/internals.tex	(revision 64b272aedd6db65f570acb3d386065b1438f00f8)
@@ -1,20 +1,50 @@
 
 \chapter{Behind the scene}
-
-
-% ======================================================================
-% ======================================================================
-\section{Implementation Details: Interaction with polymorphism}
-% ======================================================================
-% ======================================================================
-Depending on the choice of semantics for when monitor locks are acquired, interaction between monitors and \CFA's concept of polymorphism can be complex to support. However, it is shown that entry-point locking solves most of the issues.
-
-First of all, interaction between \code{otype} polymorphism and monitors is impossible since monitors do not support copying. Therefore, the main question is how to support \code{dtype} polymorphism. Since a monitor's main purpose is to ensure mutual exclusion when accessing shared data, this implies that mutual exclusion is only required for routines that do in fact access shared data. However, since \code{dtype} polymorphism always handles incomplete types (by definition), no \code{dtype} polymorphic routine can access shared data since the data requires knowledge about the type. Therefore, the only concern when combining \code{dtype} polymorphism and monitors is to protect access to routines.
-
-Before looking into complex control-flow, it is important to present the difference between the two acquiring options : callsite and entry-point locking, i.e. acquiring the monitors before making a mutex routine call or as the first operation of the mutex routine-call. For example:
+There are several challenges specific to \CFA when implementing concurrency. These challenges are direct results of \gls{bulk-acq} and loose object definitions. These two constraints are the root cause of most design decisions in the implementation. Furthermore, to avoid the headaches of dynamically allocating memory in a concurrent environment, the internal-scheduling design is (almost) entirely free of mallocs and other dynamic memory allocation schemes. This is to avoid the chicken and egg problem \cite{Chicken} of having a memory allocator that relies on the threading system and a threading system that relies on the runtime. This extra goal means that memory management is a constant concern in the design of the system.
+
+The main memory concern for concurrency is queues. All blocking operations are made by parking threads onto queues. These queues need to be intrusive\cit to avoid the need for memory allocation. This entails that the threads themselves carry all the fields needed to keep track of the required information. Since many concurrency operations can use an unbounded amount of memory (depending on \gls{bulk-acq}), statically defining information in the intrusive fields of threads is insufficient. The only variable-sized container that does not require memory allocation is the callstack, which is heavily used in the implementation of internal scheduling. In particular, the GCC extension for variable-length arrays is used extensively.
+
+Since stack allocation is based around scope, the first step of the implementation is to identify the scopes that are available to store the information, and which of these can have a variable length. The threads and the condition both allow a fixed amount of memory to be stored, while mutex-routines and the actual blocking call allow for an unbounded amount (though the latter is preferable in terms of performance).
+
+Note that since the major contributions of this thesis are extending monitor semantics to \gls{bulk-acq} and loose object definitions, any challenges that do not result from these characteristics of \CFA are considered as problems which have already been solved and therefore will not be discussed further.
+
+% ======================================================================
+% ======================================================================
+\section{Mutex routines}
+% ======================================================================
+% ======================================================================
+
+The first step towards the monitor implementation is simple mutex-routines using monitors. In the single monitor case, this is done using the entry/exit procedure highlighted in listing \ref{lst:entry1}. This entry/exit procedure doesn't actually have to be extended to support multiple monitors; indeed, it is sufficient to enter/leave monitors one-by-one as long as the order is correct to prevent deadlocks\cit. In \CFA, ordering of monitors relies on memory ordering. This is sufficient because all objects are guaranteed to have distinct non-overlapping memory layouts and mutual-exclusion for a monitor is only defined for its lifetime, meaning that destroying a monitor while it is acquired is undefined behavior. When a mutex call is made, the concerned monitors are aggregated into a variable-length pointer array and sorted based on pointer values. This array is preserved during the entire duration of the mutual-exclusion and its ordering is reused extensively.
 \begin{figure}
-\label{fig:locking-site}
+\begin{multicols}{2}
+Entry
+\begin{pseudo}
+if monitor is free
+	enter
+elif already own the monitor
+	continue
+else
+	block
+increment recursions
+\end{pseudo}
+\columnbreak
+Exit
+\begin{pseudo}
+decrement recursion
+if recursion == 0
+	if entry queue not empty
+		wake-up thread
+\end{pseudo}
+\end{multicols}
+\caption{Initial entry and exit routine for monitors}
+\label{lst:entry1}
+\end{figure}
+
+\subsection{ Details: Interaction with polymorphism}
+Depending on the choice of semantics for when monitor locks are acquired, interaction between monitors and \CFA's concept of polymorphism can be more complex to support. However, it is shown that entry-point locking solves most of the issues.
+
+First of all, interaction between \code{otype} polymorphism and monitors is impossible since monitors do not support copying. Therefore, the main question is how to support \code{dtype} polymorphism. It is important to present the difference between the two acquiring options : callsite and entry-point locking, i.e. acquiring the monitors before making a mutex routine call or as the first operation of the mutex routine-call. For example:
+\begin{figure}[H]
 \begin{center}
-\setlength\tabcolsep{1.5pt}
 \begin{tabular}{|c|c|c|}
 Mutex & \gls{callsite-locking} & \gls{entry-point-locking} \\
@@ -67,8 +97,8 @@
 \end{center}
 \caption{Callsite vs entry-point locking for mutex calls}
-\end{figure}
-
-
-Note the \code{mutex} keyword relies on the type system, which means that in cases where a generic monitor routine is actually desired, writing a mutex routine is possible with the proper trait, which is possible because monitors are designed in terms a trait. For example:
+\label{fig:locking-site}
+\end{figure}
+
+Note the \code{mutex} keyword relies on the type system, which means that in cases where a generic monitor routine is actually desired, writing a mutex routine is possible with the proper trait, for example:
 \begin{cfacode}
 //Incorrect: T is not a monitor
@@ -81,17 +111,68 @@
 \end{cfacode}
 
-
-% ======================================================================
-% ======================================================================
-\section{Internal scheduling: Implementation} \label{inschedimpl}
-% ======================================================================
-% ======================================================================
-There are several challenges specific to \CFA when implementing internal scheduling. These challenges are direct results of \gls{bulk-acq} and loose object definitions. These two constraints are to root cause of most design decisions in the implementation of internal scheduling. Furthermore, to avoid the head-aches of dynamically allocating memory in a concurrent environment, the internal-scheduling design is entirely free of mallocs and other dynamic memory allocation scheme. This is to avoid the chicken and egg problem \cite{Chicken} of having a memory allocator that relies on the threading system and a threading system that relies on the runtime. This extra goal, means that memory management is a constant concern in the design of the system.
-
-The main memory concern for concurrency is queues. All blocking operations are made by parking threads onto queues. These queues need to be intrinsic\cit to avoid the need memory allocation. This entails that all the fields needed to keep track of all needed information. Since internal scheduling can use an unbound amount of memory (depending on \gls{bulk-acq}) statically defining information information in the intrusive fields of threads is insufficient. The only variable sized container that does not require memory allocation is the callstack, which is heavily used in the implementation of internal scheduling. Particularly the GCC extension variable length arrays which is used extensively.
-
-Since stack allocation is based around scope, the first step of the implementation is to identify the scopes that are available to store the information, and which of these can have a variable length. In the case of external scheduling, the threads and the condition both allow a fixed amount of memory to be stored, while mutex-routines and the actual blocking call allow for an unbound amount (though adding too much to the mutex routine stack size can become expansive faster).
-
-The following figure is the traditionnal illustration of a monitor :
+Both entry-point and callsite locking are valid implementations. The current \CFA implementation uses entry-point locking because it seems to require less work when done using \gls{raii}, effectively transferring the burden of implementation to object construction/destruction. The same could be said of callsite locking, the difference being that the latter does not necessarily have an existing scope that exactly matches the scope of the mutual exclusion, i.e., the function body.
+
+% ======================================================================
+% ======================================================================
+\section{Threading} \label{impl:thread}
+% ======================================================================
+% ======================================================================
+
+Figure \ref{fig:system1} shows a high-level picture of the \CFA runtime system with regard to concurrency.
+
+\begin{figure}
+\begin{center}
+{\resizebox{\textwidth}{!}{\input{system.pstex_t}}}
+\end{center}
+\caption{Overview of the entire system}
+\label{fig:system1}
+\end{figure}
+
+\subsection{Context Switching}
+As mentioned in section \ref{coroutine}, coroutines are a stepping stone for implementing threading. This is because they share the same mechanism for context-switching between different stacks. To improve performance and simplicity, context-switching is implemented using the following assumption: all context-switches happen inside a specific function call. This assumption means that the basic recipe for a context-switch is only to copy all callee-saved registers onto the stack and then switch the stack registers with the ones of the target coroutine/thread. Note that the instruction pointer can be left untouched since the context-switch always happens inside the same function. In the case of coroutines, that is the entire story. Threads, however, do not simply context-switch between each other directly. They context-switch to processors, which is where the scheduling happens. This method is called a 2-step context-switch and has the advantage of having a clear distinction between user code and the ``kernel'' where scheduling and other system operations happen. Obviously, this has the cost of doubling the context-switch cost because threads must context-switch to an intermediate stack. However, the performance of the 2-step context-switch is still superior to a \code{pthread_yield} (see section \ref{results}). Additionally, for users in need of optimal performance, it is important to note that having a 2-step context-switch as the default does not prevent \CFA from offering a 1-step context-switch to use manually (or as part of monitors). This option is not currently present in \CFA but the changes required to add it are strictly additive.
+
+\subsection{Processors}
+Parallelism in \CFA is built around using processors to specify how much parallelism is desired. \CFA processors are object wrappers around kernel threads, specifically pthreads in the current implementation of \CFA. Indeed, any parallelism must go through operating system libraries. However, \gls{cfathread} are still the main source of concurrency; processors are simply the underlying source of parallelism. Indeed, processor kernel threads simply fetch a user-level thread from the scheduler and run it; they are effectively executors for user-threads. The main benefit of this approach is that it offers a well defined boundary between kernel code and user-code, for example kernel thread quiescing, scheduling and interrupt handling. Processors internally use coroutines to take advantage of the existing context-switching semantics.
+
+\subsection{Stack management}
+One of the challenges of this system is to reduce the footprint as much as possible. Specifically, all pthreads created also have a stack created with them, which should be used as much as possible. Normally, coroutines also create their own stack to run on; however, in the case of the coroutines used for processors, these coroutines run directly on the kernel thread stack, effectively stealing the processor stack. The exception to this rule is the Main Processor, i.e. the initial kernel thread that is given to any program. In order to respect user expectations, the stack of the initial kernel thread, the main stack of the program, is used by the main user thread rather than the main processor.
+
+\subsection{Preemption}
+Finally, an important aspect for any complete threading system is preemption. As mentioned in chapter \ref{basics}, preemption introduces an extra degree of uncertainty, which enables users to have multiple threads interleave transparently between each other, rather than having to cooperate among threads for proper scheduling and CPU distribution. Indeed, preemption is desirable because it adds a degree of isolation between tasks. In a fully cooperative system, any thread that runs into a long loop can starve other threads, while in a preemptive system starvation can still occur but it does not rely on every thread having to yield or block on a regular basis, which significantly reduces programmer burden. Obviously, preemption is not optimal for every workload; however, any preemptive system can become a cooperative system by making the time-slices extremely large. This is why \CFA uses a preemptive threading system.
+
+Preemption in \CFA is based on kernel timers, which are used to run a discrete-event simulation. Every processor keeps track of the current time and registers an expiration time with the preemption system. When the preemption system receives a change in preemption, it sorts these expiration times in a list and sets a kernel timer for the closest one, effectively stepping between preemption events on each signal sent by the timer. These timers use the Linux signal {\tt SIGALRM}, which is delivered to the process. This is important because when delivering signals to a process, the kernel documentation states that the signal can be delivered to any kernel thread for which the signal is not blocked, i.e.:
+\begin{quote}
+A process-directed signal may be delivered to any one of the threads that does not currently have the signal blocked. If more than one of the threads has the signal unblocked, then the kernel chooses an arbitrary thread to which to deliver the signal.
+SIGNAL(7) - Linux Programmer's Manual
+\end{quote}
+For the sake of simplicity and in order to prevent the case of having two threads receiving alarms simultaneously, \CFA programs block the {\tt SIGALRM} signal on every thread except one. Now because of how involuntary context-switches are handled, the kernel thread handling {\tt SIGALRM} cannot also be a processor thread.
+
+Involuntary context-switching is done by sending {\tt SIGUSR1} to the corresponding processor and having the thread yield from inside the signal handler. This effectively context-switches away from the signal-handler back to the kernel, and the signal-handler frame is unwound when the thread is scheduled again. This means that a signal-handler can start on one kernel thread and terminate on a second kernel thread (but the same user thread). It is important to note that signal-handlers save and restore signal masks because user-thread migration can cause signal masks to migrate from one kernel thread to another. This is only a problem if all kernel threads among which a user thread can migrate differ in terms of signal masks. However, since the kernel thread handling preemption requires a different signal mask, executing user threads on the kernel alarm thread can cause deadlocks. For this reason, the alarm thread is in a tight loop around a system call to \code{sigwait} or more specifically \code{sigwaitinfo}, requiring very little CPU time for preemption. One final detail about the alarm thread is how to wake it when additional communication is required (e.g. on thread termination). This is also done using {\tt SIGALRM}, but sent through \code{pthread_sigqueue}. Indeed, \code{sigwaitinfo} can differentiate signals sent from \code{pthread_sigqueue} from signals sent from alarms or the kernel.
+
+\subsection{Scheduler} \footnote{ I'm not sure what to write here, is this section even needed. }
+Finally, an aspect that was not mentioned yet is the scheduling algorithm. Currently, the \CFA scheduler uses a single ready queue for all processors. While this is not the highest performance algorithm, it has the significant advantage of being robust to heterogeneous workloads. This is a very simple scheduling approach but is sufficient for the context of this thesis.
+
+What to do here?
+
+However, when
+As will be mentionned \ref{futur:sched} it needs to be updated when clusters will be
+
+clusters
+
+
+
+Among the most pressing updates to the \CFA
+uses single queue
+in future should move to multiple queues with workstealing
+general purpouse means robust > fast
+worksharing can higher standard deviation in performance
+
+
+% ======================================================================
+% ======================================================================
+\section{Internal scheduling} \label{impl:intsched}
+% ======================================================================
+% ======================================================================
+To ease the understanding of monitors, like many other concepts, they are generally represented graphically. While non-scheduled monitors are simple enough that a graphical representation is of limited use, internal scheduling is complex enough to justify a visual representation. The following figure is the traditional illustration of a monitor :
 
 \begin{center}
@@ -99,5 +180,5 @@
 \end{center}
 
-For \CFA, the previous picture does not have support for blocking multiple monitors on a single condition. To support \gls{bulk-acq} two changes to this picture are required. First, it doesn't make sense to tie the condition to a single monitor since blocking two monitors as one would require arbitrarily picking a monitor to hold the condition. Secondly, the object waiting on the conditions and AS-stack cannot simply contain the waiting thread since a single thread can potentially wait on multiple monitors. As mentionned in section \ref{inschedimpl}, the handling in multiple monitors is done by partially passing, which entails that each concerned monitor needs to have a node object. However, for waiting on the condition, since all threads need to wait together, a single object needs to be queued in the condition. Moving out the condition and updating the node types yields :
+This picture has several components, the two most important being the entry-queue and the AS-stack. The entry-queue is an (almost) FIFO list where threads waiting to enter are parked, while the AS-stack is a FILO list used for threads that have been signaled or otherwise marked as running next. For \CFA, the previous picture does not have support for blocking multiple monitors on a single condition. To support \gls{bulk-acq} two changes to this picture are required. First, it doesn't make sense to tie the condition to a single monitor since blocking two monitors as one would require arbitrarily picking a monitor to hold the condition. Secondly, the object waiting on the conditions and AS-stack cannot simply contain the waiting thread since a single thread can potentially wait on multiple monitors. As mentioned in section \ref{intsched}, the handling of multiple monitors is done by partially passing, which entails that each concerned monitor needs to have a node object. However, for waiting on the condition, since all threads need to wait together, a single object needs to be queued in the condition. Moving out the condition and updating the node types yields :
 
 \begin{center}
@@ -105,14 +186,13 @@
 \end{center}
 
-\newpage
-
-This picture and the proper entry and leave algorithms is the fundamental implementation of internal scheduling.
-
+This picture and the proper entry and leave algorithms are the fundamental implementation of internal scheduling (see listing \ref{lst:entry2}).
+
+\begin{figure}[b]
 \begin{multicols}{2}
 Entry
-\begin{pseudo}[numbers=left]
+\begin{pseudo}
 if monitor is free
 	enter
-elif I already own the monitor
+elif already own the monitor
 	continue
 else
@@ -123,5 +203,5 @@
 \columnbreak
 Exit
-\begin{pseudo}[numbers=left, firstnumber=8]
+\begin{pseudo}
 decrement recursion
 if recursion == 0
@@ -135,15 +215,105 @@
 \end{pseudo}
 \end{multicols}
-
-Some important things to notice about the exit routine. The solution discussed in \ref{inschedimpl} can be seen on line 11 of the previous pseudo code. Basically, the solution boils down to having a seperate data structure for the condition queue and the AS-stack, and unconditionally transferring ownership of the monitors but only unblocking the thread when the last monitor has trasnferred ownership. This solution is safe as well as preventing any potential barging.
-
-% ======================================================================
-% ======================================================================
-\section{Implementation Details: External scheduling queues}
-% ======================================================================
-% ======================================================================
-To support multi-monitor external scheduling means that some kind of entry-queues must be used that is aware of both monitors. However, acceptable routines must be aware of the entry queues which means they must be stored inside at least one of the monitors that will be acquired. This in turn adds the requirement a systematic algorithm of disambiguating which queue is relavant regardless of user ordering. The proposed algorithm is to fall back on monitors lock ordering and specify that the monitor that is acquired first is the lock with the relevant entry queue. This assumes that the lock acquiring order is static for the lifetime of all concerned objects but that is a reasonable constraint. This algorithm choice has two consequences, the entry queue of the highest priority monitor is no longer a true FIFO queue and the queue of the lowest priority monitor is both required and probably unused. The queue can no longer be a FIFO queue because instead of simply containing the waiting threads in order arrival, they also contain the second mutex. Therefore, another thread with the same highest priority monitor but a different lowest priority monitor may arrive first but enter the critical section after a thread with the correct pairing. Secondly, since it may not be known at compile time which monitor will be the lowest priority monitor, every monitor needs to have the correct queues even though it is probable that half the multi-monitor queues will go unused for the entire duration of the program.
-
-
-\section{Internals}
-The complete mask can be pushed to any one, we are in a context where we already have full ownership of (at least) every concerned monitor and therefore monitors will refuse all calls no matter what.
+\caption{Entry and exit routine for monitors with internal scheduling}
+\label{lst:entry2}
+\end{figure}
+
+There are some important things to notice about the exit routine. The solution discussed in \ref{intsched} can be seen in the exit routine of listing \ref{lst:entry2}. Basically, the solution boils down to having a separate data structure for the condition queue and the AS-stack, and unconditionally transferring ownership of the monitors but only unblocking the thread when the last monitor has transferred ownership. This solution is deadlock safe as well as preventing any potential barging.
+
+The data structures used for the AS-stack are reused extensively for external scheduling, but in the case of internal scheduling, the data is allocated using variable-length arrays on the callstack of the \code{wait} and \code{signal_block} routines.
+
+% ======================================================================
+% ======================================================================
+\section{External scheduling}
+% ======================================================================
+% ======================================================================
+Similarly to internal scheduling, external scheduling for multiple monitors relies on the idea that entry-queues are no longer specific to a single monitor, as mentioned in section \ref{extsched}. This means that some kind of entry-queue must be used that is aware of both monitors and which holds threads that are currently waiting to enter the critical section. This challenge is solved for internal scheduling by having the entry-queues in conditions no longer be tied to a monitor, effectively allowing conditions to be moved outside of monitors. However, in the case of external scheduling, acceptable routines must be aware of the entry queues, which means they must be stored inside at least one of the monitors that will be acquired. This in turn adds the requirement for a systematic algorithm for disambiguating which monitor holds the relevant queue regardless of user ordering. The proposed algorithm is to fall back on monitor lock ordering and specify that the monitor that is acquired first is the one with the relevant entry queue. This assumes that the lock acquiring order is static for the lifetime of all concerned objects but that is a reasonable constraint.
+
+This algorithm choice has two consequences: the entry queue of the highest priority monitor is no longer a true FIFO queue and the queue of the lowest priority monitor is both required and probably unused. The queue can no longer be a FIFO queue because instead of simply containing the waiting threads in order of arrival, its entries also contain a set of monitors. Therefore, another thread whose set contains the same highest priority monitor but different lower priority monitors may arrive first but enter the critical section after a thread with the correct pairing. Secondly, since it is not known at compile time which monitor will be the lowest priority monitor, every monitor needs to have the correct queues even though it is probable that some queues will go unused for the entire duration of the program, for example if a monitor is only used in a pair.
+
+Therefore, the following modifications need to be made to support external scheduling :
+\begin{itemize}
+	\item The threads waiting on the entry-queue need to keep track of which routine is trying to enter, and using which set of monitors. The \code{mutex} routine already has all the required information on its stack so the thread only needs to keep a pointer to that information.
+	\item The monitors need to keep a mask of acceptable routines. This mask contains, for each acceptable routine, a routine pointer and an array of monitors to go with it. It also needs storage to keep track of which routine was accepted. Since this information is not specific to any monitor, the monitors actually contain a pointer to an integer on the stack of the waiting thread. Note that the complete mask can be pushed to any owned monitors, regardless of \code{when} statements: the \code{waitfor} statement is used in a context where the thread already has full ownership of (at least) every concerned monitor and therefore monitors will refuse all calls no matter what.
+	\item The entry/exit routines need to be updated as shown in listing \ref{lst:entry3}.
+\end{itemize}
+
+Finally, to support the ordering inversion of destructors, the code generation needs to be modified to use a special entry routine. This routine is needed because of the storage requirements of the call order inversion. Indeed, when waiting for the destructors, storage is needed for the waiting context and the lifetime of said storage needs to outlive the waiting operation it is needed for. For regular \code{waitfor} statements, the callstack of the routine itself matches this requirement but it is no longer the case when waiting for the destructor since it is pushed on to the AS-stack for later. The waitfor semantics can then be adjusted correspondingly, as seen in listing \ref{lst:entry-dtor}.
+
+\begin{figure}
+\begin{multicols}{2}
+Entry
+\begin{pseudo}
+if monitor is free
+	enter
+elif already own the monitor
+	continue
+elif matches waitfor mask
+	push waiter to AS-stack
+	continue
+else
+	block
+increment recursion
+\end{pseudo}
+\columnbreak
+Exit
+\begin{pseudo}
+decrement recursion
+if recursion == 0
+	if signal_stack not empty
+		set_owner to thread
+		if all monitors ready
+			wake-up thread
+
+	if entry queue not empty
+		wake-up thread
+\end{pseudo}
+\end{multicols}
+\caption{Entry and exit routine for monitors with internal scheduling and external scheduling}
+\label{lst:entry3}
+\end{figure}
+
+\begin{figure}
+\begin{multicols}{2}
+Destructor Entry
+\begin{pseudo}
+if monitor is free
+	enter
+elif already own the monitor
+	increment recursion
+	return
+create wait context
+if matches waitfor mask
+	reset mask
+	push self to AS-stack
+	baton pass
+else
+	wait
+increment recursion
+\end{pseudo}
+\columnbreak
+Waitfor
+\begin{pseudo}
+lock all monitors
+if matching thread is already there
+	if found destructor
+		push destructor to AS-stack
+		unlock all monitors
+	else
+		push self to AS-stack
+		baton pass
+	return
+
+if non-blocking
+	unlock all monitors
+	return
+
+push self to AS-stack
+set waitfor mask
+block
+return
+\end{pseudo}
+\end{multicols}
+\caption{Pseudo code for the \code{waitfor} routine and the \code{mutex} entry routine for destructors}
+\label{lst:entry-dtor}
+\end{figure}
Index: doc/proposals/concurrency/text/parallelism.tex
===================================================================
--- doc/proposals/concurrency/text/parallelism.tex	(revision fb31cb899c2ed0144f33897963468ed96bf3edf3)
+++ doc/proposals/concurrency/text/parallelism.tex	(revision 64b272aedd6db65f570acb3d386065b1438f00f8)
@@ -28,8 +28,5 @@
 While the choice between the three paradigms listed above may have significant performance implication, it is difficult to pindown the performance implications of chosing a model at the language level. Indeed, in many situations one of these paradigms may show better performance but it all strongly depends on the workload. Having a large amount of mostly independent units of work to execute almost guarantess that the \gls{pool} based system has the best performance thanks to the lower memory overhead (i.e., not thread stack per job). However, interactions among jobs can easily exacerbate contention. User-level threads allow fine-grain context switching, which results in better resource utilisation, but a context switch is more expensive and the extra control means users need to tweak more variables to get the desired performance. Finally, if the units of uninterrupted work are large enough the paradigm choice is largely amortised by the actual work done.
 
-\TODO
-
 \section{The \protect\CFA\ Kernel : Processors, Clusters and Threads}\label{kernel}
-
 
 \subsection{Future Work: Machine setup}\label{machine}
Index: doc/proposals/concurrency/text/results.tex
===================================================================
--- doc/proposals/concurrency/text/results.tex	(revision 64b272aedd6db65f570acb3d386065b1438f00f8)
+++ doc/proposals/concurrency/text/results.tex	(revision 64b272aedd6db65f570acb3d386065b1438f00f8)
@@ -0,0 +1,127 @@
+% ======================================================================
+% ======================================================================
+\chapter{Performance results}
+% ======================================================================
+% ======================================================================
+
+\section{Machine setup}
+
+\begin{figure}
+\begin{center}
+\begin{tabular}{| l | r | l | r |}
+\hline
+Architecture		& x86\_64 			& NUMA node(s) 	& 8 \\
+\hline
+CPU op-mode(s)		& 32-bit, 64-bit 		& Model name 	& AMD Opteron\texttrademark\ Processor 6380 \\
+\hline
+Byte Order			& Little Endian 		& CPU Freq 		& \SI{2.5}{\giga\hertz} \\
+\hline
+CPU(s)			& 64 				& L1d cache 	& \SI{16}{\kibi\byte} \\
+\hline
+Thread(s) per core	& 2 				& L1i cache 	& \SI{64}{\kibi\byte} \\
+\hline
+Core(s) per socket	& 8 				& L2 cache 		& \SI{2048}{\kibi\byte} \\
+\hline
+Socket(s)			& 4 				& L3 cache 		& \SI{6144}{\kibi\byte} \\
+\hline
+\hline
+Operating system		& Ubuntu 16.04.3 LTS	& Kernel		& Linux 4.4.0-97-generic \\
+\hline
+Compiler			& gcc 6.3.0 		& Translator	& CFA 1.0.0 \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Machine setup used for the tests}
+\label{tab:machine}
+\end{figure}
+
+\section{Micro benchmarks}
+
+\begin{figure}
+\begin{center}
+\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
+\cline{2-4}
+\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
+\hline
+Kernel Threads		& 239		& 242.57	& 5.54 \\
+\CFA Coroutines		& 38		& 38		& 0    \\
+\CFA Threads		& 102		& 102.39	& 1.57 \\
+$\mu$++ Coroutines	& 46		& 46.68	& 0.47 \\
+$\mu$++ Threads		& 98		& 99.39	& 1.52 \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Context Switch comparison. All numbers are in nanoseconds (\si{\nano\second})}
+\label{tab:ctx-switch}
+\end{figure}
+
+\begin{figure}
+\begin{center}
+\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
+\cline{2-4}
+\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
+\hline
+C routine						& 2		& 2		& 0      \\
+Pthreads Mutex Lock				& 31		& 31.86	& 0.99   \\
+$\mu$++ \code{monitor} member routine	& 30		& 30		& 0      \\
+\CFA \code{mutex} routine, 1 argument	& 46		& 46.14	& 0.74   \\
+\CFA \code{mutex} routine, 2 arguments	& 82		& 83		& 1.93   \\
+\CFA \code{mutex} routine, 4 arguments	& 165		& 161.15	& 54.04  \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Mutex routine comparison. All numbers are in nanoseconds (\si{\nano\second})}
+\label{tab:mutex}
+\end{figure}
+
+\begin{figure}
+\begin{center}
+\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
+\cline{2-4}
+\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
+\hline
+$\mu$++ \code{signal}				& 322		& 322.57	& 2.77  \\
+\CFA \code{signal}, 1 \code{monitor}	& 1145	& 1163.64	& 27.52 \\
+\CFA \code{signal}, 2 \code{monitor}	& 1531	& 1550.75	& 32.77 \\
+\CFA \code{signal}, 4 \code{monitor}	& 2288.5	& 2326.86	& 54.73 \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Internal scheduling comparison. All numbers are in nanoseconds (\si{\nano\second})}
+\label{tab:int-sched}
+\end{figure}
+
+\begin{figure}
+\begin{center}
+\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
+\cline{2-4}
+\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
+\hline
+$\mu$++ \code{Accept}				& 349		& 339.32	& 3.14  \\
+\CFA \code{waitfor}, 1 \code{monitor}	& 1155.5	& 1142.04	& 25.23 \\
+\CFA \code{waitfor}, 2 \code{monitor}	& 1361	& 1376.75	& 28.81 \\
+\CFA \code{waitfor}, 4 \code{monitor}	& 1941.5	& 1957.07	& 34.7  \\
+\hline
+\end{tabular}
+\end{center}
+\caption{External scheduling comparison. All numbers are in nanoseconds (\si{\nano\second})}
+\label{tab:ext-sched}
+\end{figure}
+
+\begin{figure}
+\begin{center}
+\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
+\cline{2-4}
+\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
+\hline
+Pthreads			& 26974.5	& 26977	& 124.12 \\
+\CFA Coroutines		& 5		& 5		& 0      \\
+\CFA Threads		& 1122.5	& 1109.86	& 36.54  \\
+$\mu$++ Coroutines	& 106		& 107.04	& 1.61   \\
+$\mu$++ Threads		& 525.5	& 533.04	& 11.14  \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Creation comparison. All numbers are in nanoseconds (\si{\nano\second})}
+\label{tab:creation}
+\end{figure}
Index: doc/proposals/concurrency/text/together.tex
===================================================================
--- doc/proposals/concurrency/text/together.tex	(revision fb31cb899c2ed0144f33897963468ed96bf3edf3)
+++ doc/proposals/concurrency/text/together.tex	(revision 64b272aedd6db65f570acb3d386065b1438f00f8)
@@ -36,5 +36,5 @@
 }
 \end{cfacode}
-One of the obvious complaints of the previous code snippet (other than its toy-like simplicity) is that it does not handle exit conditions and just goes on for ever. Luckily, the monitor semantics can also be used to clearly enforce a shutdown order in a concise manner :
+One of the obvious complaints of the previous code snippet (other than its toy-like simplicity) is that it does not handle exit conditions and just goes on forever. Luckily, the monitor semantics can also be used to clearly enforce a shutdown order in a concise manner :
 \begin{cfacode}
 // Visualization declaration
Index: doc/proposals/concurrency/thesis.tex
===================================================================
--- doc/proposals/concurrency/thesis.tex	(revision fb31cb899c2ed0144f33897963468ed96bf3edf3)
+++ doc/proposals/concurrency/thesis.tex	(revision 64b272aedd6db65f570acb3d386065b1438f00f8)
@@ -35,5 +35,8 @@
 \usepackage[pagewise]{lineno}
 \usepackage{fancyhdr}
+\usepackage{float}
 \renewcommand{\linenumberfont}{\scriptsize\sffamily}
+\usepackage{siunitx}
+\sisetup{ binary-units=true }
 \input{style}							% bespoke macros used in the document
 \usepackage[dvips,plainpages=false,pdfpagelabels,pdfpagemode=UseNone,colorlinks=true,pagebackref=true,linkcolor=blue,citecolor=blue,urlcolor=blue,pagebackref=true,breaklinks=true]{hyperref}
@@ -107,4 +110,6 @@
 \input{together}
 
+\input{results}
+
 \input{future}
 
Index: doc/proposals/concurrency/version
===================================================================
--- doc/proposals/concurrency/version	(revision fb31cb899c2ed0144f33897963468ed96bf3edf3)
+++ doc/proposals/concurrency/version	(revision 64b272aedd6db65f570acb3d386065b1438f00f8)
@@ -1,1 +1,1 @@
-0.10.212
+0.10.340