% ======================================================================
% ======================================================================
\chapter{Performance results} \label{results}
% ======================================================================
% ======================================================================
\section{Machine setup}
Table~\ref{tab:machine} shows the characteristics of the machine used to run the benchmarks. All tests were run on this machine.
\begin{table}[H]
\begin{center}
\begin{tabular}{| l | r | l | r |}
\hline
Architecture		& x86\_64 			& NUMA node(s) 	& 8 \\
\hline
CPU op-mode(s)		& 32-bit, 64-bit 		& Model name 	& AMD Opteron\texttrademark{} Processor 6380 \\
\hline
Byte Order			& Little Endian 		& CPU Freq 		& \SI{2.5}{\giga\hertz} \\
\hline
CPU(s)			& 64 				& L1d cache 	& \SI{16}{\kibi\byte} \\
\hline
Thread(s) per core	& 2 				& L1i cache 	& \SI{64}{\kibi\byte} \\
\hline
Core(s) per socket	& 8 				& L2 cache 		& \SI{2048}{\kibi\byte} \\
\hline
Socket(s)			& 4 				& L3 cache 		& \SI{6144}{\kibi\byte} \\
\hline
\hline
Operating system		& Ubuntu 16.04.3 LTS	& Kernel		& Linux 4.4.0-97-generic \\
\hline
Compiler			& GCC 6.3.0 		& Translator	& CFA 1.0.0 \\
\hline
Java version		& OpenJDK-9 		& Go version	& 1.9.2 \\
\hline
\end{tabular}
\end{center}
\caption{Machine setup used for the tests}
\label{tab:machine}
\end{table}

\section{Micro benchmarks}
All benchmarks are run using the same harness to produce the results, seen as the \code{BENCH()} macro in the following examples. This macro uses the following logic to benchmark the code:
\begin{pseudo}
#define BENCH(run, result)
	before = gettime();
	run;
	after  = gettime();
	result = (after - before) / N;
\end{pseudo}
The method used to get time is \code{clock_gettime(CLOCK_THREAD_CPUTIME_ID);}. Each benchmark uses many iterations of a simple call to measure the cost of the call. The specific number of iterations depends on the specific benchmark.

\subsection{Context-switching}
The first interesting benchmark is to measure how long context-switches take. The simplest approach to do this is to yield on a thread, which executes a 2-step context-switch. In order to make the comparison fair, coroutines also execute a 2-step context-switch (\gls{uthread} to \gls{kthread} then \gls{kthread} to \gls{uthread}), which is a resume/suspend cycle instead of a yield. Listing~\ref{lst:ctx-switch} shows the code for coroutines and threads, with the results in table~\ref{tab:ctx-switch}. All omitted tests are functionally identical to one of these tests.
\begin{figure}
\begin{multicols}{2}
\CFA Coroutines
\begin{cfacode}
coroutine GreatSuspender {};
void main(GreatSuspender& this) {
	while(true) { suspend(); }
}
int main() {
	GreatSuspender s;
	resume(s);
	BENCH(
		for(size_t i=0; i<n; i++) {
			resume(s);
		},
		result
	)
	printf("%llu\n", result);
}
\end{cfacode}
\columnbreak
\CFA Threads
\begin{cfacode}


int main() {


	BENCH(
		for(size_t i=0; i<n; i++) {
			yield();
		},
		result
	)
	printf("%llu\n", result);
}
\end{cfacode}
\end{multicols}
\begin{cfacode}[caption={\CFA benchmark code used to measure context-switches for coroutines and threads.},label={lst:ctx-switch}]
\end{cfacode}
\end{figure}

\begin{table}
\begin{center}
\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
\cline{2-4}
\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
\hline
Kernel Thread	& 239		& 242.57	& 5.54 \\
\CFA Coroutine	& 38		& 38		& 0    \\
\CFA Thread		& 102		& 102.39	& 1.57 \\
\uC Coroutine	& 46		& 46.68	& 0.47 \\
\uC Thread		& 98		& 99.39	& 1.52 \\
Goroutine		& 148		& 148.0	& 0 \\
Java Thread		& 271		& 271.0	& 0 \\
\hline
\end{tabular}
\end{center}
\caption{Context Switch comparison. All numbers are in nanoseconds (\si{\nano\second})}
\label{tab:ctx-switch}
\end{table}

\subsection{Mutual-exclusion}
The next interesting benchmark is to measure the overhead to enter/leave a critical-section. For monitors, the simplest approach is to measure how long it takes to enter and leave a monitor routine. Listing~\ref{lst:mutex} shows the code for \CFA. To put the results in context, the cost of entering a non-inline function and the cost of acquiring and releasing a pthread mutex lock are also measured. The results are shown in table~\ref{tab:mutex}.

\begin{figure}
\begin{cfacode}[caption={\CFA benchmark code used to measure mutex routines.},label={lst:mutex}]
monitor M {};
void __attribute__((noinline)) call( M & mutex m /*, m2, m3, m4*/ ) {}

int main() {
	M m/*, m2, m3, m4*/;
	BENCH(
		for(size_t i=0; i<n; i++) {
			call(m/*, m2, m3, m4*/);
		},
		result
	)
	printf("%llu\n", result);
}
\end{cfacode}
\end{figure}

\begin{table}
\begin{center}
\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
\cline{2-4}
\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
\hline
C routine						& 2		& 2		& 0      \\
FetchAdd + FetchSub				& 2		& 2		& 0      \\
Pthreads Mutex Lock				& 31		& 31.86	& 0.99   \\
\uC \code{monitor} member routine		& 30		& 30		& 0      \\
\CFA \code{mutex} routine, 1 argument	& 46		& 46.14	& 0.74   \\
\CFA \code{mutex} routine, 2 arguments	& 82		& 83		& 1.93   \\
\CFA \code{mutex} routine, 4 arguments	& 165		& 161.15	& 54.04  \\
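Java synchronized routine			& 165		& 161.15	& 54.04  \\ % NOTE(review): values are identical to the preceding \CFA 4-argument row -- looks like placeholder data; confirm the Java measurements.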
\hline
\end{tabular}
\end{center}
\caption{Mutex routine comparison. All numbers are in nanoseconds (\si{\nano\second})}
\label{tab:mutex}
\end{table}

\subsection{Internal scheduling}
The internal-scheduling benchmark measures the cost of waiting on and signalling a condition variable. Listing~\ref{lst:int-sched} shows the code for \CFA, with results in table~\ref{tab:int-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.

\begin{figure}
\begin{cfacode}[caption={Benchmark code for internal scheduling},label={lst:int-sched}]
volatile int go = 0;
condition c;
monitor M {};
M m1;

void __attribute__((noinline)) do_call( M & mutex a1 ) { signal(c); }

thread T {};
void ^?{}( T & mutex this ) {}
void main( T & this ) {
	while(go == 0) { yield(); }
	while(go == 1) { do_call(m1); }
}
int  __attribute__((noinline)) do_wait( M & mutex a1 ) {
	go = 1;
	BENCH(
		for(size_t i=0; i<n; i++) {
			wait(c);
		},
		result
	)
	printf("%llu\n", result);
	go = 0;
	return 0;
}
int main() {
	T t;
	return do_wait(m1);
}
\end{cfacode}
\end{figure}

\begin{table}
\begin{center}
\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
\cline{2-4}
\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
\hline
\uC \code{signal}					& 322		& 322.57	& 2.77  \\
\CFA \code{signal}, 1 \code{monitor}	& 1145	& 1163.64	& 27.52 \\
\CFA \code{signal}, 2 \code{monitor}	& 1531	& 1550.75	& 32.77 \\
\CFA \code{signal}, 4 \code{monitor}	& 2288.5	& 2326.86	& 54.73 \\
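Java \code{notify}				& 2288.5	& 2326.86	& 54.73 \\ % NOTE(review): values are identical to the preceding \CFA 4-monitor row -- looks like placeholder data; confirm the Java measurements.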
\hline
\end{tabular}
\end{center}
\caption{Internal scheduling comparison. All numbers are in nanoseconds (\si{\nano\second})}
\label{tab:int-sched}
\end{table}

\subsection{External scheduling}
The external-scheduling benchmark measures the cost of the \code{waitfor} statement (\code{_Accept} in \uC). Listing~\ref{lst:ext-sched} shows the code for \CFA, with results in table~\ref{tab:ext-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.

\begin{figure}
\begin{cfacode}[caption={Benchmark code for external scheduling},label={lst:ext-sched}]
volatile int go = 0;
monitor M {};
M m1;
thread T {};

void __attribute__((noinline)) do_call( M & mutex a1 ) {}

void ^?{}( T & mutex this ) {}
void main( T & this ) {
	while(go == 0) { yield(); }
	while(go == 1) { do_call(m1); }
}
int  __attribute__((noinline)) do_wait( M & mutex a1 ) {
	go = 1;
	BENCH(
		for(size_t i=0; i<n; i++) {
			waitfor(do_call, a1);
		},
		result
	)
	printf("%llu\n", result);
	go = 0;
	return 0;
}
int main() {
	T t;
	return do_wait(m1);
}
\end{cfacode}
\end{figure}

\begin{table}
\begin{center}
\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
\cline{2-4}
\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
\hline
\uC \code{_Accept}					& 349		& 339.32	& 3.14  \\
\CFA \code{waitfor}, 1 \code{monitor}	& 1155.5	& 1142.04	& 25.23 \\
\CFA \code{waitfor}, 2 \code{monitor}	& 1361	& 1376.75	& 28.81 \\
\CFA \code{waitfor}, 4 \code{monitor}	& 1941.5	& 1957.07	& 34.7  \\
\hline
\end{tabular}
\end{center}
\caption{External scheduling comparison. All numbers are in nanoseconds (\si{\nano\second})}
\label{tab:ext-sched}
\end{table}

\subsection{Object creation}
Finally, the last benchmark measures the cost of creation for concurrent objects. Listing~\ref{lst:creation} shows the code for pthreads and \CFA threads, with results shown in table~\ref{tab:creation}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests. The only note here is that the call-stacks of \CFA coroutines are lazily created, therefore without priming the coroutine, the creation cost is very low.

\begin{figure}
\begin{center}
pthread
\begin{ccode}
int main() {
	BENCH(
		for(size_t i=0; i<n; i++) {
			pthread_t thread;
			if(pthread_create(&thread,NULL,foo,NULL)<0) {
				perror( "failure" );
				return 1;
			}

			if(pthread_join(thread, NULL)<0) {
				perror( "failure" );
				return 1;
			}
		},
		result
	)
	printf("%llu\n", result);
}
\end{ccode}


\CFA Threads
\begin{cfacode}
int main() {
	BENCH(
		for(size_t i=0; i<n; i++) {
			MyThread m;
		},
		result
	)
	printf("%llu\n", result);
}
\end{cfacode}
\end{center}
\begin{cfacode}[caption={Benchmark code for pthreads and \CFA to measure object creation},label={lst:creation}]
\end{cfacode}
\end{figure}

\begin{table}
\begin{center}
\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
\cline{2-4}
\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
\hline
Pthreads			& 26974.5	& 26977	& 124.12 \\
\CFA Coroutine Lazy	& 5		& 5		& 0      \\
\CFA Coroutine Eager	& 335.0	& 357.67	& 34.2   \\
\CFA Thread			& 1122.5	& 1109.86	& 36.54  \\
\uC Coroutine		& 106		& 107.04	& 1.61   \\
\uC Thread			& 525.5	& 533.04	& 11.14  \\
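Goroutine			& 525.5	& 533.04	& 11.14  \\ % NOTE(review): values are identical to the \uC Thread row -- looks like placeholder data; confirm the Go measurements.
Java Thread			& 525.5	& 533.04	& 11.14  \\ % NOTE(review): values are identical to the \uC Thread row -- looks like placeholder data; confirm the Java measurements.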
\hline
\end{tabular}
\end{center}
\caption{Creation comparison. All numbers are in nanoseconds (\si{\nano\second})}
\label{tab:creation}
\end{table}