% ======================================================================
% ======================================================================
\chapter{Performance Results} \label{results}
% ======================================================================
% ======================================================================
\section{Machine Setup}
Table~\ref{tab:machine} shows the characteristics of the machine used to run the benchmarks. All tests were run on this machine.
\begin{table}[H]
\begin{center}
\begin{tabular}{| l | r | l | r |}
\hline
Architecture		& x86\_64 			& NUMA node(s) 	& 8 \\
\hline
CPU op-mode(s)		& 32-bit, 64-bit 		& Model name 	& AMD Opteron\texttrademark  Processor 6380 \\
\hline
Byte Order			& Little Endian 		& CPU Freq 		& \SI{2.5}{\giga\hertz} \\
\hline
CPU(s)			& 64 				& L1d cache 	& \SI{16}{\kibi\byte} \\
\hline
Thread(s) per core	& 2 				& L1i cache 	& \SI{64}{\kibi\byte} \\
\hline
Core(s) per socket	& 8 				& L2 cache 		& \SI{2048}{\kibi\byte} \\
\hline
Socket(s)			& 4 				& L3 cache 		& \SI{6144}{\kibi\byte} \\
\hline
\hline
Operating system		& Ubuntu 16.04.3 LTS	& Kernel		& Linux 4.4-97-generic \\
\hline
Compiler			& GCC 6.3 		& Translator	& CFA 1 \\
\hline
Java version		& OpenJDK-9 		& Go version	& 1.9.2 \\
\hline
\end{tabular}
\end{center}
\caption{Machine setup used for the tests}
\label{tab:machine}
\end{table}

\section{Micro Benchmarks}
All benchmarks are run using the same harness to produce the results, seen as the \code{BENCH()} macro in the following examples. This macro uses the following logic to benchmark the code:
\begin{pseudo}
#define BENCH(run, result) \
	before = gettime(); \
	run; \
	after  = gettime(); \
	result = (after - before) / N;
\end{pseudo}
The method used to get time is \code{clock_gettime(CLOCK_THREAD_CPUTIME_ID);}. Each benchmark runs many iterations of a simple call to measure the cost of the call. The specific number of iterations depends on the specific benchmark.

\subsection{Context-Switching}
The first interesting benchmark is to measure how long context-switches take. The simplest approach to do this is to yield on a thread, which executes a 2-step context switch. Yielding causes the thread to context-switch to the scheduler and back, more precisely: from the \gls{uthread} to the \gls{kthread} then from the \gls{kthread} back to the same \gls{uthread} (or a different one in the general case). In order to make the comparison fair, coroutines also execute a 2-step context-switch by resuming another coroutine which does nothing but suspending in a tight loop, which is a resume/suspend cycle instead of a yield. Listing \ref{lst:ctx-switch} shows the code for coroutines and threads with the results in table \ref{tab:ctx-switch}. All omitted tests are functionally identical to one of these tests. The difference between coroutines and threads can be attributed to the cost of scheduling.
\begin{figure}
\begin{multicols}{2}
\CFA Coroutines
\begin{cfacode}
coroutine GreatSuspender {};
void main(GreatSuspender& this) {
	while(true) { suspend(); }
}
int main() {
	GreatSuspender s;
	resume(s);
	BENCH(
		for(size_t i=0; i<n; i++) {
			resume(s);
		},
		result
	)
	printf("%llu\n", result);
}
\end{cfacode}
\columnbreak
\CFA Threads
\begin{cfacode}




int main() {


	BENCH(
		for(size_t i=0; i<n; i++) {
			yield();
		},
		result
	)
	printf("%llu\n", result);
}
\end{cfacode}
\end{multicols}
\begin{cfacode}[caption={\CFA benchmark code used to measure context-switches for coroutines and threads.},label={lst:ctx-switch}]
\end{cfacode}
\end{figure}

\begin{table}
\begin{center}
\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
\cline{2-4}
\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
\hline
Kernel Thread	& 241.5	& 243.86	& 5.08 \\
\CFA Coroutine	& 38		& 38		& 0    \\
\CFA Thread		& 103		& 102.96	& 2.96 \\
\uC Coroutine	& 46		& 45.86	& 0.35 \\
\uC Thread		& 98		& 99.11	& 1.42 \\
Goroutine		& 150		& 149.96	& 3.16 \\
Java Thread		& 289		& 290.68	& 8.72 \\
\hline
\end{tabular}
\end{center}
\caption{Context Switch comparison. All numbers are in nanoseconds (\si{\nano\second}).}
\label{tab:ctx-switch}
\end{table}

\subsection{Mutual-Exclusion}
The next interesting benchmark is to measure the overhead to enter/leave a critical-section. For monitors, the simplest approach is to measure how long it takes to enter and leave a monitor routine. Listing~\ref{lst:mutex} shows the code for \CFA. To put the results in context, the cost of entering a non-inline function and the cost of acquiring and releasing a \code{pthread_mutex} lock are also measured. The results are shown in Table~\ref{tab:mutex}.

\begin{figure}
\begin{cfacode}[caption={\CFA benchmark code used to measure mutex routines.},label={lst:mutex}]
monitor M {};
void __attribute__((noinline)) call( M & mutex m /*, m2, m3, m4*/ ) {}

int main() {
	M m/*, m2, m3, m4*/;
	BENCH(
		for(size_t i=0; i<n; i++) {
			call(m/*, m2, m3, m4*/);
		},
		result
	)
	printf("%llu\n", result);
}
\end{cfacode}
\end{figure}

\begin{table}
\begin{center}
\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
\cline{2-4}
\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
\hline
C routine						& 2		& 2		& 0    \\
FetchAdd + FetchSub				& 26		& 26		& 0    \\
Pthreads Mutex Lock				& 31		& 31.86	& 0.99 \\
\uC \code{monitor} member routine		& 30		& 30		& 0    \\
\CFA \code{mutex} routine, 1 argument	& 41		& 41.57	& 0.9  \\
\CFA \code{mutex} routine, 2 argument	& 76		& 76.96	& 1.57 \\
\CFA \code{mutex} routine, 4 argument	& 145		& 146.68	& 3.85 \\
Java synchronized routine			& 27		& 28.57	& 2.6  \\
\hline
\end{tabular}
\end{center}
\caption{Mutex routine comparison. All numbers are in nanoseconds (\si{\nano\second}).}
\label{tab:mutex}
\end{table}

\subsection{Internal Scheduling}
The internal-scheduling benchmark measures the cost of waiting on and signalling a condition variable. Listing~\ref{lst:int-sched} shows the code for \CFA, with results in Table~\ref{tab:int-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.

\begin{figure}
\begin{cfacode}[caption={Benchmark code for internal scheduling},label={lst:int-sched}]
volatile int go = 0;
condition c;
monitor M {};
M m1;

void __attribute__((noinline)) do_call( M & mutex a1 ) { signal(c); }

thread T {};
void ^?{}( T & mutex this ) {}
void main( T & this ) {
	while(go == 0) { yield(); }
	while(go == 1) { do_call(m1); }
}
int  __attribute__((noinline)) do_wait( M & mutex a1 ) {
	go = 1;
	BENCH(
		for(size_t i=0; i<n; i++) {
			wait(c);
		},
		result
	)
	printf("%llu\n", result);
	go = 0;
	return 0;
}
int main() {
	T t;
	return do_wait(m1);
}
\end{cfacode}
\end{figure}

\begin{table}
\begin{center}
\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
\cline{2-4}
\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
\hline
Pthreads Condition Variable			& 5902.5	& 6093.29 	& 714.78 \\
\uC \code{signal}					& 322		& 323 	& 3.36   \\
\CFA \code{signal}, 1 \code{monitor}	& 352.5	& 353.11	& 3.66   \\
\CFA \code{signal}, 2 \code{monitor}	& 430		& 430.29	& 8.97   \\
\CFA \code{signal}, 4 \code{monitor}	& 594.5	& 606.57	& 18.33  \\
Java \code{notify}				& 13831.5	& 15698.21	& 4782.3 \\
\hline
\end{tabular}
\end{center}
\caption{Internal scheduling comparison. All numbers are in nanoseconds (\si{\nano\second}).}
\label{tab:int-sched}
\end{table}

\subsection{External Scheduling}
The external-scheduling benchmark measures the cost of the \code{waitfor} statement (\code{_Accept} in \uC). Listing~\ref{lst:ext-sched} shows the code for \CFA, with results in Table~\ref{tab:ext-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.

\begin{figure}
\begin{cfacode}[caption={Benchmark code for external scheduling},label={lst:ext-sched}]
volatile int go = 0;
monitor M {};
M m1;
thread T {};

void __attribute__((noinline)) do_call( M & mutex a1 ) {}

void ^?{}( T & mutex this ) {}
void main( T & this ) {
	while(go == 0) { yield(); }
	while(go == 1) { do_call(m1); }
}
int  __attribute__((noinline)) do_wait( M & mutex a1 ) {
	go = 1;
	BENCH(
		for(size_t i=0; i<n; i++) {
			waitfor(call, a1);
		},
		result
	)
	printf("%llu\n", result);
	go = 0;
	return 0;
}
int main() {
	T t;
	return do_wait(m1);
}
\end{cfacode}
\end{figure}

\begin{table}
\begin{center}
\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
\cline{2-4}
\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
\hline
\uC \code{Accept}					& 350		& 350.61	& 3.11  \\
\CFA \code{waitfor}, 1 \code{monitor}	& 358.5	& 358.36	& 3.82  \\
\CFA \code{waitfor}, 2 \code{monitor}	& 422		& 426.79	& 7.95  \\
\CFA \code{waitfor}, 4 \code{monitor}	& 579.5	& 585.46	& 11.25 \\
\hline
\end{tabular}
\end{center}
\caption{External scheduling comparison. All numbers are in nanoseconds (\si{\nano\second}).}
\label{tab:ext-sched}
\end{table}

\subsection{Object Creation}
Finally, the last benchmark measures the cost of creation for concurrent objects. Listing~\ref{lst:creation} shows the code for \texttt{pthread}s and \CFA threads, with results shown in Table~\ref{tab:creation}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests. The only note here is that the call stacks of \CFA coroutines are created lazily; therefore, without priming the coroutine, the creation cost is very low.

\begin{figure}
\begin{center}
\texttt{pthread}
\begin{ccode}
int main() {
	BENCH(
		for(size_t i=0; i<n; i++) {
			pthread_t thread;
			if(pthread_create(&thread,NULL,foo,NULL)<0) {
				perror( "failure" );
				return 1;
			}

			if(pthread_join(thread, NULL)<0) {
				perror( "failure" );
				return 1;
			}
		},
		result
	)
	printf("%llu\n", result);
}
\end{ccode}



\CFA Threads
\begin{cfacode}
int main() {
	BENCH(
		for(size_t i=0; i<n; i++) {
			MyThread m;
		},
		result
	)
	printf("%llu\n", result);
}
\end{cfacode}
\end{center}
\begin{cfacode}[caption={Benchmark code for \texttt{pthread}s and \CFA to measure object creation},label={lst:creation}]
\end{cfacode}
\end{figure}

\begin{table}
\begin{center}
\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
\cline{2-4}
\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
\hline
Pthreads			& 26996	& 26984.71	& 156.6  \\
\CFA Coroutine Lazy	& 6		& 5.71	& 0.45   \\
\CFA Coroutine Eager	& 708		& 706.68	& 4.82   \\
\CFA Thread			& 1173.5	& 1176.18	& 15.18  \\
\uC Coroutine		& 109		& 107.46	& 1.74   \\
\uC Thread			& 526		& 530.89	& 9.73   \\
Goroutine			& 2520.5	& 2530.93	& 61.56  \\
Java Thread			& 91114.5	& 92272.79	& 961.58 \\
\hline
\end{tabular}
\end{center}
\caption{Creation comparison. All numbers are in nanoseconds (\si{\nano\second}).}
\label{tab:creation}
\end{table}
