source: doc/proposals/concurrency/text/results.tex@ fe4840a

ADT aaron-thesis arm-eh ast-experimental cleanup-dtors deferred_resn demangler enum forall-pointer-decay jacob/cs343-translation jenkins-sandbox new-ast new-ast-unique-expr new-env no_list persistent-indexer pthread-emulation qualifiedEnum resolv-new with_gc
Last change on this file since fe4840a was 6090518, checked in by Thierry Delisle <tdelisle@…>, 8 years ago

Ran Antidoe 9 spell checker on my thesis

  • Property mode set to 100644
File size: 11.0 KB
Line 
1% ======================================================================
2% ======================================================================
3\chapter{Performance Results} \label{results}
4% ======================================================================
5% ======================================================================
6\section{Machine Setup}
7Table \ref{tab:machine} shows the characteristics of the machine used to run the benchmarks. All tests were made on this machine.
8\begin{table}[H]
9\begin{center}
10\begin{tabular}{| l | r | l | r |}
11\hline
12Architecture & x86\_64 & NUMA node(s) & 8 \\
13\hline
14CPU op-mode(s) & 32-bit, 64-bit & Model name & AMD Opteron\texttrademark Processor 6380 \\
15\hline
16Byte Order & Little Endian & CPU Freq & 2.5\si{\giga\hertz} \\
17\hline
18CPU(s) & 64 & L1d cache & \SI{16}{\kibi\byte} \\
19\hline
20Thread(s) per core & 2 & L1i cache & \SI{64}{\kibi\byte} \\
21\hline
22Core(s) per socket & 8 & L2 cache & \SI{2048}{\kibi\byte} \\
23\hline
24Socket(s) & 4 & L3 cache & \SI{6144}{\kibi\byte} \\
25\hline
26\hline
27Operating system & Ubuntu 16.04.3 LTS & Kernel & Linux 4.4-97-generic \\
28\hline
29Compiler & GCC 6.3 & Translator & CFA 1 \\
30\hline
31Java version & OpenJDK-9 & Go version & 1.9.2 \\
32\hline
33\end{tabular}
34\end{center}
35\caption{Machine setup used for the tests}
36\label{tab:machine}
37\end{table}
38
39\section{Micro Benchmarks}
40All benchmarks are run using the same harness to produce the results, seen as the \code{BENCH()} macro in the following examples. This macro uses the following logic to benchmark the code :
41\begin{pseudo}
42#define BENCH(run, result)
43 before = gettime();
44 run;
45 after = gettime();
46 result = (after - before) / N;
47\end{pseudo}
48The method used to get time is \code{clock_gettime(CLOCK_THREAD_CPUTIME_ID);}. Each benchmark is using many iterations of a simple call to measure the cost of the call. The specific number of iterations depends on the specific benchmark.
49
50\subsection{Context-Switching}
51The first interesting benchmark is to measure how long context-switches take. The simplest approach to do this is to yield on a thread, which executes a 2-step context switch. In order to make the comparison fair, coroutines also execute a 2-step context-switch (\gls{uthread} to \gls{kthread} then \gls{kthread} to \gls{uthread}), which is a resume/suspend cycle instead of a yield. Listing \ref{lst:ctx-switch} shows the code for coroutines and threads with the results in table \ref{tab:ctx-switch}. All omitted tests are functionally identical to one of these tests.
52\begin{figure}
53\begin{multicols}{2}
54\CFA Coroutines
55\begin{cfacode}
56coroutine GreatSuspender {};
57void main(GreatSuspender& this) {
58 while(true) { suspend(); }
59}
60int main() {
61 GreatSuspender s;
62 resume(s);
63 BENCH(
64 for(size_t i=0; i<n; i++) {
65 resume(s);
66 },
67 result
68 )
69 printf("%llu\n", result);
70}
71\end{cfacode}
72\columnbreak
73\CFA Threads
74\begin{cfacode}
75
76
77
78
79int main() {
80
81
82 BENCH(
83 for(size_t i=0; i<n; i++) {
84 yield();
85 },
86 result
87 )
88 printf("%llu\n", result);
89}
90\end{cfacode}
91\end{multicols}
92\begin{cfacode}[caption={\CFA benchmark code used to measure context-switches for coroutines and threads.},label={lst:ctx-switch}]
93\end{cfacode}
94\end{figure}
95
96\begin{table}
97\begin{center}
98\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
99\cline{2-4}
100\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
101\hline
102Kernel Thread & 241.5 & 243.86 & 5.08 \\
103\CFA Coroutine & 38 & 38 & 0 \\
104\CFA Thread & 103 & 102.96 & 2.96 \\
105\uC Coroutine & 46 & 45.86 & 0.35 \\
106\uC Thread & 98 & 99.11 & 1.42 \\
107Goroutine & 150 & 149.96 & 3.16 \\
108Java Thread & 289 & 290.68 & 8.72 \\
109\hline
110\end{tabular}
111\end{center}
112\caption{Context Switch comparison. All numbers are in nanoseconds(\si{\nano\second})}
113\label{tab:ctx-switch}
114\end{table}
115
116\subsection{Mutual-Exclusion}
117The next interesting benchmark is to measure the overhead to enter/leave a critical-section. For monitors, the simplest approach is to measure how long it takes to enter and leave a monitor routine. Listing \ref{lst:mutex} shows the code for \CFA. To put the results in context, the cost of entering a non-inline function and the cost of acquiring and releasing a pthread mutex lock is also measured. The results can be shown in table \ref{tab:mutex}.
118
119\begin{figure}
120\begin{cfacode}[caption={\CFA benchmark code used to measure mutex routines.},label={lst:mutex}]
121monitor M {};
122void __attribute__((noinline)) call( M & mutex m /*, m2, m3, m4*/ ) {}
123
124int main() {
125 M m/*, m2, m3, m4*/;
126 BENCH(
127 for(size_t i=0; i<n; i++) {
128 call(m/*, m2, m3, m4*/);
129 },
130 result
131 )
132 printf("%llu\n", result);
133}
134\end{cfacode}
135\end{figure}
136
137\begin{table}
138\begin{center}
139\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
140\cline{2-4}
141\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
142\hline
143C routine & 2 & 2 & 0 \\
144FetchAdd + FetchSub & 26 & 26 & 0 \\
145Pthreads Mutex Lock & 31 & 31.86 & 0.99 \\
146\uC \code{monitor} member routine & 30 & 30 & 0 \\
147\CFA \code{mutex} routine, 1 argument & 41 & 41.57 & 0.9 \\
148\CFA \code{mutex} routine, 2 argument & 76 & 76.96 & 1.57 \\
149\CFA \code{mutex} routine, 4 argument & 145 & 146.68 & 3.85 \\
150Java synchronized routine & 27 & 28.57 & 2.6 \\
151\hline
152\end{tabular}
153\end{center}
154\caption{Mutex routine comparison. All numbers are in nanoseconds(\si{\nano\second})}
155\label{tab:mutex}
156\end{table}
157
158\subsection{Internal Scheduling}
159The internal-scheduling benchmark measures the cost of waiting on and signalling a condition variable. Listing \ref{lst:int-sched} shows the code for \CFA, with results table \ref{tab:int-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.
160
161\begin{figure}
162\begin{cfacode}[caption={Benchmark code for internal scheduling},label={lst:int-sched}]
163volatile int go = 0;
164condition c;
165monitor M {};
166M m1;
167
168void __attribute__((noinline)) do_call( M & mutex a1 ) { signal(c); }
169
170thread T {};
171void ^?{}( T & mutex this ) {}
172void main( T & this ) {
173 while(go == 0) { yield(); }
174 while(go == 1) { do_call(m1); }
175}
176int __attribute__((noinline)) do_wait( M & mutex a1 ) {
177 go = 1;
178 BENCH(
179 for(size_t i=0; i<n; i++) {
180 wait(c);
181 },
182 result
183 )
184 printf("%llu\n", result);
185 go = 0;
186 return 0;
187}
188int main() {
189 T t;
190 return do_wait(m1);
191}
192\end{cfacode}
193\end{figure}
194
195\begin{table}
196\begin{center}
197\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
198\cline{2-4}
199\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
200\hline
201\uC \code{signal} & 322 & 323 & 3.36 \\
202\CFA \code{signal}, 1 \code{monitor} & 352.5 & 353.11 & 3.66 \\
203\CFA \code{signal}, 2 \code{monitor} & 430 & 430.29 & 8.97 \\
204\CFA \code{signal}, 4 \code{monitor} & 594.5 & 606.57 & 18.33 \\
205Java \code{notify} & 13831.5 & 15698.21 & 4782.3 \\
206\hline
207\end{tabular}
208\end{center}
209\caption{Internal scheduling comparison. All numbers are in nanoseconds(\si{\nano\second})}
210\label{tab:int-sched}
211\end{table}
212
213\subsection{External Scheduling}
214The Internal scheduling benchmark measures the cost of the \code{waitfor} statement (\code{_Accept} in \uC). Listing \ref{lst:ext-sched} shows the code for \CFA, with results in table \ref{tab:ext-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.
215
216\begin{figure}
217\begin{cfacode}[caption={Benchmark code for external scheduling},label={lst:ext-sched}]
218volatile int go = 0;
219monitor M {};
220M m1;
221thread T {};
222
223void __attribute__((noinline)) do_call( M & mutex a1 ) {}
224
225void ^?{}( T & mutex this ) {}
226void main( T & this ) {
227 while(go == 0) { yield(); }
228 while(go == 1) { do_call(m1); }
229}
230int __attribute__((noinline)) do_wait( M & mutex a1 ) {
231 go = 1;
232 BENCH(
233 for(size_t i=0; i<n; i++) {
234 waitfor(call, a1);
235 },
236 result
237 )
238 printf("%llu\n", result);
239 go = 0;
240 return 0;
241}
242int main() {
243 T t;
244 return do_wait(m1);
245}
246\end{cfacode}
247\end{figure}
248
249\begin{table}
250\begin{center}
251\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
252\cline{2-4}
253\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
254\hline
255\uC \code{Accept} & 350 & 350.61 & 3.11 \\
256\CFA \code{waitfor}, 1 \code{monitor} & 358.5 & 358.36 & 3.82 \\
257\CFA \code{waitfor}, 2 \code{monitor} & 422 & 426.79 & 7.95 \\
258\CFA \code{waitfor}, 4 \code{monitor} & 579.5 & 585.46 & 11.25 \\
259\hline
260\end{tabular}
261\end{center}
262\caption{External scheduling comparison. All numbers are in nanoseconds(\si{\nano\second})}
263\label{tab:ext-sched}
264\end{table}
265
266\subsection{Object Creation}
267Finally, the last benchmark measures the cost of creation for concurrent objects. Listing \ref{lst:creation} shows the code for pthreads and \CFA threads, with results shown in table \ref{tab:creation}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests. The only note here is that the call stacks of \CFA coroutines are lazily created, therefore without priming the coroutine, the creation cost is very low.
268
269\begin{figure}
270\begin{center}
271pthread
272\begin{ccode}
273int main() {
274 BENCH(
275 for(size_t i=0; i<n; i++) {
276 pthread_t thread;
277 if(pthread_create(&thread,NULL,foo,NULL)<0) {
278 perror( "failure" );
279 return 1;
280 }
281
282 if(pthread_join(thread, NULL)<0) {
283 perror( "failure" );
284 return 1;
285 }
286 },
287 result
288 )
289 printf("%llu\n", result);
290}
291\end{ccode}
292
293
294
295\CFA Threads
296\begin{cfacode}
297int main() {
298 BENCH(
299 for(size_t i=0; i<n; i++) {
300 MyThread m;
301 },
302 result
303 )
304 printf("%llu\n", result);
305}
306\end{cfacode}
307\end{center}
308\begin{cfacode}[caption={Benchmark code for pthreads and \CFA to measure object creation},label={lst:creation}]
309\end{cfacode}
310\end{figure}
311
312\begin{table}
313\begin{center}
314\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
315\cline{2-4}
316\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
317\hline
318Pthreads & 26996 & 26984.71 & 156.6 \\
319\CFA Coroutine Lazy & 6 & 5.71 & 0.45 \\
320\CFA Coroutine Eager & 708 & 706.68 & 4.82 \\
321\CFA Thread & 1173.5 & 1176.18 & 15.18 \\
322\uC Coroutine & 109 & 107.46 & 1.74 \\
323\uC Thread & 526 & 530.89 & 9.73 \\
324Goroutine & 2520.5 & 2530.93 & 61,56 \\
325Java Thread & 91114.5 & 92272.79 & 961.58 \\
326\hline
327\end{tabular}
328\end{center}
329\caption{Creation comparison. All numbers are in nanoseconds(\si{\nano\second}).}
330\label{tab:creation}
331\end{table}
Note: See TracBrowser for help on using the repository browser.