source: doc/proposals/concurrency/text/results.tex @ 9d48a17

ADTaaron-thesisarm-ehast-experimentalcleanup-dtorsdeferred_resndemanglerenumforall-pointer-decayjacob/cs343-translationjenkins-sandboxnew-astnew-ast-unique-exprnew-envno_listpersistent-indexerpthread-emulationqualifiedEnumresolv-newwith_gc
Last change on this file since 9d48a17 was 383e159, checked in by Thierry Delisle <tdelisle@…>, 6 years ago

Updated thesis results

  • Property mode set to 100644
File size: 11.0 KB
Line 
1% ======================================================================
2% ======================================================================
3\chapter{Performance results} \label{results}
4% ======================================================================
5% ======================================================================
6\section{Machine setup}
7Table \ref{tab:machine} shows the characteristics of the machine used to run the benchmarks. All tests where made on this machine.
8\begin{table}[H]
9\begin{center}
10\begin{tabular}{| l | r | l | r |}
11\hline
12Architecture            & x86\_64                       & NUMA node(s)  & 8 \\
13\hline
14CPU op-mode(s)          & 32-bit, 64-bit                & Model name    & AMD Opteron\texttrademark  Processor 6380 \\
15\hline
16Byte Order                      & Little Endian                 & CPU Freq              & 2.5\si{\giga\hertz} \\
17\hline
18CPU(s)                  & 64                            & L1d cache     & \SI{16}{\kibi\byte} \\
19\hline
20Thread(s) per core      & 2                             & L1i cache     & \SI{64}{\kibi\byte} \\
21\hline
22Core(s) per socket      & 8                             & L2 cache              & \SI{2048}{\kibi\byte} \\
23\hline
24Socket(s)                       & 4                             & L3 cache              & \SI{6144}{\kibi\byte} \\
25\hline
26\hline
27Operating system                & Ubuntu 16.04.3 LTS    & Kernel                & Linux 4.4-97-generic \\
28\hline
29Compiler                        & GCC 6.3               & Translator    & CFA 1 \\
30\hline
31Java version            & OpenJDK-9             & Go version    & 1.9.2 \\
32\hline
33\end{tabular}
34\end{center}
35\caption{Machine setup used for the tests}
36\label{tab:machine}
37\end{table}
38
39\section{Micro benchmarks}
40All benchmarks are run using the same harness to produce the results, seen as the \code{BENCH()} macro in the following examples. This macro uses the following logic to benchmark the code :
41\begin{pseudo}
42#define BENCH(run, result)
43        before = gettime();
44        run;
45        after  = gettime();
46        result = (after - before) / N;
47\end{pseudo}
48The method used to get time is \code{clock_gettime(CLOCK_THREAD_CPUTIME_ID);}. Each benchmark is using many iterations of a simple call to measure the cost of the call. The specific number of iteration depends on the specific benchmark.
49
50\subsection{Context-switching}
51The first interesting benchmark is to measure how long context-switches take. The simplest approach to do this is to yield on a thread, which executes a 2-step context switch. In order to make the comparison fair, coroutines also execute a 2-step context-switch (\gls{uthread} to \gls{kthread} then \gls{kthread} to \gls{uthread}), which is a resume/suspend cycle instead of a yield. Listing \ref{lst:ctx-switch} shows the code for coroutines and threads whith the results in table \ref{tab:ctx-switch}. All omitted tests are functionally identical to one of these tests.
52\begin{figure}
53\begin{multicols}{2}
54\CFA Coroutines
55\begin{cfacode}
56coroutine GreatSuspender {};
57void main(GreatSuspender& this) {
58        while(true) { suspend(); }
59}
60int main() {
61        GreatSuspender s;
62        resume(s);
63        BENCH(
64                for(size_t i=0; i<n; i++) {
65                        resume(s);
66                },
67                result
68        )
69        printf("%llu\n", result);
70}
71\end{cfacode}
72\columnbreak
73\CFA Threads
74\begin{cfacode}
75
76
77
78
79int main() {
80
81
82        BENCH(
83                for(size_t i=0; i<n; i++) {
84                        yield();
85                },
86                result
87        )
88        printf("%llu\n", result);
89}
90\end{cfacode}
91\end{multicols}
92\begin{cfacode}[caption={\CFA benchmark code used to measure context-switches for coroutines and threads.},label={lst:ctx-switch}]
93\end{cfacode}
94\end{figure}
95
96\begin{table}
97\begin{center}
98\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
99\cline{2-4}
100\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
101\hline
102Kernel Thread   & 241.5 & 243.86        & 5.08 \\
103\CFA Coroutine  & 38            & 38            & 0    \\
104\CFA Thread             & 103           & 102.96        & 2.96 \\
105\uC Coroutine   & 46            & 45.86 & 0.35 \\
106\uC Thread              & 98            & 99.11 & 1.42 \\
107Goroutine               & 150           & 149.96        & 3.16 \\
108Java Thread             & 289           & 290.68        & 8.72 \\
109\hline
110\end{tabular}
111\end{center}
112\caption{Context Switch comparison. All numbers are in nanoseconds(\si{\nano\second})}
113\label{tab:ctx-switch}
114\end{table}
115
116\subsection{Mutual-exclusion}
117The next interesting benchmark is to measure the overhead to enter/leave a critical-section. For monitors, the simplest approach is to measure how long it takes to enter and leave a monitor routine. Listing \ref{lst:mutex} shows the code for \CFA. To put the results in context, the cost of entering a non-inline function and the cost of acquiring and releasing a pthread mutex lock are also measured. The results can be shown in table \ref{tab:mutex}.
118
119\begin{figure}
120\begin{cfacode}[caption={\CFA benchmark code used to measure mutex routines.},label={lst:mutex}]
121monitor M {};
122void __attribute__((noinline)) call( M & mutex m /*, m2, m3, m4*/ ) {}
123
124int main() {
125        M m/*, m2, m3, m4*/;
126        BENCH(
127                for(size_t i=0; i<n; i++) {
128                        call(m/*, m2, m3, m4*/);
129                },
130                result
131        )
132        printf("%llu\n", result);
133}
134\end{cfacode}
135\end{figure}
136
137\begin{table}
138\begin{center}
139\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
140\cline{2-4}
141\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
142\hline
143C routine                                               & 2             & 2             & 0    \\
144FetchAdd + FetchSub                             & 26            & 26            & 0    \\
145Pthreads Mutex Lock                             & 31            & 31.86 & 0.99 \\
146\uC \code{monitor} member routine               & 30            & 30            & 0    \\
147\CFA \code{mutex} routine, 1 argument   & 41            & 41.57 & 0.9  \\
148\CFA \code{mutex} routine, 2 argument   & 76            & 76.96 & 1.57 \\
149\CFA \code{mutex} routine, 4 argument   & 145           & 146.68        & 3.85 \\
150Java synchronized routine                       & 27            & 28.57 & 2.6  \\
151\hline
152\end{tabular}
153\end{center}
154\caption{Mutex routine comparison. All numbers are in nanoseconds(\si{\nano\second})}
155\label{tab:mutex}
156\end{table}
157
158\subsection{Internal scheduling}
159The internal-scheduling benchmark measures the cost of waiting on and signalling a condition variable. Listing \ref{lst:int-sched} shows the code for \CFA, with results table \ref{tab:int-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.
160
161\begin{figure}
162\begin{cfacode}[caption={Benchmark code for internal scheduling},label={lst:int-sched}]
163volatile int go = 0;
164condition c;
165monitor M {};
166M m1;
167
168void __attribute__((noinline)) do_call( M & mutex a1 ) { signal(c); }
169
170thread T {};
171void ^?{}( T & mutex this ) {}
172void main( T & this ) {
173        while(go == 0) { yield(); }
174        while(go == 1) { do_call(m1); }
175}
176int  __attribute__((noinline)) do_wait( M & mutex a1 ) {
177        go = 1;
178        BENCH(
179                for(size_t i=0; i<n; i++) {
180                        wait(c);
181                },
182                result
183        )
184        printf("%llu\n", result);
185        go = 0;
186        return 0;
187}
188int main() {
189        T t;
190        return do_wait(m1);
191}
192\end{cfacode}
193\end{figure}
194
195\begin{table}
196\begin{center}
197\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
198\cline{2-4}
199\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
200\hline
201\uC \code{signal}                                       & 322           & 323   & 3.36   \\
202\CFA \code{signal}, 1 \code{monitor}    & 352.5 & 353.11        & 3.66   \\
203\CFA \code{signal}, 2 \code{monitor}    & 430           & 430.29        & 8.97   \\
204\CFA \code{signal}, 4 \code{monitor}    & 594.5 & 606.57        & 18.33  \\
205Java \code{notify}                              & 13831.5       & 15698.21      & 4782.3 \\
206\hline
207\end{tabular}
208\end{center}
209\caption{Internal scheduling comparison. All numbers are in nanoseconds(\si{\nano\second})}
210\label{tab:int-sched}
211\end{table}
212
213\subsection{External scheduling}
214The Internal scheduling benchmark measures the cost of the \code{waitfor} statement (\code{_Accept} in \uC). Listing \ref{lst:ext-sched} shows the code for \CFA, with results in table \ref{tab:ext-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.
215
216\begin{figure}
217\begin{cfacode}[caption={Benchmark code for external scheduling},label={lst:ext-sched}]
218volatile int go = 0;
219monitor M {};
220M m1;
221thread T {};
222
223void __attribute__((noinline)) do_call( M & mutex a1 ) {}
224
225void ^?{}( T & mutex this ) {}
226void main( T & this ) {
227        while(go == 0) { yield(); }
228        while(go == 1) { do_call(m1); }
229}
230int  __attribute__((noinline)) do_wait( M & mutex a1 ) {
231        go = 1;
232        BENCH(
233                for(size_t i=0; i<n; i++) {
234                        waitfor(call, a1);
235                },
236                result
237        )
238        printf("%llu\n", result);
239        go = 0;
240        return 0;
241}
242int main() {
243        T t;
244        return do_wait(m1);
245}
246\end{cfacode}
247\end{figure}
248
249\begin{table}
250\begin{center}
251\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
252\cline{2-4}
253\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
254\hline
255\uC \code{Accept}                                       & 350           & 350.61        & 3.11  \\
256\CFA \code{waitfor}, 1 \code{monitor}   & 358.5 & 358.36        & 3.82  \\
257\CFA \code{waitfor}, 2 \code{monitor}   & 422           & 426.79        & 7.95  \\
258\CFA \code{waitfor}, 4 \code{monitor}   & 579.5 & 585.46        & 11.25 \\
259\hline
260\end{tabular}
261\end{center}
262\caption{External scheduling comparison. All numbers are in nanoseconds(\si{\nano\second})}
263\label{tab:ext-sched}
264\end{table}
265
266\subsection{Object creation}
267Finally, the last benchmark measurs the cost of creation for concurrent objects. Listing \ref{lst:creation} shows the code for pthreads and \CFA threads, with results shown in table \ref{tab:creation}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests. The only note here is that the call-stacks of \CFA coroutines are lazily created, therefore without priming the coroutine, the creation cost is very low.
268
269\begin{figure}
270\begin{center}
271pthread
272\begin{ccode}
273int main() {
274        BENCH(
275                for(size_t i=0; i<n; i++) {
276                        pthread_t thread;
277                        if(pthread_create(&thread,NULL,foo,NULL)<0) {
278                                perror( "failure" );
279                                return 1;
280                        }
281
282                        if(pthread_join(thread, NULL)<0) {
283                                perror( "failure" );
284                                return 1;
285                        }
286                },
287                result
288        )
289        printf("%llu\n", result);
290}
291\end{ccode}
292
293
294
295\CFA Threads
296\begin{cfacode}
297int main() {
298        BENCH(
299                for(size_t i=0; i<n; i++) {
300                        MyThread m;
301                },
302                result
303        )
304        printf("%llu\n", result);
305}
306\end{cfacode}
307\end{center}
308\begin{cfacode}[caption={Benchmark code for pthreads and \CFA to measure object creation},label={lst:creation}]
309\end{cfacode}
310\end{figure}
311
312\begin{table}
313\begin{center}
314\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
315\cline{2-4}
316\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
317\hline
318Pthreads                        & 26996 & 26984.71      & 156.6  \\
319\CFA Coroutine Lazy     & 6             & 5.71  & 0.45   \\
320\CFA Coroutine Eager    & 708           & 706.68        & 4.82   \\
321\CFA Thread                     & 1173.5        & 1176.18       & 15.18  \\
322\uC Coroutine           & 109           & 107.46        & 1.74   \\
323\uC Thread                      & 526           & 530.89        & 9.73   \\
324Goroutine                       & 2520.5        & 2530.93       & 61,56  \\
325Java Thread                     & 91114.5       & 92272.79      & 961.58 \\
326\hline
327\end{tabular}
328\end{center}
329\caption{Creation comparison. All numbers are in nanoseconds(\si{\nano\second})}
330\label{tab:creation}
331\end{table}
Note: See TracBrowser for help on using the repository browser.