source: doc/theses/thierry_delisle_MMath/text/results.tex @ 2185df1

ADTaaron-thesisarm-ehast-experimentalcleanup-dtorsdeferred_resnenumforall-pointer-decayjacob/cs343-translationjenkins-sandboxnew-astnew-ast-unique-exprno_listpersistent-indexerpthread-emulationqualifiedEnum
Last change on this file since 2185df1 was 67982887, checked in by Peter A. Buhr <pabuhr@…>, 6 years ago

specialize thesis directory-names

  • Property mode set to 100644
File size: 11.5 KB
RevLine 
[64b272a]1% ======================================================================
2% ======================================================================
[6090518]3\chapter{Performance Results} \label{results}
[64b272a]4% ======================================================================
5% ======================================================================
[6090518]6\section{Machine Setup}
7Table \ref{tab:machine} shows the characteristics of the machine used to run the benchmarks. All tests were made on this machine.
[cf966b5]8\begin{table}[H]
[64b272a]9\begin{center}
10\begin{tabular}{| l | r | l | r |}
11\hline
12Architecture            & x86\_64                       & NUMA node(s)  & 8 \\
13\hline
14CPU op-mode(s)          & 32-bit, 64-bit                & Model name    & AMD Opteron\texttrademark  Processor 6380 \\
15\hline
16Byte Order                      & Little Endian                 & CPU Freq              & 2.5\si{\giga\hertz} \\
17\hline
18CPU(s)                  & 64                            & L1d cache     & \SI{16}{\kibi\byte} \\
19\hline
20Thread(s) per core      & 2                             & L1i cache     & \SI{64}{\kibi\byte} \\
21\hline
22Core(s) per socket      & 8                             & L2 cache              & \SI{2048}{\kibi\byte} \\
23\hline
24Socket(s)                       & 4                             & L3 cache              & \SI{6144}{\kibi\byte} \\
25\hline
26\hline
[383e159]27Operating system                & Ubuntu 16.04.3 LTS    & Kernel                & Linux 4.4-97-generic \\
[64b272a]28\hline
[383e159]29Compiler                        & GCC 6.3               & Translator    & CFA 1 \\
[64b272a]30\hline
[cf966b5]31Java version            & OpenJDK-9             & Go version    & 1.9.2 \\
32\hline
[64b272a]33\end{tabular}
34\end{center}
35\caption{Machine setup used for the tests}
36\label{tab:machine}
[cf966b5]37\end{table}
[64b272a]38
[6090518]39\section{Micro Benchmarks}
[cae28da]40All benchmarks are run using the same harness to produce the results, seen as the \code{BENCH()} macro in the following examples. This macro uses the following logic to benchmark the code:
[20ffcf3]41\begin{pseudo}
[cae28da]42#define BENCH(run, result) \
43        before = gettime(); \
44        run; \
45        after  = gettime(); \
[20ffcf3]46        result = (after - before) / N;
47\end{pseudo}
[6090518]48The method used to get time is \code{clock_gettime(CLOCK_THREAD_CPUTIME_ID);}. Each benchmark is using many iterations of a simple call to measure the cost of the call. The specific number of iterations depends on the specific benchmark.
[20ffcf3]49
[6090518]50\subsection{Context-Switching}
[cae28da]51The first interesting benchmark is to measure how long context-switches take. The simplest approach to do this is to yield on a thread, which executes a 2-step context switch. Yielding causes the thread to context-switch to the scheduler and back, more precisely: from the \gls{uthread} to the \gls{kthread} then from the \gls{kthread} back to the same \gls{uthread} (or a different one in the general case). In order to make the comparison fair, coroutines also execute a 2-step context-switch by resuming another coroutine which does nothing but suspending in a tight loop, which is a resume/suspend cycle instead of a yield. Listing \ref{lst:ctx-switch} shows the code for coroutines and threads with the results in table \ref{tab:ctx-switch}. All omitted tests are functionally identical to one of these tests. The difference between coroutines and threads can be attributed to the cost of scheduling.
[20ffcf3]52\begin{figure}
53\begin{multicols}{2}
54\CFA Coroutines
55\begin{cfacode}
56coroutine GreatSuspender {};
57void main(GreatSuspender& this) {
58        while(true) { suspend(); }
59}
60int main() {
61        GreatSuspender s;
62        resume(s);
63        BENCH(
64                for(size_t i=0; i<n; i++) {
65                        resume(s);
66                },
67                result
68        )
69        printf("%llu\n", result);
70}
71\end{cfacode}
72\columnbreak
73\CFA Threads
74\begin{cfacode}
75
76
77
78
79int main() {
80
81
82        BENCH(
83                for(size_t i=0; i<n; i++) {
84                        yield();
85                },
86                result
87        )
88        printf("%llu\n", result);
89}
90\end{cfacode}
91\end{multicols}
[cf966b5]92\begin{cfacode}[caption={\CFA benchmark code used to measure context-switches for coroutines and threads.},label={lst:ctx-switch}]
93\end{cfacode}
[20ffcf3]94\end{figure}
[64b272a]95
[cf966b5]96\begin{table}
[64b272a]97\begin{center}
98\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
99\cline{2-4}
100\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
101\hline
[383e159]102Kernel Thread   & 241.5 & 243.86        & 5.08 \\
[cf966b5]103\CFA Coroutine  & 38            & 38            & 0    \\
[383e159]104\CFA Thread             & 103           & 102.96        & 2.96 \\
105\uC Coroutine   & 46            & 45.86 & 0.35 \\
106\uC Thread              & 98            & 99.11 & 1.42 \\
107Goroutine               & 150           & 149.96        & 3.16 \\
108Java Thread             & 289           & 290.68        & 8.72 \\
[64b272a]109\hline
110\end{tabular}
111\end{center}
[07c1e595]112\caption{Context Switch comparison. All numbers are in nanoseconds(\si{\nano\second})}
[64b272a]113\label{tab:ctx-switch}
[cf966b5]114\end{table}
[64b272a]115
[6090518]116\subsection{Mutual-Exclusion}
[cae28da]117The next interesting benchmark is to measure the overhead to enter/leave a critical-section. For monitors, the simplest approach is to measure how long it takes to enter and leave a monitor routine. Listing \ref{lst:mutex} shows the code for \CFA. To put the results in context, the cost of entering a non-inline function and the cost of acquiring and releasing a \code{pthread_mutex} lock is also measured. The results can be shown in table \ref{tab:mutex}.
[20ffcf3]118
119\begin{figure}
[cf966b5]120\begin{cfacode}[caption={\CFA benchmark code used to measure mutex routines.},label={lst:mutex}]
[20ffcf3]121monitor M {};
122void __attribute__((noinline)) call( M & mutex m /*, m2, m3, m4*/ ) {}
123
124int main() {
125        M m/*, m2, m3, m4*/;
126        BENCH(
127                for(size_t i=0; i<n; i++) {
128                        call(m/*, m2, m3, m4*/);
129                },
130                result
131        )
132        printf("%llu\n", result);
133}
134\end{cfacode}
135\end{figure}
136
[cf966b5]137\begin{table}
[64b272a]138\begin{center}
139\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
140\cline{2-4}
141\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
142\hline
[383e159]143C routine                                               & 2             & 2             & 0    \\
144FetchAdd + FetchSub                             & 26            & 26            & 0    \\
145Pthreads Mutex Lock                             & 31            & 31.86 & 0.99 \\
146\uC \code{monitor} member routine               & 30            & 30            & 0    \\
147\CFA \code{mutex} routine, 1 argument   & 41            & 41.57 & 0.9  \\
148\CFA \code{mutex} routine, 2 argument   & 76            & 76.96 & 1.57 \\
149\CFA \code{mutex} routine, 4 argument   & 145           & 146.68        & 3.85 \\
150Java synchronized routine                       & 27            & 28.57 & 2.6  \\
[64b272a]151\hline
152\end{tabular}
153\end{center}
[07c1e595]154\caption{Mutex routine comparison. All numbers are in nanoseconds(\si{\nano\second})}
[64b272a]155\label{tab:mutex}
[cf966b5]156\end{table}
[64b272a]157
[6090518]158\subsection{Internal Scheduling}
[cf966b5]159The internal-scheduling benchmark measures the cost of waiting on and signalling a condition variable. Listing \ref{lst:int-sched} shows the code for \CFA, with results table \ref{tab:int-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.
[20ffcf3]160
161\begin{figure}
[cf966b5]162\begin{cfacode}[caption={Benchmark code for internal scheduling},label={lst:int-sched}]
[20ffcf3]163volatile int go = 0;
164condition c;
165monitor M {};
166M m1;
167
168void __attribute__((noinline)) do_call( M & mutex a1 ) { signal(c); }
169
170thread T {};
171void ^?{}( T & mutex this ) {}
172void main( T & this ) {
173        while(go == 0) { yield(); }
174        while(go == 1) { do_call(m1); }
175}
176int  __attribute__((noinline)) do_wait( M & mutex a1 ) {
177        go = 1;
178        BENCH(
179                for(size_t i=0; i<n; i++) {
180                        wait(c);
181                },
182                result
183        )
184        printf("%llu\n", result);
185        go = 0;
186        return 0;
187}
188int main() {
189        T t;
190        return do_wait(m1);
191}
192\end{cfacode}
193\end{figure}
194
[cf966b5]195\begin{table}
[64b272a]196\begin{center}
197\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
198\cline{2-4}
199\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
200\hline
[5c4f2c2]201Pthreads Condition Variable                     & 5902.5        & 6093.29       & 714.78 \\
[383e159]202\uC \code{signal}                                       & 322           & 323   & 3.36   \\
203\CFA \code{signal}, 1 \code{monitor}    & 352.5 & 353.11        & 3.66   \\
204\CFA \code{signal}, 2 \code{monitor}    & 430           & 430.29        & 8.97   \\
205\CFA \code{signal}, 4 \code{monitor}    & 594.5 & 606.57        & 18.33  \\
206Java \code{notify}                              & 13831.5       & 15698.21      & 4782.3 \\
[64b272a]207\hline
208\end{tabular}
209\end{center}
[07c1e595]210\caption{Internal scheduling comparison. All numbers are in nanoseconds(\si{\nano\second})}
[64b272a]211\label{tab:int-sched}
[cf966b5]212\end{table}
[64b272a]213
[6090518]214\subsection{External Scheduling}
[cf966b5]215The Internal scheduling benchmark measures the cost of the \code{waitfor} statement (\code{_Accept} in \uC). Listing \ref{lst:ext-sched} shows the code for \CFA, with results in table \ref{tab:ext-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.
[20ffcf3]216
217\begin{figure}
[cf966b5]218\begin{cfacode}[caption={Benchmark code for external scheduling},label={lst:ext-sched}]
[20ffcf3]219volatile int go = 0;
220monitor M {};
221M m1;
222thread T {};
223
224void __attribute__((noinline)) do_call( M & mutex a1 ) {}
225
226void ^?{}( T & mutex this ) {}
227void main( T & this ) {
228        while(go == 0) { yield(); }
229        while(go == 1) { do_call(m1); }
230}
231int  __attribute__((noinline)) do_wait( M & mutex a1 ) {
232        go = 1;
233        BENCH(
234                for(size_t i=0; i<n; i++) {
235                        waitfor(call, a1);
236                },
237                result
238        )
239        printf("%llu\n", result);
240        go = 0;
241        return 0;
242}
243int main() {
244        T t;
245        return do_wait(m1);
246}
247\end{cfacode}
248\end{figure}
249
[cf966b5]250\begin{table}
[64b272a]251\begin{center}
252\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
253\cline{2-4}
254\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
255\hline
[383e159]256\uC \code{Accept}                                       & 350           & 350.61        & 3.11  \\
257\CFA \code{waitfor}, 1 \code{monitor}   & 358.5 & 358.36        & 3.82  \\
258\CFA \code{waitfor}, 2 \code{monitor}   & 422           & 426.79        & 7.95  \\
259\CFA \code{waitfor}, 4 \code{monitor}   & 579.5 & 585.46        & 11.25 \\
[64b272a]260\hline
261\end{tabular}
262\end{center}
[07c1e595]263\caption{External scheduling comparison. All numbers are in nanoseconds(\si{\nano\second})}
[64b272a]264\label{tab:ext-sched}
[cf966b5]265\end{table}
[64b272a]266
[6090518]267\subsection{Object Creation}
[cae28da]268Finally, the last benchmark measures the cost of creation for concurrent objects. Listing \ref{lst:creation} shows the code for \texttt{pthread}s and \CFA threads, with results shown in table \ref{tab:creation}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests. The only note here is that the call stacks of \CFA coroutines are lazily created, therefore without priming the coroutine, the creation cost is very low.
[20ffcf3]269
270\begin{figure}
[cf966b5]271\begin{center}
[cae28da]272\texttt{pthread}
[cf966b5]273\begin{ccode}
[20ffcf3]274int main() {
275        BENCH(
276                for(size_t i=0; i<n; i++) {
277                        pthread_t thread;
[cf966b5]278                        if(pthread_create(&thread,NULL,foo,NULL)<0) {
[20ffcf3]279                                perror( "failure" );
280                                return 1;
281                        }
282
[cf966b5]283                        if(pthread_join(thread, NULL)<0) {
[20ffcf3]284                                perror( "failure" );
285                                return 1;
286                        }
287                },
288                result
289        )
290        printf("%llu\n", result);
291}
[cf966b5]292\end{ccode}
293
294
295
[20ffcf3]296\CFA Threads
297\begin{cfacode}
298int main() {
299        BENCH(
300                for(size_t i=0; i<n; i++) {
301                        MyThread m;
302                },
303                result
304        )
305        printf("%llu\n", result);
306}
307\end{cfacode}
[cf966b5]308\end{center}
[cae28da]309\begin{cfacode}[caption={Benchmark code for \texttt{pthread}s and \CFA to measure object creation},label={lst:creation}]
[cf966b5]310\end{cfacode}
[20ffcf3]311\end{figure}
312
[cf966b5]313\begin{table}
[64b272a]314\begin{center}
315\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
316\cline{2-4}
317\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
318\hline
[383e159]319Pthreads                        & 26996 & 26984.71      & 156.6  \\
320\CFA Coroutine Lazy     & 6             & 5.71  & 0.45   \\
321\CFA Coroutine Eager    & 708           & 706.68        & 4.82   \\
322\CFA Thread                     & 1173.5        & 1176.18       & 15.18  \\
323\uC Coroutine           & 109           & 107.46        & 1.74   \\
324\uC Thread                      & 526           & 530.89        & 9.73   \\
325Goroutine                       & 2520.5        & 2530.93       & 61,56  \\
326Java Thread                     & 91114.5       & 92272.79      & 961.58 \\
[64b272a]327\hline
328\end{tabular}
329\end{center}
[6090518]330\caption{Creation comparison. All numbers are in nanoseconds(\si{\nano\second}).}
[64b272a]331\label{tab:creation}
[cf966b5]332\end{table}
Note: See TracBrowser for help on using the repository browser.