source: doc/proposals/concurrency/text/results.tex @ 07c1e595

ADTaaron-thesisarm-ehast-experimentalcleanup-dtorsdeferred_resndemanglerenumforall-pointer-decayjacob/cs343-translationjenkins-sandboxnew-astnew-ast-unique-exprnew-envno_listpersistent-indexerpthread-emulationqualifiedEnumresolv-newwith_gc
Last change on this file since 07c1e595 was 07c1e595, checked in by Thierry Delisle <tdelisle@…>, 6 years ago

Ran ispell on the thesis

  • Property mode set to 100644
File size: 10.7 KB
RevLine 
[64b272a]1% ======================================================================
2% ======================================================================
[20ffcf3]3\chapter{Performance results} \label{results}
[64b272a]4% ======================================================================
5% ======================================================================
6\section{Machine setup}
[07c1e595]7Table \ref{tab:machine} shows the characteristics of the machine used to run the benchmarks. All tests where made on this machine.
[20ffcf3]8\begin{figure}[H]
[64b272a]9\begin{center}
10\begin{tabular}{| l | r | l | r |}
11\hline
12Architecture            & x86\_64                       & NUMA node(s)  & 8 \\
13\hline
14CPU op-mode(s)          & 32-bit, 64-bit                & Model name    & AMD Opteron\texttrademark  Processor 6380 \\
15\hline
16Byte Order                      & Little Endian                 & CPU Freq              & 2.5\si{\giga\hertz} \\
17\hline
18CPU(s)                  & 64                            & L1d cache     & \SI{16}{\kibi\byte} \\
19\hline
20Thread(s) per core      & 2                             & L1i cache     & \SI{64}{\kibi\byte} \\
21\hline
22Core(s) per socket      & 8                             & L2 cache              & \SI{2048}{\kibi\byte} \\
23\hline
24Socket(s)                       & 4                             & L3 cache              & \SI{6144}{\kibi\byte} \\
25\hline
26\hline
27Operating system                & Ubuntu 16.04.3 LTS    & Kernel                & Linux 4.4.0-97-generic \\
28\hline
[07c1e595]29Compiler                        & GCC 6.3.0             & Translator    & CFA 1.0.0 \\
[64b272a]30\hline
31\end{tabular}
32\end{center}
33\caption{Machine setup used for the tests}
34\label{tab:machine}
35\end{figure}
36
37\section{Micro benchmarks}
[20ffcf3]38All benchmarks are run using the same harness to produce the results, seen as the \code{BENCH()} macro in the following examples. This macro uses the following logic to benchmark the code :
39\begin{pseudo}
40#define BENCH(run, result)
41        gettime();
42        run;
43        gettime();
44        result = (after - before) / N;
45\end{pseudo}
[07c1e595]46The method used to get time is \code{clock_gettime(CLOCK_THREAD_CPUTIME_ID);}. Each benchmark is using many iterations of a simple call to measure the cost of the call. The specific number of iteration depends on the specific benchmark.
[20ffcf3]47
48\subsection{Context-switching}
49The first interesting benchmark is to measure how long context-switches take. The simplest approach to do this is to yield on a thread, which executes a 2-step context switch. In order to make the comparison fair, coroutines also execute a 2-step context-switch, which is a resume/suspend cycle instead of a yield. Listing \ref{lst:ctx-switch} shows the code for coroutines and threads. All omitted tests are functionally identical to one of these tests. The results can be shown in table \ref{tab:ctx-switch}.
50\begin{figure}
51\begin{multicols}{2}
52\CFA Coroutines
53\begin{cfacode}
54coroutine GreatSuspender {};
55void main(GreatSuspender& this) {
56        while(true) { suspend(); }
57}
58int main() {
59        GreatSuspender s;
60        resume(s);
61        BENCH(
62                for(size_t i=0; i<n; i++) {
63                        resume(s);
64                },
65                result
66        )
67        printf("%llu\n", result);
68}
69\end{cfacode}
70\columnbreak
71\CFA Threads
72\begin{cfacode}
73
74
75
76
77int main() {
78
79
80        BENCH(
81                for(size_t i=0; i<n; i++) {
82                        yield();
83                },
84                result
85        )
86        printf("%llu\n", result);
87}
88\end{cfacode}
89\end{multicols}
90\caption{\CFA benchmark code used to measure context-switches for coroutines and threads.}
91\label{lst:ctx-switch}
92\end{figure}
[64b272a]93
94\begin{figure}
95\begin{center}
96\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
97\cline{2-4}
98\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
99\hline
100Kernel Threads          & 239           & 242.57        & 5.54 \\
101\CFA Coroutines         & 38            & 38            & 0    \\
102\CFA Threads            & 102           & 102.39        & 1.57 \\
[b9d0fb6]103\uC Coroutines          & 46            & 46.68 & 0.47 \\
104\uC Threads                     & 98            & 99.39 & 1.52 \\
[64b272a]105\hline
106\end{tabular}
107\end{center}
[07c1e595]108\caption{Context Switch comparison. All numbers are in nanoseconds(\si{\nano\second})}
[64b272a]109\label{tab:ctx-switch}
110\end{figure}
111
[20ffcf3]112\subsection{Mutual-exclusion}
[07c1e595]113The next interesting benchmark is to measure the overhead to enter/leave a critical-section. For monitors, the simplest approach is to measure how long it takes enter and leave a monitor routine. Listing \ref{lst:mutex} shows the code for \CFA. To put the results in context, the cost of entering a non-inline function and the cost of acquiring and releasing a pthread mutex lock are also measured. The results can be shown in table \ref{tab:mutex}.
[20ffcf3]114
115\begin{figure}
116\begin{cfacode}
117monitor M {};
118void __attribute__((noinline)) call( M & mutex m /*, m2, m3, m4*/ ) {}
119
120int main() {
121        M m/*, m2, m3, m4*/;
122        BENCH(
123                for(size_t i=0; i<n; i++) {
124                        call(m/*, m2, m3, m4*/);
125                },
126                result
127        )
128        printf("%llu\n", result);
129}
130\end{cfacode}
131\caption{\CFA benchmark code used to measure mutex routines.}
132\label{lst:mutex}
133\end{figure}
134
[64b272a]135\begin{figure}
136\begin{center}
137\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
138\cline{2-4}
139\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
140\hline
141C routine                                               & 2             & 2             & 0      \\
142Pthreads Mutex Lock                             & 31            & 31.86 & 0.99   \\
[b9d0fb6]143\uC \code{monitor} member routine               & 30            & 30            & 0      \\
[64b272a]144\CFA \code{mutex} routine, 1 argument   & 46            & 46.14 & 0.74   \\
145\CFA \code{mutex} routine, 2 argument   & 82            & 83            & 1.93   \\
146\CFA \code{mutex} routine, 4 argument   & 165           & 161.15        & 54.04  \\
147\hline
148\end{tabular}
149\end{center}
[07c1e595]150\caption{Mutex routine comparison. All numbers are in nanoseconds(\si{\nano\second})}
[64b272a]151\label{tab:mutex}
152\end{figure}
153
[20ffcf3]154\subsection{Internal scheduling}
[07c1e595]155The Internal scheduling benchmark measures the cost of waiting on and signalling a condition variable. Listing \ref{lst:int-sched} shows the code for \CFA. The results can be shown in table \ref{tab:int-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.
[20ffcf3]156
157\begin{figure}
158\begin{cfacode}
159volatile int go = 0;
160condition c;
161monitor M {};
162M m1;
163
164void __attribute__((noinline)) do_call( M & mutex a1 ) { signal(c); }
165
166thread T {};
167void ^?{}( T & mutex this ) {}
168void main( T & this ) {
169        while(go == 0) { yield(); }
170        while(go == 1) { do_call(m1); }
171}
172int  __attribute__((noinline)) do_wait( M & mutex a1 ) {
173        go = 1;
174        BENCH(
175                for(size_t i=0; i<n; i++) {
176                        wait(c);
177                },
178                result
179        )
180        printf("%llu\n", result);
181        go = 0;
182        return 0;
183}
184int main() {
185        T t;
186        return do_wait(m1);
187}
188\end{cfacode}
189\caption{Benchmark code for internal scheduling}
190\label{lst:int-sched}
191\end{figure}
192
[64b272a]193\begin{figure}
194\begin{center}
195\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
196\cline{2-4}
197\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
198\hline
[b9d0fb6]199\uC \code{signal}                                       & 322           & 322.57        & 2.77  \\
[64b272a]200\CFA \code{signal}, 1 \code{monitor}    & 1145  & 1163.64       & 27.52 \\
201\CFA \code{signal}, 2 \code{monitor}    & 1531  & 1550.75       & 32.77 \\
202\CFA \code{signal}, 4 \code{monitor}    & 2288.5        & 2326.86       & 54.73 \\
203\hline
204\end{tabular}
205\end{center}
[07c1e595]206\caption{Internal scheduling comparison. All numbers are in nanoseconds(\si{\nano\second})}
[64b272a]207\label{tab:int-sched}
208\end{figure}
209
[20ffcf3]210\subsection{External scheduling}
211The Internal scheduling benchmark measures the cost of the \code{waitfor} statement (\code{_Accept} in \uC). Listing \ref{lst:ext-sched} shows the code for \CFA. The results can be shown in table \ref{tab:ext-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.
212
213\begin{figure}
214\begin{cfacode}
215volatile int go = 0;
216monitor M {};
217M m1;
218thread T {};
219
220void __attribute__((noinline)) do_call( M & mutex a1 ) {}
221
222void ^?{}( T & mutex this ) {}
223void main( T & this ) {
224        while(go == 0) { yield(); }
225        while(go == 1) { do_call(m1); }
226}
227int  __attribute__((noinline)) do_wait( M & mutex a1 ) {
228        go = 1;
229        BENCH(
230                for(size_t i=0; i<n; i++) {
231                        waitfor(call, a1);
232                },
233                result
234        )
235        printf("%llu\n", result);
236        go = 0;
237        return 0;
238}
239int main() {
240        T t;
241        return do_wait(m1);
242}
243\end{cfacode}
244\caption{Benchmark code for external scheduling}
245\label{lst:ext-sched}
246\end{figure}
247
[64b272a]248\begin{figure}
249\begin{center}
250\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
251\cline{2-4}
252\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
253\hline
[b9d0fb6]254\uC \code{Accept}                                       & 349           & 339.32        & 3.14  \\
[64b272a]255\CFA \code{waitfor}, 1 \code{monitor}   & 1155.5        & 1142.04       & 25.23 \\
256\CFA \code{waitfor}, 2 \code{monitor}   & 1361  & 1376.75       & 28.81 \\
257\CFA \code{waitfor}, 4 \code{monitor}   & 1941.5        & 1957.07       & 34.7  \\
258\hline
259\end{tabular}
260\end{center}
[07c1e595]261\caption{External scheduling comparison. All numbers are in nanoseconds(\si{\nano\second})}
[64b272a]262\label{tab:ext-sched}
263\end{figure}
264
[20ffcf3]265\subsection{Object creation}
[07c1e595]266Finally, the last benchmark measured is the cost of creation for concurrent objects. Listing \ref{lst:creation} shows the code for pthreads and \CFA threads. The results can be shown in table \ref{tab:creation}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests. The only note here is that the call-stacks of \CFA coroutines are lazily created, therefore without priming the coroutine, the creation cost is very low.
[20ffcf3]267
268\begin{figure}
269\begin{multicols}{2}
270pthread
271\begin{cfacode}
272int main() {
273        BENCH(
274                for(size_t i=0; i<n; i++) {
275                        pthread_t thread;
276                        if(pthread_create(
277                                &thread,
278                                NULL,
279                                foo,
280                                NULL
281                        ) < 0) {
282                                perror( "failure" );
283                                return 1;
284                        }
285
286                        if(pthread_join(
287                                thread,
288                                NULL
289                        ) < 0) {
290                                perror( "failure" );
291                                return 1;
292                        }
293                },
294                result
295        )
296        printf("%llu\n", result);
297}
298\end{cfacode}
299\columnbreak
300\CFA Threads
301\begin{cfacode}
302int main() {
303        BENCH(
304                for(size_t i=0; i<n; i++) {
305                        MyThread m;
306                },
307                result
308        )
309
310        printf("%llu\n", result);
311}
312\end{cfacode}
313\end{multicols}
[07c1e595]314\caption{Benchmark code for pthreads and \CFA to measure object creation}
[20ffcf3]315\label{lst:creation}
316\end{figure}
317
[64b272a]318\begin{figure}
319\begin{center}
320\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
321\cline{2-4}
322\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
323\hline
[20ffcf3]324Pthreads                        & 26974.5       & 26977 & 124.12 \\
325\CFA Coroutines Lazy    & 5             & 5             & 0      \\
326\CFA Coroutines Eager   & 335.0 & 357.67        & 34.2   \\
327\CFA Threads            & 1122.5        & 1109.86       & 36.54  \\
328\uC Coroutines          & 106           & 107.04        & 1.61   \\
329\uC Threads                     & 525.5 & 533.04        & 11.14  \\
[64b272a]330\hline
331\end{tabular}
332\end{center}
[07c1e595]333\caption{Creation comparison. All numbers are in nanoseconds(\si{\nano\second})}
[64b272a]334\label{tab:creation}
335\end{figure}
Note: See TracBrowser for help on using the repository browser.