source: doc/proposals/concurrency/text/results.tex @ a5b7905

ADTaaron-thesisarm-ehast-experimentalcleanup-dtorsdeferred_resndemanglerenumforall-pointer-decayjacob/cs343-translationjenkins-sandboxnew-astnew-ast-unique-exprnew-envno_listpersistent-indexerpthread-emulationqualifiedEnumresolv-newwith_gc
Last change on this file since a5b7905 was 20ffcf3, checked in by Thierry Delisle <tdelisle@…>, 7 years ago

Commit after new draft

  • Property mode set to 100644
File size: 10.7 KB
Line 
1% ======================================================================
2% ======================================================================
3\chapter{Performance results} \label{results}
4% ======================================================================
5% ======================================================================
6\section{Machine setup}
7Table \ref{tab:machine} shows the characteristiques of the machine used to run the benchmarks. All tests where made on this machine.
8\begin{figure}[H]
9\begin{center}
10\begin{tabular}{| l | r | l | r |}
11\hline
12Architecture            & x86\_64                       & NUMA node(s)  & 8 \\
13\hline
14CPU op-mode(s)          & 32-bit, 64-bit                & Model name    & AMD Opteron\texttrademark  Processor 6380 \\
15\hline
16Byte Order                      & Little Endian                 & CPU Freq              & 2.5\si{\giga\hertz} \\
17\hline
18CPU(s)                  & 64                            & L1d cache     & \SI{16}{\kibi\byte} \\
19\hline
20Thread(s) per core      & 2                             & L1i cache     & \SI{64}{\kibi\byte} \\
21\hline
22Core(s) per socket      & 8                             & L2 cache              & \SI{2048}{\kibi\byte} \\
23\hline
24Socket(s)                       & 4                             & L3 cache              & \SI{6144}{\kibi\byte} \\
25\hline
26\hline
27Operating system                & Ubuntu 16.04.3 LTS    & Kernel                & Linux 4.4.0-97-generic \\
28\hline
29Compiler                        & gcc 6.3.0             & Translator    & CFA 1.0.0 \\
30\hline
31\end{tabular}
32\end{center}
33\caption{Machine setup used for the tests}
34\label{tab:machine}
35\end{figure}
36
37\section{Micro benchmarks}
38All benchmarks are run using the same harness to produce the results, seen as the \code{BENCH()} macro in the following examples. This macro uses the following logic to benchmark the code :
39\begin{pseudo}
40#define BENCH(run, result)
41        gettime();
42        run;
43        gettime();
44        result = (after - before) / N;
45\end{pseudo}
46The method used to get time is \code{clock_gettime(CLOCK_THREAD_CPUTIME_ID);}. Each benchmark is using many interations of a simple call to measure the cost of the call. The specific number of interation dependes on the specific benchmark.
47
48\subsection{Context-switching}
49The first interesting benchmark is to measure how long context-switches take. The simplest approach to do this is to yield on a thread, which executes a 2-step context switch. In order to make the comparison fair, coroutines also execute a 2-step context-switch, which is a resume/suspend cycle instead of a yield. Listing \ref{lst:ctx-switch} shows the code for coroutines and threads. All omitted tests are functionally identical to one of these tests. The results can be shown in table \ref{tab:ctx-switch}.
50\begin{figure}
51\begin{multicols}{2}
52\CFA Coroutines
53\begin{cfacode}
54coroutine GreatSuspender {};
55void main(GreatSuspender& this) {
56        while(true) { suspend(); }
57}
58int main() {
59        GreatSuspender s;
60        resume(s);
61        BENCH(
62                for(size_t i=0; i<n; i++) {
63                        resume(s);
64                },
65                result
66        )
67        printf("%llu\n", result);
68}
69\end{cfacode}
70\columnbreak
71\CFA Threads
72\begin{cfacode}
73
74
75
76
77int main() {
78
79
80        BENCH(
81                for(size_t i=0; i<n; i++) {
82                        yield();
83                },
84                result
85        )
86        printf("%llu\n", result);
87}
88\end{cfacode}
89\end{multicols}
90\caption{\CFA benchmark code used to measure context-switches for coroutines and threads.}
91\label{lst:ctx-switch}
92\end{figure}
93
94\begin{figure}
95\begin{center}
96\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
97\cline{2-4}
98\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
99\hline
100Kernel Threads          & 239           & 242.57        & 5.54 \\
101\CFA Coroutines         & 38            & 38            & 0    \\
102\CFA Threads            & 102           & 102.39        & 1.57 \\
103\uC Coroutines          & 46            & 46.68 & 0.47 \\
104\uC Threads                     & 98            & 99.39 & 1.52 \\
105\hline
106\end{tabular}
107\end{center}
108\caption{Context Switch comparaison. All numbers are in nanoseconds(\si{\nano\second})}
109\label{tab:ctx-switch}
110\end{figure}
111
112\subsection{Mutual-exclusion}
113The next interesting benchmark is to measure the overhead to enter/leave a critical-section. For monitors, the simplest appraoch is to measure how long it takes enter and leave a monitor routine. Listing \ref{lst:mutex} shows the code for \CFA. To put the results in context, the cost of entering a non-inline function and the cost of acquiring and releasing a pthread mutex lock are also mesured. The results can be shown in table \ref{tab:mutex}.
114
115\begin{figure}
116\begin{cfacode}
117monitor M {};
118void __attribute__((noinline)) call( M & mutex m /*, m2, m3, m4*/ ) {}
119
120int main() {
121        M m/*, m2, m3, m4*/;
122        BENCH(
123                for(size_t i=0; i<n; i++) {
124                        call(m/*, m2, m3, m4*/);
125                },
126                result
127        )
128        printf("%llu\n", result);
129}
130\end{cfacode}
131\caption{\CFA benchmark code used to measure mutex routines.}
132\label{lst:mutex}
133\end{figure}
134
135\begin{figure}
136\begin{center}
137\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
138\cline{2-4}
139\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
140\hline
141C routine                                               & 2             & 2             & 0      \\
142Pthreads Mutex Lock                             & 31            & 31.86 & 0.99   \\
143\uC \code{monitor} member routine               & 30            & 30            & 0      \\
144\CFA \code{mutex} routine, 1 argument   & 46            & 46.14 & 0.74   \\
145\CFA \code{mutex} routine, 2 argument   & 82            & 83            & 1.93   \\
146\CFA \code{mutex} routine, 4 argument   & 165           & 161.15        & 54.04  \\
147\hline
148\end{tabular}
149\end{center}
150\caption{Mutex routine comparaison. All numbers are in nanoseconds(\si{\nano\second})}
151\label{tab:mutex}
152\end{figure}
153
154\subsection{Internal scheduling}
155The Internal scheduling benchmark measures the cost of waiting on and signaling a condition variable. Listing \ref{lst:int-sched} shows the code for \CFA. The results can be shown in table \ref{tab:int-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.
156
157\begin{figure}
158\begin{cfacode}
159volatile int go = 0;
160condition c;
161monitor M {};
162M m1;
163
164void __attribute__((noinline)) do_call( M & mutex a1 ) { signal(c); }
165
166thread T {};
167void ^?{}( T & mutex this ) {}
168void main( T & this ) {
169        while(go == 0) { yield(); }
170        while(go == 1) { do_call(m1); }
171}
172int  __attribute__((noinline)) do_wait( M & mutex a1 ) {
173        go = 1;
174        BENCH(
175                for(size_t i=0; i<n; i++) {
176                        wait(c);
177                },
178                result
179        )
180        printf("%llu\n", result);
181        go = 0;
182        return 0;
183}
184int main() {
185        T t;
186        return do_wait(m1);
187}
188\end{cfacode}
189\caption{Benchmark code for internal scheduling}
190\label{lst:int-sched}
191\end{figure}
192
193\begin{figure}
194\begin{center}
195\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
196\cline{2-4}
197\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
198\hline
199\uC \code{signal}                                       & 322           & 322.57        & 2.77  \\
200\CFA \code{signal}, 1 \code{monitor}    & 1145  & 1163.64       & 27.52 \\
201\CFA \code{signal}, 2 \code{monitor}    & 1531  & 1550.75       & 32.77 \\
202\CFA \code{signal}, 4 \code{monitor}    & 2288.5        & 2326.86       & 54.73 \\
203\hline
204\end{tabular}
205\end{center}
206\caption{Internal scheduling comparaison. All numbers are in nanoseconds(\si{\nano\second})}
207\label{tab:int-sched}
208\end{figure}
209
210\subsection{External scheduling}
211The Internal scheduling benchmark measures the cost of the \code{waitfor} statement (\code{_Accept} in \uC). Listing \ref{lst:ext-sched} shows the code for \CFA. The results can be shown in table \ref{tab:ext-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.
212
213\begin{figure}
214\begin{cfacode}
215volatile int go = 0;
216monitor M {};
217M m1;
218thread T {};
219
220void __attribute__((noinline)) do_call( M & mutex a1 ) {}
221
222void ^?{}( T & mutex this ) {}
223void main( T & this ) {
224        while(go == 0) { yield(); }
225        while(go == 1) { do_call(m1); }
226}
227int  __attribute__((noinline)) do_wait( M & mutex a1 ) {
228        go = 1;
229        BENCH(
230                for(size_t i=0; i<n; i++) {
231                        waitfor(call, a1);
232                },
233                result
234        )
235        printf("%llu\n", result);
236        go = 0;
237        return 0;
238}
239int main() {
240        T t;
241        return do_wait(m1);
242}
243\end{cfacode}
244\caption{Benchmark code for external scheduling}
245\label{lst:ext-sched}
246\end{figure}
247
248\begin{figure}
249\begin{center}
250\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
251\cline{2-4}
252\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
253\hline
254\uC \code{Accept}                                       & 349           & 339.32        & 3.14  \\
255\CFA \code{waitfor}, 1 \code{monitor}   & 1155.5        & 1142.04       & 25.23 \\
256\CFA \code{waitfor}, 2 \code{monitor}   & 1361  & 1376.75       & 28.81 \\
257\CFA \code{waitfor}, 4 \code{monitor}   & 1941.5        & 1957.07       & 34.7  \\
258\hline
259\end{tabular}
260\end{center}
261\caption{External scheduling comparaison. All numbers are in nanoseconds(\si{\nano\second})}
262\label{tab:ext-sched}
263\end{figure}
264
265\subsection{Object creation}
266Finaly, the last benchmark measured is the cost of creation for concurrent objects. Listing \ref{lst:creation} shows the code for pthreads and \CFA threads. The results can be shown in table \ref{tab:creation}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests. The only note here is that the callstacks of \CFA coroutines are lazily created, therefore without priming the coroutine, the creation cost is very low.
267
268\begin{figure}
269\begin{multicols}{2}
270pthread
271\begin{cfacode}
272int main() {
273        BENCH(
274                for(size_t i=0; i<n; i++) {
275                        pthread_t thread;
276                        if(pthread_create(
277                                &thread,
278                                NULL,
279                                foo,
280                                NULL
281                        ) < 0) {
282                                perror( "failure" );
283                                return 1;
284                        }
285
286                        if(pthread_join(
287                                thread,
288                                NULL
289                        ) < 0) {
290                                perror( "failure" );
291                                return 1;
292                        }
293                },
294                result
295        )
296        printf("%llu\n", result);
297}
298\end{cfacode}
299\columnbreak
300\CFA Threads
301\begin{cfacode}
302int main() {
303        BENCH(
304                for(size_t i=0; i<n; i++) {
305                        MyThread m;
306                },
307                result
308        )
309
310        printf("%llu\n", result);
311}
312\end{cfacode}
313\end{multicols}
314\caption{Bechmark code for pthreads and \CFA to measure object creation}
315\label{lst:creation}
316\end{figure}
317
318\begin{figure}
319\begin{center}
320\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
321\cline{2-4}
322\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
323\hline
324Pthreads                        & 26974.5       & 26977 & 124.12 \\
325\CFA Coroutines Lazy    & 5             & 5             & 0      \\
326\CFA Coroutines Eager   & 335.0 & 357.67        & 34.2   \\
327\CFA Threads            & 1122.5        & 1109.86       & 36.54  \\
328\uC Coroutines          & 106           & 107.04        & 1.61   \\
329\uC Threads                     & 525.5 & 533.04        & 11.14  \\
330\hline
331\end{tabular}
332\end{center}
333\caption{Creation comparaison. All numbers are in nanoseconds(\si{\nano\second})}
334\label{tab:creation}
335\end{figure}
Note: See TracBrowser for help on using the repository browser.