% ======================================================================
% ======================================================================
\chapter{Performance results} \label{results}
% ======================================================================
% ======================================================================
\section{Machine setup}
Table \ref{tab:machine} shows the characteristics of the machine used to run the benchmarks. All tests were run on this machine.
\begin{figure}[H]
\begin{center}
\begin{tabular}{| l | r | l | r |}
\hline
Architecture & x86\_64 & NUMA node(s) & 8 \\
\hline
CPU op-mode(s) & 32-bit, 64-bit & Model name & AMD Opteron\texttrademark{} Processor 6380 \\
\hline
Byte Order & Little Endian & CPU Freq & \SI{2.5}{\giga\hertz} \\
\hline
CPU(s) & 64 & L1d cache & \SI{16}{\kibi\byte} \\
\hline
Thread(s) per core & 2 & L1i cache & \SI{64}{\kibi\byte} \\
\hline
Core(s) per socket & 8 & L2 cache & \SI{2048}{\kibi\byte} \\
\hline
Socket(s) & 4 & L3 cache & \SI{6144}{\kibi\byte} \\
\hline
\hline
Operating system & Ubuntu 16.04.3 LTS & Kernel & Linux 4.4.0-97-generic \\
\hline
Compiler & gcc 6.3.0 & Translator & CFA 1.0.0 \\
\hline
\end{tabular}
\end{center}
\caption{Machine setup used for the tests}
\label{tab:machine}
\end{figure}

\section{Micro benchmarks}
All benchmarks are run using the same harness to produce the results, seen as the \code{BENCH()} macro in the following examples. This macro uses the following logic to benchmark the code:
\begin{pseudo}
#define BENCH(run, result)
	before = gettime();
	run;
	after = gettime();
	result = (after - before) / N;
\end{pseudo}
The method used to get time is \code{clock_gettime(CLOCK_THREAD_CPUTIME_ID)}. Each benchmark runs many iterations of a simple call to measure the cost of the call, where the specific number of iterations depends on the benchmark.
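
For reference, the harness could be realized in plain C roughly as follows. This is only a sketch of the logic above; the helper name \code{gettime_ns} and the iteration constant \code{N} are illustrative and not taken from the actual implementation:
\begin{cfacode}
#include <time.h>

// Illustrative sketch: current thread CPU time in nanoseconds.
static inline unsigned long long gettime_ns(void) {
	struct timespec ts;
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
	return ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

#define BENCH(run, result)                              \
	{                                                   \
		unsigned long long before = gettime_ns();       \
		run;                                            \
		unsigned long long after  = gettime_ns();       \
		result = (after - before) / N; /* per-iteration cost */ \
	}
\end{cfacode}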

\subsection{Context-switching}
The first interesting benchmark is to measure how long context-switches take. The simplest approach to do this is to yield on a thread, which executes a 2-step context switch. In order to make the comparison fair, coroutines also execute a 2-step context-switch, which is a resume/suspend cycle instead of a yield. Listing \ref{lst:ctx-switch} shows the code for coroutines and threads. All omitted tests are functionally identical to one of these tests. The results are shown in Table \ref{tab:ctx-switch}.
\begin{figure}
\begin{multicols}{2}
\CFA Coroutines
\begin{cfacode}
coroutine GreatSuspender {};
void main(GreatSuspender& this) {
	while(true) { suspend(); }
}
int main() {
	GreatSuspender s;
	resume(s);
	BENCH(
		for(size_t i=0; i<n; i++) {
			resume(s);
		},
		result
	)
	printf("%llu\n", result);
}
\end{cfacode}
\columnbreak
\CFA Threads
\begin{cfacode}




int main() {


	BENCH(
		for(size_t i=0; i<n; i++) {
			yield();
		},
		result
	)
	printf("%llu\n", result);
}
\end{cfacode}
\end{multicols}
\caption{\CFA benchmark code used to measure context-switches for coroutines and threads.}
\label{lst:ctx-switch}
\end{figure}
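
The kernel-thread baseline in Table \ref{tab:ctx-switch} is not shown above. A minimal sketch of such a test, assuming the kernel thread simply yields the processor with \code{sched_yield} and that enough runnable threads exist to force an actual switch, could look as follows:
\begin{cfacode}
#include <sched.h>
#include <stdio.h>

int main() {
	BENCH(
		for(size_t i=0; i<n; i++) {
			sched_yield();   // ask the kernel to reschedule this thread
		},
		result
	)
	printf("%llu\n", result);
}
\end{cfacode}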

\begin{figure}
\begin{center}
\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
\cline{2-4}
\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } & \multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
\hline
Kernel Threads & 239 & 242.57 & 5.54 \\
\CFA Coroutines & 38 & 38 & 0 \\
\CFA Threads & 102 & 102.39 & 1.57 \\
\uC Coroutines & 46 & 46.68 & 0.47 \\
\uC Threads & 98 & 99.39 & 1.52 \\
\hline
\end{tabular}
\end{center}
\caption{Context-switch comparison. All numbers are in nanoseconds (\si{\nano\second}).}
\label{tab:ctx-switch}
\end{figure}

\subsection{Mutual-exclusion}
The next interesting benchmark is to measure the overhead to enter/leave a critical-section. For monitors, the simplest approach is to measure how long it takes to enter and leave a monitor routine. Listing \ref{lst:mutex} shows the code for \CFA. To put the results in context, the cost of entering a non-inline function and the cost of acquiring and releasing a pthread mutex lock are also measured. The results are shown in Table \ref{tab:mutex}.

\begin{figure}
\begin{cfacode}
monitor M {};
void __attribute__((noinline)) call( M & mutex m /*, m2, m3, m4*/ ) {}

int main() {
	M m/*, m2, m3, m4*/;
	BENCH(
		for(size_t i=0; i<n; i++) {
			call(m/*, m2, m3, m4*/);
		},
		result
	)
	printf("%llu\n", result);
}
\end{cfacode}
\caption{\CFA benchmark code used to measure mutex routines.}
\label{lst:mutex}
\end{figure}
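
The plain C routine and pthread baselines from Table \ref{tab:mutex} are not part of Listing \ref{lst:mutex}. Assuming they use the same harness, they could be measured roughly as follows; \code{empty_call} is an illustrative name and the exact baseline code is not shown in this chapter:
\begin{cfacode}
#include <pthread.h>
#include <stdio.h>

pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

// Baseline: an empty, non-inlined C routine.
void __attribute__((noinline)) empty_call() {}

int main() {
	BENCH(
		for(size_t i=0; i<n; i++) {
			// For the "C routine" row only empty_call() would be timed;
			// for the "Pthreads Mutex Lock" row only the lock/unlock pair.
			empty_call();
			pthread_mutex_lock  ( &lock );
			pthread_mutex_unlock( &lock );
		},
		result
	)
	printf("%llu\n", result);
}
\end{cfacode}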

\begin{figure}
\begin{center}
\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
\cline{2-4}
\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } & \multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
\hline
C routine & 2 & 2 & 0 \\
Pthreads Mutex Lock & 31 & 31.86 & 0.99 \\
\uC \code{monitor} member routine & 30 & 30 & 0 \\
\CFA \code{mutex} routine, 1 argument & 46 & 46.14 & 0.74 \\
\CFA \code{mutex} routine, 2 arguments & 82 & 83 & 1.93 \\
\CFA \code{mutex} routine, 4 arguments & 165 & 161.15 & 54.04 \\
\hline
\end{tabular}
\end{center}
\caption{Mutex routine comparison. All numbers are in nanoseconds (\si{\nano\second}).}
\label{tab:mutex}
\end{figure}

\subsection{Internal scheduling}
The internal scheduling benchmark measures the cost of waiting on and signaling a condition variable. Listing \ref{lst:int-sched} shows the code for \CFA. The results are shown in Table \ref{tab:int-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.

\begin{figure}
\begin{cfacode}
volatile int go = 0;
condition c;
monitor M {};
M m1;

void __attribute__((noinline)) do_call( M & mutex a1 ) { signal(c); }

thread T {};
void ^?{}( T & mutex this ) {}
void main( T & this ) {
	while(go == 0) { yield(); }
	while(go == 1) { do_call(m1); }
}
int __attribute__((noinline)) do_wait( M & mutex a1 ) {
	go = 1;
	BENCH(
		for(size_t i=0; i<n; i++) {
			wait(c);
		},
		result
	)
	printf("%llu\n", result);
	go = 0;
	return 0;
}
int main() {
	T t;
	return do_wait(m1);
}
\end{cfacode}
\caption{Benchmark code for internal scheduling}
\label{lst:int-sched}
\end{figure}
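
The multi-monitor rows of Table \ref{tab:int-sched} come from the omitted tests; assuming they mirror the commented-out pattern of Listing \ref{lst:mutex}, only the two mutex routines change, for example for 2 monitors (the rest of Listing \ref{lst:int-sched} stays the same, with \code{do_call(m1, m2)} and \code{do_wait(m1, m2)} called with both monitors):
\begin{cfacode}
M m1, m2;

// Both monitors are acquired before signalling the condition.
void __attribute__((noinline)) do_call( M & mutex a1, M & mutex a2 ) { signal(c); }

// Both monitors are acquired before waiting and implicitly released during the wait.
int __attribute__((noinline)) do_wait( M & mutex a1, M & mutex a2 ) {
	go = 1;
	BENCH(
		for(size_t i=0; i<n; i++) {
			wait(c);
		},
		result
	)
	go = 0;
	return 0;
}
\end{cfacode}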

\begin{figure}
\begin{center}
\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
\cline{2-4}
\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } & \multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
\hline
\uC \code{signal} & 322 & 322.57 & 2.77 \\
\CFA \code{signal}, 1 \code{monitor} & 1145 & 1163.64 & 27.52 \\
\CFA \code{signal}, 2 \code{monitor}s & 1531 & 1550.75 & 32.77 \\
\CFA \code{signal}, 4 \code{monitor}s & 2288.5 & 2326.86 & 54.73 \\
\hline
\end{tabular}
\end{center}
\caption{Internal scheduling comparison. All numbers are in nanoseconds (\si{\nano\second}).}
\label{tab:int-sched}
\end{figure}

\subsection{External scheduling}
The external scheduling benchmark measures the cost of the \code{waitfor} statement (\code{_Accept} in \uC). Listing \ref{lst:ext-sched} shows the code for \CFA. The results are shown in Table \ref{tab:ext-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.

\begin{figure}
\begin{cfacode}
volatile int go = 0;
monitor M {};
M m1;
thread T {};

void __attribute__((noinline)) do_call( M & mutex a1 ) {}

void ^?{}( T & mutex this ) {}
void main( T & this ) {
	while(go == 0) { yield(); }
	while(go == 1) { do_call(m1); }
}
int __attribute__((noinline)) do_wait( M & mutex a1 ) {
	go = 1;
	BENCH(
		for(size_t i=0; i<n; i++) {
			waitfor(do_call, a1);
		},
		result
	)
	printf("%llu\n", result);
	go = 0;
	return 0;
}
int main() {
	T t;
	return do_wait(m1);
}
\end{cfacode}
\caption{Benchmark code for external scheduling}
\label{lst:ext-sched}
\end{figure}
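
As with internal scheduling, the multi-monitor rows of Table \ref{tab:ext-sched} presumably extend Listing \ref{lst:ext-sched} with extra \code{mutex} parameters; a sketch of the 2-monitor variant, which is an assumption since the omitted code is not shown, is:
\begin{cfacode}
M m1, m2;

void __attribute__((noinline)) do_call( M & mutex a1, M & mutex a2 ) {}

int __attribute__((noinline)) do_wait( M & mutex a1, M & mutex a2 ) {
	go = 1;
	BENCH(
		for(size_t i=0; i<n; i++) {
			waitfor(do_call, a1, a2);   // accept a call holding both monitors
		},
		result
	)
	go = 0;
	return 0;
}
\end{cfacode}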

\begin{figure}
\begin{center}
\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
\cline{2-4}
\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } & \multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
\hline
\uC \code{Accept} & 349 & 339.32 & 3.14 \\
\CFA \code{waitfor}, 1 \code{monitor} & 1155.5 & 1142.04 & 25.23 \\
\CFA \code{waitfor}, 2 \code{monitor}s & 1361 & 1376.75 & 28.81 \\
\CFA \code{waitfor}, 4 \code{monitor}s & 1941.5 & 1957.07 & 34.7 \\
\hline
\end{tabular}
\end{center}
\caption{External scheduling comparison. All numbers are in nanoseconds (\si{\nano\second}).}
\label{tab:ext-sched}
\end{figure}

\subsection{Object creation}
Finally, the last benchmark measures the cost of creating concurrent objects. Listing \ref{lst:creation} shows the code for pthreads and \CFA threads. The results are shown in Table \ref{tab:creation}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests. The only note here is that the call-stacks of \CFA coroutines are lazily created; therefore, without priming the coroutine, the creation cost is very low.

\begin{figure}
\begin{multicols}{2}
pthread
\begin{cfacode}
int main() {
	BENCH(
		for(size_t i=0; i<n; i++) {
			pthread_t thread;
			if(pthread_create(
				&thread,
				NULL,
				foo,
				NULL
			) < 0) {
				perror( "failure" );
				return 1;
			}

			if(pthread_join(
				thread,
				NULL
			) < 0) {
				perror( "failure" );
				return 1;
			}
		},
		result
	)
	printf("%llu\n", result);
}
\end{cfacode}
\columnbreak
\CFA Threads
\begin{cfacode}
int main() {
	BENCH(
		for(size_t i=0; i<n; i++) {
			MyThread m;
		},
		result
	)

	printf("%llu\n", result);
}
\end{cfacode}
\end{multicols}
\caption{Benchmark code for pthreads and \CFA to measure object creation}
\label{lst:creation}
\end{figure}
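
The coroutine rows of Table \ref{tab:creation} distinguish lazy and eager stack creation. A sketch of what the two variants could look like, assuming a trivial coroutine with the illustrative name \code{MyCoroutine}, is:
\begin{cfacode}
coroutine MyCoroutine {};
void main(MyCoroutine & this) {
	while(true) { suspend(); }
}

int main() {
	BENCH(
		for(size_t i=0; i<n; i++) {
			MyCoroutine c;   // lazy: no call-stack is allocated here
			// resume(c);    // eager variant: priming forces the call-stack allocation
		},
		result
	)
	printf("%llu\n", result);
}
\end{cfacode}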

\begin{figure}
\begin{center}
\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
\cline{2-4}
\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } & \multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
\hline
Pthreads & 26974.5 & 26977 & 124.12 \\
\CFA Coroutines Lazy & 5 & 5 & 0 \\
\CFA Coroutines Eager & 335.0 & 357.67 & 34.2 \\
\CFA Threads & 1122.5 & 1109.86 & 36.54 \\
\uC Coroutines & 106 & 107.04 & 1.61 \\
\uC Threads & 525.5 & 533.04 & 11.14 \\
\hline
\end{tabular}
\end{center}
\caption{Creation comparison. All numbers are in nanoseconds (\si{\nano\second}).}
\label{tab:creation}
\end{figure}