Changeset 20ffcf3 for doc/proposals/concurrency/text/results.tex
- Timestamp:
- Nov 13, 2017, 10:45:32 AM (6 years ago)
- Branches:
- ADT, aaron-thesis, arm-eh, ast-experimental, cleanup-dtors, deferred_resn, demangler, enum, forall-pointer-decay, jacob/cs343-translation, jenkins-sandbox, master, new-ast, new-ast-unique-expr, new-env, no_list, persistent-indexer, pthread-emulation, qualifiedEnum, resolv-new, with_gc
- Children:
- b3ffb61
- Parents:
- 6d2386e
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
doc/proposals/concurrency/text/results.tex
r6d2386e r20ffcf3 1 1 % ====================================================================== 2 2 % ====================================================================== 3 \chapter{Performance results} 3 \chapter{Performance results} \label{results} 4 4 % ====================================================================== 5 5 % ====================================================================== 6 7 6 \section{Machine setup} 8 9 \begin{figure} 7 Table \ref{tab:machine} shows the characteristiques of the machine used to run the benchmarks. All tests where made on this machine. 8 \begin{figure}[H] 10 9 \begin{center} 11 10 \begin{tabular}{| l | r | l | r |} … … 37 36 38 37 \section{Micro benchmarks} 38 All benchmarks are run using the same harness to produce the results, seen as the \code{BENCH()} macro in the following examples. This macro uses the following logic to benchmark the code : 39 \begin{pseudo} 40 #define BENCH(run, result) 41 gettime(); 42 run; 43 gettime(); 44 result = (after - before) / N; 45 \end{pseudo} 46 The method used to get time is \code{clock_gettime(CLOCK_THREAD_CPUTIME_ID);}. Each benchmark is using many interations of a simple call to measure the cost of the call. The specific number of interation dependes on the specific benchmark. 47 48 \subsection{Context-switching} 49 The first interesting benchmark is to measure how long context-switches take. The simplest approach to do this is to yield on a thread, which executes a 2-step context switch. In order to make the comparison fair, coroutines also execute a 2-step context-switch, which is a resume/suspend cycle instead of a yield. Listing \ref{lst:ctx-switch} shows the code for coroutines and threads. All omitted tests are functionally identical to one of these tests. The results can be shown in table \ref{tab:ctx-switch}. 50 \begin{figure} 51 \begin{multicols}{2} 52 \CFA Coroutines 53 \begin{cfacode} 54 coroutine GreatSuspender {}; 55 void main(GreatSuspender& this) { 56 while(true) { suspend(); } 57 } 58 int main() { 59 GreatSuspender s; 60 resume(s); 61 BENCH( 62 for(size_t i=0; i<n; i++) { 63 resume(s); 64 }, 65 result 66 ) 67 printf("%llu\n", result); 68 } 69 \end{cfacode} 70 \columnbreak 71 \CFA Threads 72 \begin{cfacode} 73 74 75 76 77 int main() { 78 79 80 BENCH( 81 for(size_t i=0; i<n; i++) { 82 yield(); 83 }, 84 result 85 ) 86 printf("%llu\n", result); 87 } 88 \end{cfacode} 89 \end{multicols} 90 \caption{\CFA benchmark code used to measure context-switches for coroutines and threads.} 91 \label{lst:ctx-switch} 92 \end{figure} 39 93 40 94 \begin{figure} … … 54 108 \caption{Context Switch comparaison. All numbers are in nanoseconds(\si{\nano\second})} 55 109 \label{tab:ctx-switch} 110 \end{figure} 111 112 \subsection{Mutual-exclusion} 113 The next interesting benchmark is to measure the overhead to enter/leave a critical-section. For monitors, the simplest appraoch is to measure how long it takes enter and leave a monitor routine. Listing \ref{lst:mutex} shows the code for \CFA. To put the results in context, the cost of entering a non-inline function and the cost of acquiring and releasing a pthread mutex lock are also mesured. The results can be shown in table \ref{tab:mutex}. 114 115 \begin{figure} 116 \begin{cfacode} 117 monitor M {}; 118 void __attribute__((noinline)) call( M & mutex m /*, m2, m3, m4*/ ) {} 119 120 int main() { 121 M m/*, m2, m3, m4*/; 122 BENCH( 123 for(size_t i=0; i<n; i++) { 124 call(m/*, m2, m3, m4*/); 125 }, 126 result 127 ) 128 printf("%llu\n", result); 129 } 130 \end{cfacode} 131 \caption{\CFA benchmark code used to measure mutex routines.} 132 \label{lst:mutex} 56 133 \end{figure} 57 134 … … 75 152 \end{figure} 76 153 154 \subsection{Internal scheduling} 155 The Internal scheduling benchmark measures the cost of waiting on and signaling a condition variable. Listing \ref{lst:int-sched} shows the code for \CFA. The results can be shown in table \ref{tab:int-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests. 156 157 \begin{figure} 158 \begin{cfacode} 159 volatile int go = 0; 160 condition c; 161 monitor M {}; 162 M m1; 163 164 void __attribute__((noinline)) do_call( M & mutex a1 ) { signal(c); } 165 166 thread T {}; 167 void ^?{}( T & mutex this ) {} 168 void main( T & this ) { 169 while(go == 0) { yield(); } 170 while(go == 1) { do_call(m1); } 171 } 172 int __attribute__((noinline)) do_wait( M & mutex a1 ) { 173 go = 1; 174 BENCH( 175 for(size_t i=0; i<n; i++) { 176 wait(c); 177 }, 178 result 179 ) 180 printf("%llu\n", result); 181 go = 0; 182 return 0; 183 } 184 int main() { 185 T t; 186 return do_wait(m1); 187 } 188 \end{cfacode} 189 \caption{Benchmark code for internal scheduling} 190 \label{lst:int-sched} 191 \end{figure} 192 77 193 \begin{figure} 78 194 \begin{center} … … 92 208 \end{figure} 93 209 210 \subsection{External scheduling} 211 The Internal scheduling benchmark measures the cost of the \code{waitfor} statement (\code{_Accept} in \uC). Listing \ref{lst:ext-sched} shows the code for \CFA. The results can be shown in table \ref{tab:ext-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests. 212 213 \begin{figure} 214 \begin{cfacode} 215 volatile int go = 0; 216 monitor M {}; 217 M m1; 218 thread T {}; 219 220 void __attribute__((noinline)) do_call( M & mutex a1 ) {} 221 222 void ^?{}( T & mutex this ) {} 223 void main( T & this ) { 224 while(go == 0) { yield(); } 225 while(go == 1) { do_call(m1); } 226 } 227 int __attribute__((noinline)) do_wait( M & mutex a1 ) { 228 go = 1; 229 BENCH( 230 for(size_t i=0; i<n; i++) { 231 waitfor(call, a1); 232 }, 233 result 234 ) 235 printf("%llu\n", result); 236 go = 0; 237 return 0; 238 } 239 int main() { 240 T t; 241 return do_wait(m1); 242 } 243 \end{cfacode} 244 \caption{Benchmark code for external scheduling} 245 \label{lst:ext-sched} 246 \end{figure} 247 94 248 \begin{figure} 95 249 \begin{center} … … 109 263 \end{figure} 110 264 111 \begin{figure} 112 \begin{center} 113 \begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |} 114 \cline{2-4} 115 \multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\ 116 \hline 117 Pthreads & 26974.5 & 26977 & 124.12 \\ 118 \CFA Coroutines & 5 & 5 & 0 \\ 119 \CFA Threads & 1122.5 & 1109.86 & 36.54 \\ 120 \uC Coroutines & 106 & 107.04 & 1.61 \\ 121 \uC Threads & 525.5 & 533.04 & 11.14 \\ 265 \subsection{Object creation} 266 Finaly, the last benchmark measured is the cost of creation for concurrent objects. Listing \ref{lst:creation} shows the code for pthreads and \CFA threads. The results can be shown in table \ref{tab:creation}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests. The only note here is that the callstacks of \CFA coroutines are lazily created, therefore without priming the coroutine, the creation cost is very low. 267 268 \begin{figure} 269 \begin{multicols}{2} 270 pthread 271 \begin{cfacode} 272 int main() { 273 BENCH( 274 for(size_t i=0; i<n; i++) { 275 pthread_t thread; 276 if(pthread_create( 277 &thread, 278 NULL, 279 foo, 280 NULL 281 ) < 0) { 282 perror( "failure" ); 283 return 1; 284 } 285 286 if(pthread_join( 287 thread, 288 NULL 289 ) < 0) { 290 perror( "failure" ); 291 return 1; 292 } 293 }, 294 result 295 ) 296 printf("%llu\n", result); 297 } 298 \end{cfacode} 299 \columnbreak 300 \CFA Threads 301 \begin{cfacode} 302 int main() { 303 BENCH( 304 for(size_t i=0; i<n; i++) { 305 MyThread m; 306 }, 307 result 308 ) 309 310 printf("%llu\n", result); 311 } 312 \end{cfacode} 313 \end{multicols} 314 \caption{Bechmark code for pthreads and \CFA to measure object creation} 315 \label{lst:creation} 316 \end{figure} 317 318 \begin{figure} 319 \begin{center} 320 \begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |} 321 \cline{2-4} 322 \multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\ 323 \hline 324 Pthreads & 26974.5 & 26977 & 124.12 \\ 325 \CFA Coroutines Lazy & 5 & 5 & 0 \\ 326 \CFA Coroutines Eager & 335.0 & 357.67 & 34.2 \\ 327 \CFA Threads & 1122.5 & 1109.86 & 36.54 \\ 328 \uC Coroutines & 106 & 107.04 & 1.61 \\ 329 \uC Threads & 525.5 & 533.04 & 11.14 \\ 122 330 \hline 123 331 \end{tabular}
Note: See TracChangeset
for help on using the changeset viewer.