Timestamp: Nov 13, 2017, 10:45:32 AM
Author: Thierry Delisle <tdelisle@…>
Branches: ADT, aaron-thesis, arm-eh, ast-experimental, cleanup-dtors, deferred_resn, demangler, enum, forall-pointer-decay, jacob/cs343-translation, jenkins-sandbox, master, new-ast, new-ast-unique-expr, new-env, no_list, persistent-indexer, pthread-emulation, qualifiedEnum, resolv-new, with_gc
Children: b3ffb61
Parents: 6d2386e
Message: Commit after new draft
File: 1 edited
  • doc/proposals/concurrency/text/results.tex

    r6d2386e r20ffcf3  
% ======================================================================
% ======================================================================
\chapter{Performance results} \label{results}
% ======================================================================
% ======================================================================

\section{Machine setup}
Table \ref{tab:machine} shows the characteristics of the machine used to run the benchmarks. All tests were run on this machine.
\begin{figure}[H]
\begin{center}
\begin{tabular}{| l | r | l | r |}
     

\section{Micro benchmarks}
All benchmarks are run using the same harness to produce the results, shown as the \code{BENCH()} macro in the following examples. This macro uses the following logic to benchmark the code:
\begin{pseudo}
#define BENCH(run, result)
    before = gettime();
    run;
    after = gettime();
    result = (after - before) / N;
\end{pseudo}
The method used to get time is \code{clock_gettime(CLOCK_THREAD_CPUTIME_ID)}. Each benchmark runs many iterations of a simple call to measure the cost of that call. The specific number of iterations depends on the benchmark.
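A possible concrete expansion of this harness, based on the pseudocode above, is the following sketch; the exact macro used by the benchmark suite may differ, and the iteration count \code{n}, a 64-bit \code{result} variable, and \code{<time.h>} are assumed to be provided by the surrounding code:
\begin{cfacode}
#define BENCH(run, result)                                       \
    struct timespec before, after;                               \
    clock_gettime( CLOCK_THREAD_CPUTIME_ID, &before );           \
    run;                                                         \
    clock_gettime( CLOCK_THREAD_CPUTIME_ID, &after );            \
    /* average cost of one iteration, in nanoseconds */          \
    result = ( (after.tv_sec  - before.tv_sec ) * 1000000000LL + \
               (after.tv_nsec - before.tv_nsec) ) / n;
\end{cfacode}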

\subsection{Context-switching}
The first interesting benchmark measures how long a context switch takes. The simplest approach is to yield on a thread, which executes a 2-step context switch. To make the comparison fair, coroutines also execute a 2-step context switch, in the form of a resume/suspend cycle instead of a yield. Listing \ref{lst:ctx-switch} shows the code for coroutines and threads; all omitted tests are functionally identical to one of these tests. The results are shown in Table \ref{tab:ctx-switch}.
\begin{figure}
\begin{multicols}{2}
\CFA Coroutines
\begin{cfacode}
coroutine GreatSuspender {};
void main(GreatSuspender& this) {
    while(true) { suspend(); }
}
int main() {
    GreatSuspender s;
    resume(s);
    BENCH(
        for(size_t i=0; i<n; i++) {
            resume(s);
        },
        result
    )
    printf("%llu\n", result);
}
\end{cfacode}
\columnbreak
\CFA Threads
\begin{cfacode}




int main() {


    BENCH(
        for(size_t i=0; i<n; i++) {
            yield();
        },
        result
    )
    printf("%llu\n", result);
}
\end{cfacode}
\end{multicols}
\caption{\CFA benchmark code used to measure context-switches for coroutines and threads.}
\label{lst:ctx-switch}
\end{figure}
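For reference, a kernel-thread yield can be measured with the same harness; the following sketch uses POSIX \code{sched_yield} and is only an illustration, not one of the listings above (the \code{BENCH} harness, \code{n}, \code{result}, and \code{<sched.h>} are assumed):
\begin{cfacode}
int main() {
    BENCH(
        for(size_t i=0; i<n; i++) {
            sched_yield();  // OS-level yield of the current kernel thread
        },
        result
    )
    printf("%llu\n", result);
}
\end{cfacode}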

\begin{figure}
     
\caption{Context switch comparison. All numbers are in nanoseconds (\si{\nano\second}).}
\label{tab:ctx-switch}
\end{figure}

\subsection{Mutual-exclusion}
The next interesting benchmark measures the overhead to enter and leave a critical section. For monitors, the simplest approach is to measure how long it takes to enter and leave a monitor routine. Listing \ref{lst:mutex} shows the code for \CFA. To put the results in context, the cost of entering a non-inline function and the cost of acquiring and releasing a pthread mutex lock are also measured. The results are shown in Table \ref{tab:mutex}.
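The pthread variant of this test, which measures a lock acquire/release pair, might look roughly as follows (a sketch only; the \code{BENCH} harness, \code{n}, \code{result}, and \code{<pthread.h>} are assumed):
\begin{cfacode}
pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

int main() {
    BENCH(
        for(size_t i=0; i<n; i++) {
            pthread_mutex_lock( &lock );    // enter the critical section
            pthread_mutex_unlock( &lock );  // leave the critical section
        },
        result
    )
    printf("%llu\n", result);
}
\end{cfacode}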

\begin{figure}
\begin{cfacode}
monitor M {};
void __attribute__((noinline)) call( M & mutex m /*, m2, m3, m4*/ ) {}

int main() {
    M m/*, m2, m3, m4*/;
    BENCH(
        for(size_t i=0; i<n; i++) {
            call(m/*, m2, m3, m4*/);
        },
        result
    )
    printf("%llu\n", result);
}
\end{cfacode}
\caption{\CFA benchmark code used to measure mutex routines.}
\label{lst:mutex}
\end{figure}

     
\end{figure}

\subsection{Internal scheduling}
The internal scheduling benchmark measures the cost of waiting on and signalling a condition variable. Listing \ref{lst:int-sched} shows the code for \CFA. The results are shown in Table \ref{tab:int-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.

\begin{figure}
\begin{cfacode}
volatile int go = 0;
condition c;
monitor M {};
M m1;

void __attribute__((noinline)) do_call( M & mutex a1 ) { signal(c); }

thread T {};
void ^?{}( T & mutex this ) {}
void main( T & this ) {
    while(go == 0) { yield(); }
    while(go == 1) { do_call(m1); }
}
int __attribute__((noinline)) do_wait( M & mutex a1 ) {
    go = 1;
    BENCH(
        for(size_t i=0; i<n; i++) {
            wait(c);
        },
        result
    )
    printf("%llu\n", result);
    go = 0;
    return 0;
}
int main() {
    T t;
    return do_wait(m1);
}
\end{cfacode}
\caption{Benchmark code for internal scheduling}
\label{lst:int-sched}
\end{figure}
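Purely for comparison with the POSIX API, the same wait/signal ping-pong can be sketched with a pthread condition variable; this is an illustration rather than one of the measured benchmarks, spurious wake-ups are ignored as in the \CFA version, and the \code{BENCH} harness, \code{n}, \code{result}, \code{<pthread.h>}, and \code{<sched.h>} are assumed:
\begin{cfacode}
pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
volatile int go = 0;

void * signaller( void * unused ) {
    while(go == 0) { sched_yield(); }
    while(go == 1) {
        pthread_mutex_lock( &lock );    // signal under the lock; during the benchmark
        pthread_cond_signal( &cond );   // the lock is only free while main is waiting
        pthread_mutex_unlock( &lock );
    }
    return NULL;
}

int main() {
    pthread_t t;
    pthread_create( &t, NULL, signaller, NULL );
    pthread_mutex_lock( &lock );
    go = 1;
    BENCH(
        for(size_t i=0; i<n; i++) {
            pthread_cond_wait( &cond, &lock );  // releases the lock while blocked
        },
        result
    )
    go = 0;
    pthread_mutex_unlock( &lock );
    printf("%llu\n", result);
    pthread_join( t, NULL );
}
\end{cfacode}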

\begin{figure}
\begin{center}
     
\end{figure}

\subsection{External scheduling}
The external scheduling benchmark measures the cost of the \code{waitfor} statement (\code{_Accept} in \uC). Listing \ref{lst:ext-sched} shows the code for \CFA. The results are shown in Table \ref{tab:ext-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.
     212
     213\begin{figure}
     214\begin{cfacode}
     215volatile int go = 0;
     216monitor M {};
     217M m1;
     218thread T {};
     219
     220void __attribute__((noinline)) do_call( M & mutex a1 ) {}
     221
     222void ^?{}( T & mutex this ) {}
     223void main( T & this ) {
     224        while(go == 0) { yield(); }
     225        while(go == 1) { do_call(m1); }
     226}
     227int  __attribute__((noinline)) do_wait( M & mutex a1 ) {
     228        go = 1;
     229        BENCH(
     230                for(size_t i=0; i<n; i++) {
     231                        waitfor(call, a1);
     232                },
     233                result
     234        )
     235        printf("%llu\n", result);
     236        go = 0;
     237        return 0;
     238}
     239int main() {
     240        T t;
     241        return do_wait(m1);
     242}
     243\end{cfacode}
     244\caption{Benchmark code for external scheduling}
     245\label{lst:ext-sched}
     246\end{figure}

\begin{figure}
\begin{center}
     
\end{figure}

\subsection{Object creation}
Finally, the last benchmark measures the cost of creating concurrent objects. Listing \ref{lst:creation} shows the code for pthreads and \CFA threads. The results are shown in Table \ref{tab:creation}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests. The only note here is that the call stacks of \CFA coroutines are created lazily; therefore, without priming the coroutine, the creation cost is very low. A sketch of an eager variant that primes the coroutine is shown after the listing.

\begin{figure}
\begin{multicols}{2}
pthread
\begin{cfacode}
int main() {
    BENCH(
        for(size_t i=0; i<n; i++) {
            pthread_t thread;
            if(pthread_create(
                &thread,
                NULL,
                foo,
                NULL
            ) < 0) {
                perror( "failure" );
                return 1;
            }

            if(pthread_join(
                thread,
                NULL
            ) < 0) {
                perror( "failure" );
                return 1;
            }
        },
        result
    )
    printf("%llu\n", result);
}
\end{cfacode}
\columnbreak
\CFA Threads
\begin{cfacode}
int main() {
    BENCH(
        for(size_t i=0; i<n; i++) {
            MyThread m;
        },
        result
    )

    printf("%llu\n", result);
}
\end{cfacode}
\end{multicols}
\caption{Benchmark code for pthreads and \CFA to measure object creation}
\label{lst:creation}
\end{figure}
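The eager coroutine variant in Table \ref{tab:creation} forces the call stack to be allocated by priming the coroutine with a single resume; it might look roughly like the following sketch (the coroutine type \code{MyCoroutine} and its \code{main} are assumptions for illustration, not taken from the benchmark suite):
\begin{cfacode}
coroutine MyCoroutine {};
void main(MyCoroutine & this) {
    while(true) { suspend(); }
}

int main() {
    BENCH(
        for(size_t i=0; i<n; i++) {
            MyCoroutine c;
            resume(c);  // priming resume forces the call stack to be created
        },
        result
    )
    printf("%llu\n", result);
}
\end{cfacode}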

\begin{figure}
\begin{center}
\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
\cline{2-4}
\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } & \multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
\hline
Pthreads              & 26974.5 & 26977   & 124.12 \\
\CFA Coroutines Lazy  & 5       & 5       & 0      \\
\CFA Coroutines Eager & 335.0   & 357.67  & 34.2   \\
\CFA Threads          & 1122.5  & 1109.86 & 36.54  \\
\uC Coroutines        & 106     & 107.04  & 1.61   \\
\uC Threads           & 525.5   & 533.04  & 11.14  \\
\hline
\end{tabular}