Ignore:
File:
1 edited

Legend:

Unmodified
Added
Removed
  • doc/proposals/concurrency/text/results.tex

    r383e159 r20ffcf3  
    55% ======================================================================
    66\section{Machine setup}
    7 Table \ref{tab:machine} shows the characteristics of the machine used to run the benchmarks. All tests where made on this machine.
    8 \begin{table}[H]
     7Table \ref{tab:machine} shows the characteristiques of the machine used to run the benchmarks. All tests where made on this machine.
     8\begin{figure}[H]
    99\begin{center}
    1010\begin{tabular}{| l | r | l | r |}
     
    2525\hline
    2626\hline
    27 Operating system                & Ubuntu 16.04.3 LTS    & Kernel                & Linux 4.4-97-generic \\
    28 \hline
    29 Compiler                        & GCC 6.3               & Translator    & CFA 1 \\
    30 \hline
    31 Java version            & OpenJDK-9             & Go version    & 1.9.2 \\
     27Operating system                & Ubuntu 16.04.3 LTS    & Kernel                & Linux 4.4.0-97-generic \\
     28\hline
     29Compiler                        & gcc 6.3.0             & Translator    & CFA 1.0.0 \\
    3230\hline
    3331\end{tabular}
     
    3533\caption{Machine setup used for the tests}
    3634\label{tab:machine}
    37 \end{table}
     35\end{figure}
    3836
    3937\section{Micro benchmarks}
     
    4139\begin{pseudo}
    4240#define BENCH(run, result)
    43         before = gettime();
     41        gettime();
    4442        run;
    45         after  = gettime();
     43        gettime();
    4644        result = (after - before) / N;
    4745\end{pseudo}
    48 The method used to get time is \code{clock_gettime(CLOCK_THREAD_CPUTIME_ID);}. Each benchmark is using many iterations of a simple call to measure the cost of the call. The specific number of iteration depends on the specific benchmark.
     46The method used to get time is \code{clock_gettime(CLOCK_THREAD_CPUTIME_ID);}. Each benchmark is using many interations of a simple call to measure the cost of the call. The specific number of interation dependes on the specific benchmark.
    4947
    5048\subsection{Context-switching}
    51 The first interesting benchmark is to measure how long context-switches take. The simplest approach to do this is to yield on a thread, which executes a 2-step context switch. In order to make the comparison fair, coroutines also execute a 2-step context-switch (\gls{uthread} to \gls{kthread} then \gls{kthread} to \gls{uthread}), which is a resume/suspend cycle instead of a yield. Listing \ref{lst:ctx-switch} shows the code for coroutines and threads whith the results in table \ref{tab:ctx-switch}. All omitted tests are functionally identical to one of these tests.
     49The first interesting benchmark is to measure how long context-switches take. The simplest approach to do this is to yield on a thread, which executes a 2-step context switch. In order to make the comparison fair, coroutines also execute a 2-step context-switch, which is a resume/suspend cycle instead of a yield. Listing \ref{lst:ctx-switch} shows the code for coroutines and threads. All omitted tests are functionally identical to one of these tests. The results can be shown in table \ref{tab:ctx-switch}.
    5250\begin{figure}
    5351\begin{multicols}{2}
     
    9088\end{cfacode}
    9189\end{multicols}
    92 \begin{cfacode}[caption={\CFA benchmark code used to measure context-switches for coroutines and threads.},label={lst:ctx-switch}]
    93 \end{cfacode}
    94 \end{figure}
    95 
    96 \begin{table}
    97 \begin{center}
    98 \begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
    99 \cline{2-4}
    100 \multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
    101 \hline
    102 Kernel Thread   & 241.5 & 243.86        & 5.08 \\
    103 \CFA Coroutine  & 38            & 38            & 0    \\
    104 \CFA Thread             & 103           & 102.96        & 2.96 \\
    105 \uC Coroutine   & 46            & 45.86 & 0.35 \\
    106 \uC Thread              & 98            & 99.11 & 1.42 \\
    107 Goroutine               & 150           & 149.96        & 3.16 \\
    108 Java Thread             & 289           & 290.68        & 8.72 \\
    109 \hline
    110 \end{tabular}
    111 \end{center}
    112 \caption{Context Switch comparison. All numbers are in nanoseconds(\si{\nano\second})}
     90\caption{\CFA benchmark code used to measure context-switches for coroutines and threads.}
     91\label{lst:ctx-switch}
     92\end{figure}
     93
     94\begin{figure}
     95\begin{center}
     96\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
     97\cline{2-4}
     98\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
     99\hline
     100Kernel Threads          & 239           & 242.57        & 5.54 \\
     101\CFA Coroutines         & 38            & 38            & 0    \\
     102\CFA Threads            & 102           & 102.39        & 1.57 \\
     103\uC Coroutines          & 46            & 46.68 & 0.47 \\
     104\uC Threads                     & 98            & 99.39 & 1.52 \\
     105\hline
     106\end{tabular}
     107\end{center}
     108\caption{Context Switch comparaison. All numbers are in nanoseconds(\si{\nano\second})}
    113109\label{tab:ctx-switch}
    114 \end{table}
     110\end{figure}
    115111
    116112\subsection{Mutual-exclusion}
    117 The next interesting benchmark is to measure the overhead to enter/leave a critical-section. For monitors, the simplest approach is to measure how long it takes to enter and leave a monitor routine. Listing \ref{lst:mutex} shows the code for \CFA. To put the results in context, the cost of entering a non-inline function and the cost of acquiring and releasing a pthread mutex lock are also measured. The results can be shown in table \ref{tab:mutex}.
    118 
    119 \begin{figure}
    120 \begin{cfacode}[caption={\CFA benchmark code used to measure mutex routines.},label={lst:mutex}]
     113The next interesting benchmark is to measure the overhead to enter/leave a critical-section. For monitors, the simplest appraoch is to measure how long it takes enter and leave a monitor routine. Listing \ref{lst:mutex} shows the code for \CFA. To put the results in context, the cost of entering a non-inline function and the cost of acquiring and releasing a pthread mutex lock are also mesured. The results can be shown in table \ref{tab:mutex}.
     114
     115\begin{figure}
     116\begin{cfacode}
    121117monitor M {};
    122118void __attribute__((noinline)) call( M & mutex m /*, m2, m3, m4*/ ) {}
     
    133129}
    134130\end{cfacode}
    135 \end{figure}
    136 
    137 \begin{table}
    138 \begin{center}
    139 \begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
    140 \cline{2-4}
    141 \multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
    142 \hline
    143 C routine                                               & 2             & 2             & 0    \\
    144 FetchAdd + FetchSub                             & 26            & 26            & 0    \\
    145 Pthreads Mutex Lock                             & 31            & 31.86 & 0.99 \\
    146 \uC \code{monitor} member routine               & 30            & 30            & 0    \\
    147 \CFA \code{mutex} routine, 1 argument   & 41            & 41.57 & 0.9  \\
    148 \CFA \code{mutex} routine, 2 argument   & 76            & 76.96 & 1.57 \\
    149 \CFA \code{mutex} routine, 4 argument   & 145           & 146.68        & 3.85 \\
    150 Java synchronized routine                       & 27            & 28.57 & 2.6  \\
    151 \hline
    152 \end{tabular}
    153 \end{center}
    154 \caption{Mutex routine comparison. All numbers are in nanoseconds(\si{\nano\second})}
     131\caption{\CFA benchmark code used to measure mutex routines.}
     132\label{lst:mutex}
     133\end{figure}
     134
     135\begin{figure}
     136\begin{center}
     137\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
     138\cline{2-4}
     139\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
     140\hline
     141C routine                                               & 2             & 2             & 0      \\
     142Pthreads Mutex Lock                             & 31            & 31.86 & 0.99   \\
     143\uC \code{monitor} member routine               & 30            & 30            & 0      \\
     144\CFA \code{mutex} routine, 1 argument   & 46            & 46.14 & 0.74  \\
     145\CFA \code{mutex} routine, 2 argument   & 82            & 83            & 1.93  \\
     146\CFA \code{mutex} routine, 4 argument   & 165           & 161.15        & 54.04  \\
     147\hline
     148\end{tabular}
     149\end{center}
     150\caption{Mutex routine comparaison. All numbers are in nanoseconds(\si{\nano\second})}
    155151\label{tab:mutex}
    156 \end{table}
     152\end{figure}
    157153
    158154\subsection{Internal scheduling}
    159 The internal-scheduling benchmark measures the cost of waiting on and signalling a condition variable. Listing \ref{lst:int-sched} shows the code for \CFA, with results table \ref{tab:int-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.
    160 
    161 \begin{figure}
    162 \begin{cfacode}[caption={Benchmark code for internal scheduling},label={lst:int-sched}]
     155The Internal scheduling benchmark measures the cost of waiting on and signaling a condition variable. Listing \ref{lst:int-sched} shows the code for \CFA. The results can be shown in table \ref{tab:int-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.
     156
     157\begin{figure}
     158\begin{cfacode}
    163159volatile int go = 0;
    164160condition c;
     
    191187}
    192188\end{cfacode}
    193 \end{figure}
    194 
    195 \begin{table}
    196 \begin{center}
    197 \begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
    198 \cline{2-4}
    199 \multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
    200 \hline
    201 \uC \code{signal}                                       & 322           & 323   & 3.36   \\
    202 \CFA \code{signal}, 1 \code{monitor}    & 352.5 & 353.11        & 3.66   \\
    203 \CFA \code{signal}, 2 \code{monitor}    & 430           & 430.29        & 8.97   \\
    204 \CFA \code{signal}, 4 \code{monitor}    & 594.5 & 606.57        & 18.33  \\
    205 Java \code{notify}                              & 13831.5       & 15698.21      & 4782.3 \\
    206 \hline
    207 \end{tabular}
    208 \end{center}
    209 \caption{Internal scheduling comparison. All numbers are in nanoseconds(\si{\nano\second})}
     189\caption{Benchmark code for internal scheduling}
     190\label{lst:int-sched}
     191\end{figure}
     192
     193\begin{figure}
     194\begin{center}
     195\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
     196\cline{2-4}
     197\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
     198\hline
     199\uC \code{signal}                                       & 322           & 322.57        & 2.77  \\
     200\CFA \code{signal}, 1 \code{monitor}    & 1145  & 1163.64       & 27.52 \\
     201\CFA \code{signal}, 2 \code{monitor}    & 1531  & 1550.75       & 32.77 \\
     202\CFA \code{signal}, 4 \code{monitor}    & 2288.5        & 2326.86       & 54.73 \\
     203\hline
     204\end{tabular}
     205\end{center}
     206\caption{Internal scheduling comparaison. All numbers are in nanoseconds(\si{\nano\second})}
    210207\label{tab:int-sched}
    211 \end{table}
     208\end{figure}
    212209
    213210\subsection{External scheduling}
    214 The Internal scheduling benchmark measures the cost of the \code{waitfor} statement (\code{_Accept} in \uC). Listing \ref{lst:ext-sched} shows the code for \CFA, with results in table \ref{tab:ext-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.
    215 
    216 \begin{figure}
    217 \begin{cfacode}[caption={Benchmark code for external scheduling},label={lst:ext-sched}]
     211The Internal scheduling benchmark measures the cost of the \code{waitfor} statement (\code{_Accept} in \uC). Listing \ref{lst:ext-sched} shows the code for \CFA. The results can be shown in table \ref{tab:ext-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.
     212
     213\begin{figure}
     214\begin{cfacode}
    218215volatile int go = 0;
    219216monitor M {};
     
    245242}
    246243\end{cfacode}
    247 \end{figure}
    248 
    249 \begin{table}
    250 \begin{center}
    251 \begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
    252 \cline{2-4}
    253 \multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
    254 \hline
    255 \uC \code{Accept}                                       & 350           & 350.61        & 3.11  \\
    256 \CFA \code{waitfor}, 1 \code{monitor}   & 358.5 & 358.36        & 3.82  \\
    257 \CFA \code{waitfor}, 2 \code{monitor}   & 422           & 426.79        & 7.95  \\
    258 \CFA \code{waitfor}, 4 \code{monitor}   & 579.5 & 585.46        & 11.25 \\
    259 \hline
    260 \end{tabular}
    261 \end{center}
    262 \caption{External scheduling comparison. All numbers are in nanoseconds(\si{\nano\second})}
     244\caption{Benchmark code for external scheduling}
     245\label{lst:ext-sched}
     246\end{figure}
     247
     248\begin{figure}
     249\begin{center}
     250\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
     251\cline{2-4}
     252\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
     253\hline
     254\uC \code{Accept}                                       & 349           & 339.32        & 3.14  \\
     255\CFA \code{waitfor}, 1 \code{monitor}   & 1155.5        & 1142.04       & 25.23 \\
     256\CFA \code{waitfor}, 2 \code{monitor}   & 1361  & 1376.75       & 28.81 \\
     257\CFA \code{waitfor}, 4 \code{monitor}   & 1941.5        & 1957.07       & 34.7  \\
     258\hline
     259\end{tabular}
     260\end{center}
     261\caption{External scheduling comparaison. All numbers are in nanoseconds(\si{\nano\second})}
    263262\label{tab:ext-sched}
    264 \end{table}
     263\end{figure}
    265264
    266265\subsection{Object creation}
    267 Finally, the last benchmark measurs the cost of creation for concurrent objects. Listing \ref{lst:creation} shows the code for pthreads and \CFA threads, with results shown in table \ref{tab:creation}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests. The only note here is that the call-stacks of \CFA coroutines are lazily created, therefore without priming the coroutine, the creation cost is very low.
    268 
    269 \begin{figure}
    270 \begin{center}
     266Finaly, the last benchmark measured is the cost of creation for concurrent objects. Listing \ref{lst:creation} shows the code for pthreads and \CFA threads. The results can be shown in table \ref{tab:creation}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests. The only note here is that the callstacks of \CFA coroutines are lazily created, therefore without priming the coroutine, the creation cost is very low.
     267
     268\begin{figure}
     269\begin{multicols}{2}
    271270pthread
    272 \begin{ccode}
     271\begin{cfacode}
    273272int main() {
    274273        BENCH(
    275274                for(size_t i=0; i<n; i++) {
    276275                        pthread_t thread;
    277                         if(pthread_create(&thread,NULL,foo,NULL)<0) {
     276                        if(pthread_create(
     277                                &thread,
     278                                NULL,
     279                                foo,
     280                                NULL
     281                        ) < 0) {
    278282                                perror( "failure" );
    279283                                return 1;
    280284                        }
    281285
    282                         if(pthread_join(thread, NULL)<0) {
     286                        if(pthread_join(
     287                                thread,
     288                                NULL
     289                        ) < 0) {
    283290                                perror( "failure" );
    284291                                return 1;
     
    289296        printf("%llu\n", result);
    290297}
    291 \end{ccode}
    292 
    293 
    294 
     298\end{cfacode}
     299\columnbreak
    295300\CFA Threads
    296301\begin{cfacode}
     
    302307                result
    303308        )
    304         printf("%llu\n", result);
    305 }
    306 \end{cfacode}
    307 \end{center}
    308 \begin{cfacode}[caption={Benchmark code for pthreads and \CFA to measure object creation},label={lst:creation}]
    309 \end{cfacode}
    310 \end{figure}
    311 
    312 \begin{table}
    313 \begin{center}
    314 \begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
    315 \cline{2-4}
    316 \multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
    317 \hline
    318 Pthreads                        & 26996 & 26984.71      & 156.6  \\
    319 \CFA Coroutine Lazy     & 6             & 5.71  & 0.45   \\
    320 \CFA Coroutine Eager    & 708           & 706.68        & 4.82   \\
    321 \CFA Thread                     & 1173.5        & 1176.18       & 15.18  \\
    322 \uC Coroutine           & 109           & 107.46        & 1.74   \\
    323 \uC Thread                      & 526           & 530.89        & 9.73   \\
    324 Goroutine                       & 2520.5        & 2530.93       & 61,56  \\
    325 Java Thread                     & 91114.5       & 92272.79      & 961.58 \\
    326 \hline
    327 \end{tabular}
    328 \end{center}
    329 \caption{Creation comparison. All numbers are in nanoseconds(\si{\nano\second})}
     309
     310        printf("%llu\n", result);
     311}
     312\end{cfacode}
     313\end{multicols}
     314\caption{Bechmark code for pthreads and \CFA to measure object creation}
     315\label{lst:creation}
     316\end{figure}
     317
     318\begin{figure}
     319\begin{center}
     320\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
     321\cline{2-4}
     322\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
     323\hline
     324Pthreads                        & 26974.5       & 26977 & 124.12 \\
     325\CFA Coroutines Lazy    & 5             & 5             & 0      \\
     326\CFA Coroutines Eager   & 335.0 & 357.67        & 34.2   \\
     327\CFA Threads            & 1122.5        & 1109.86       & 36.54  \\
     328\uC Coroutines          & 106           & 107.04        & 1.61   \\
     329\uC Threads                     & 525.5 & 533.04        & 11.14  \\
     330\hline
     331\end{tabular}
     332\end{center}
     333\caption{Creation comparaison. All numbers are in nanoseconds(\si{\nano\second})}
    330334\label{tab:creation}
    331 \end{table}
     335\end{figure}
Note: See TracChangeset for help on using the changeset viewer.