Ignore:
File:
1 edited

Legend:

Unmodified
Added
Removed
  • doc/proposals/concurrency/text/results.tex

    r20ffcf3 r383e159  
    55% ======================================================================
    66\section{Machine setup}
    7 Table \ref{tab:machine} shows the characteristiques of the machine used to run the benchmarks. All tests where made on this machine.
    8 \begin{figure}[H]
     7Table \ref{tab:machine} shows the characteristics of the machine used to run the benchmarks. All tests where made on this machine.
     8\begin{table}[H]
    99\begin{center}
    1010\begin{tabular}{| l | r | l | r |}
     
    2525\hline
    2626\hline
    27 Operating system                & Ubuntu 16.04.3 LTS    & Kernel                & Linux 4.4.0-97-generic \\
    28 \hline
    29 Compiler                        & gcc 6.3.0             & Translator    & CFA 1.0.0 \\
     27Operating system                & Ubuntu 16.04.3 LTS    & Kernel                & Linux 4.4-97-generic \\
     28\hline
     29Compiler                        & GCC 6.3               & Translator    & CFA 1 \\
     30\hline
     31Java version            & OpenJDK-9             & Go version    & 1.9.2 \\
    3032\hline
    3133\end{tabular}
     
    3335\caption{Machine setup used for the tests}
    3436\label{tab:machine}
    35 \end{figure}
     37\end{table}
    3638
    3739\section{Micro benchmarks}
     
    3941\begin{pseudo}
    4042#define BENCH(run, result)
    41         gettime();
     43        before = gettime();
    4244        run;
    43         gettime();
     45        after  = gettime();
    4446        result = (after - before) / N;
    4547\end{pseudo}
    46 The method used to get time is \code{clock_gettime(CLOCK_THREAD_CPUTIME_ID);}. Each benchmark is using many interations of a simple call to measure the cost of the call. The specific number of interation dependes on the specific benchmark.
     48The method used to get time is \code{clock_gettime(CLOCK_THREAD_CPUTIME_ID);}. Each benchmark is using many iterations of a simple call to measure the cost of the call. The specific number of iteration depends on the specific benchmark.
    4749
    4850\subsection{Context-switching}
    49 The first interesting benchmark is to measure how long context-switches take. The simplest approach to do this is to yield on a thread, which executes a 2-step context switch. In order to make the comparison fair, coroutines also execute a 2-step context-switch, which is a resume/suspend cycle instead of a yield. Listing \ref{lst:ctx-switch} shows the code for coroutines and threads. All omitted tests are functionally identical to one of these tests. The results can be shown in table \ref{tab:ctx-switch}.
     51The first interesting benchmark is to measure how long context-switches take. The simplest approach to do this is to yield on a thread, which executes a 2-step context switch. In order to make the comparison fair, coroutines also execute a 2-step context-switch (\gls{uthread} to \gls{kthread} then \gls{kthread} to \gls{uthread}), which is a resume/suspend cycle instead of a yield. Listing \ref{lst:ctx-switch} shows the code for coroutines and threads whith the results in table \ref{tab:ctx-switch}. All omitted tests are functionally identical to one of these tests.
    5052\begin{figure}
    5153\begin{multicols}{2}
     
    8890\end{cfacode}
    8991\end{multicols}
    90 \caption{\CFA benchmark code used to measure context-switches for coroutines and threads.}
    91 \label{lst:ctx-switch}
    92 \end{figure}
    93 
    94 \begin{figure}
    95 \begin{center}
    96 \begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
    97 \cline{2-4}
    98 \multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
    99 \hline
    100 Kernel Threads          & 239           & 242.57        & 5.54 \\
    101 \CFA Coroutines         & 38            & 38            & 0    \\
    102 \CFA Threads            & 102           & 102.39        & 1.57 \\
    103 \uC Coroutines          & 46            & 46.68 & 0.47 \\
    104 \uC Threads                     & 98            & 99.39 & 1.52 \\
    105 \hline
    106 \end{tabular}
    107 \end{center}
    108 \caption{Context Switch comparaison. All numbers are in nanoseconds(\si{\nano\second})}
     92\begin{cfacode}[caption={\CFA benchmark code used to measure context-switches for coroutines and threads.},label={lst:ctx-switch}]
     93\end{cfacode}
     94\end{figure}
     95
     96\begin{table}
     97\begin{center}
     98\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
     99\cline{2-4}
     100\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
     101\hline
     102Kernel Thread   & 241.5 & 243.86        & 5.08 \\
     103\CFA Coroutine  & 38            & 38            & 0    \\
     104\CFA Thread             & 103           & 102.96        & 2.96 \\
     105\uC Coroutine   & 46            & 45.86 & 0.35 \\
     106\uC Thread              & 98            & 99.11 & 1.42 \\
     107Goroutine               & 150           & 149.96        & 3.16 \\
     108Java Thread             & 289           & 290.68        & 8.72 \\
     109\hline
     110\end{tabular}
     111\end{center}
     112\caption{Context Switch comparison. All numbers are in nanoseconds(\si{\nano\second})}
    109113\label{tab:ctx-switch}
    110 \end{figure}
     114\end{table}
    111115
    112116\subsection{Mutual-exclusion}
    113 The next interesting benchmark is to measure the overhead to enter/leave a critical-section. For monitors, the simplest appraoch is to measure how long it takes enter and leave a monitor routine. Listing \ref{lst:mutex} shows the code for \CFA. To put the results in context, the cost of entering a non-inline function and the cost of acquiring and releasing a pthread mutex lock are also mesured. The results can be shown in table \ref{tab:mutex}.
    114 
    115 \begin{figure}
    116 \begin{cfacode}
     117The next interesting benchmark is to measure the overhead to enter/leave a critical-section. For monitors, the simplest approach is to measure how long it takes to enter and leave a monitor routine. Listing \ref{lst:mutex} shows the code for \CFA. To put the results in context, the cost of entering a non-inline function and the cost of acquiring and releasing a pthread mutex lock are also measured. The results can be shown in table \ref{tab:mutex}.
     118
     119\begin{figure}
     120\begin{cfacode}[caption={\CFA benchmark code used to measure mutex routines.},label={lst:mutex}]
    117121monitor M {};
    118122void __attribute__((noinline)) call( M & mutex m /*, m2, m3, m4*/ ) {}
     
    129133}
    130134\end{cfacode}
    131 \caption{\CFA benchmark code used to measure mutex routines.}
    132 \label{lst:mutex}
    133 \end{figure}
    134 
    135 \begin{figure}
    136 \begin{center}
    137 \begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
    138 \cline{2-4}
    139 \multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
    140 \hline
    141 C routine                                               & 2             & 2             & 0      \\
    142 Pthreads Mutex Lock                             & 31            & 31.86 & 0.99   \\
    143 \uC \code{monitor} member routine               & 30            & 30            & 0      \\
    144 \CFA \code{mutex} routine, 1 argument   & 46            & 46.14 & 0.74  \\
    145 \CFA \code{mutex} routine, 2 argument   & 82            & 83            & 1.93  \\
    146 \CFA \code{mutex} routine, 4 argument   & 165           & 161.15        & 54.04  \\
    147 \hline
    148 \end{tabular}
    149 \end{center}
    150 \caption{Mutex routine comparaison. All numbers are in nanoseconds(\si{\nano\second})}
     135\end{figure}
     136
     137\begin{table}
     138\begin{center}
     139\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
     140\cline{2-4}
     141\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
     142\hline
     143C routine                                               & 2             & 2             & 0    \\
     144FetchAdd + FetchSub                             & 26            & 26            & 0    \\
     145Pthreads Mutex Lock                             & 31            & 31.86 & 0.99 \\
     146\uC \code{monitor} member routine               & 30            & 30            & 0    \\
     147\CFA \code{mutex} routine, 1 argument   & 41            & 41.57 & 0.9  \\
     148\CFA \code{mutex} routine, 2 argument   & 76            & 76.96 & 1.57 \\
     149\CFA \code{mutex} routine, 4 argument   & 145           & 146.68        & 3.85 \\
     150Java synchronized routine                       & 27            & 28.57 & 2.6  \\
     151\hline
     152\end{tabular}
     153\end{center}
     154\caption{Mutex routine comparison. All numbers are in nanoseconds(\si{\nano\second})}
    151155\label{tab:mutex}
    152 \end{figure}
     156\end{table}
    153157
    154158\subsection{Internal scheduling}
    155 The Internal scheduling benchmark measures the cost of waiting on and signaling a condition variable. Listing \ref{lst:int-sched} shows the code for \CFA. The results can be shown in table \ref{tab:int-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.
    156 
    157 \begin{figure}
    158 \begin{cfacode}
     159The internal-scheduling benchmark measures the cost of waiting on and signalling a condition variable. Listing \ref{lst:int-sched} shows the code for \CFA, with results table \ref{tab:int-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.
     160
     161\begin{figure}
     162\begin{cfacode}[caption={Benchmark code for internal scheduling},label={lst:int-sched}]
    159163volatile int go = 0;
    160164condition c;
     
    187191}
    188192\end{cfacode}
    189 \caption{Benchmark code for internal scheduling}
    190 \label{lst:int-sched}
    191 \end{figure}
    192 
    193 \begin{figure}
    194 \begin{center}
    195 \begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
    196 \cline{2-4}
    197 \multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
    198 \hline
    199 \uC \code{signal}                                       & 322           & 322.57        & 2.77  \\
    200 \CFA \code{signal}, 1 \code{monitor}    & 1145  & 1163.64       & 27.52 \\
    201 \CFA \code{signal}, 2 \code{monitor}    & 1531  & 1550.75       & 32.77 \\
    202 \CFA \code{signal}, 4 \code{monitor}    & 2288.5        & 2326.86       & 54.73 \\
    203 \hline
    204 \end{tabular}
    205 \end{center}
    206 \caption{Internal scheduling comparaison. All numbers are in nanoseconds(\si{\nano\second})}
     193\end{figure}
     194
     195\begin{table}
     196\begin{center}
     197\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
     198\cline{2-4}
     199\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
     200\hline
     201\uC \code{signal}                                       & 322           & 323   & 3.36   \\
     202\CFA \code{signal}, 1 \code{monitor}    & 352.5 & 353.11        & 3.66   \\
     203\CFA \code{signal}, 2 \code{monitor}    & 430           & 430.29        & 8.97   \\
     204\CFA \code{signal}, 4 \code{monitor}    & 594.5 & 606.57        & 18.33  \\
     205Java \code{notify}                              & 13831.5       & 15698.21      & 4782.3 \\
     206\hline
     207\end{tabular}
     208\end{center}
     209\caption{Internal scheduling comparison. All numbers are in nanoseconds(\si{\nano\second})}
    207210\label{tab:int-sched}
    208 \end{figure}
     211\end{table}
    209212
    210213\subsection{External scheduling}
    211 The Internal scheduling benchmark measures the cost of the \code{waitfor} statement (\code{_Accept} in \uC). Listing \ref{lst:ext-sched} shows the code for \CFA. The results can be shown in table \ref{tab:ext-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.
    212 
    213 \begin{figure}
    214 \begin{cfacode}
     214The Internal scheduling benchmark measures the cost of the \code{waitfor} statement (\code{_Accept} in \uC). Listing \ref{lst:ext-sched} shows the code for \CFA, with results in table \ref{tab:ext-sched}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests.
     215
     216\begin{figure}
     217\begin{cfacode}[caption={Benchmark code for external scheduling},label={lst:ext-sched}]
    215218volatile int go = 0;
    216219monitor M {};
     
    242245}
    243246\end{cfacode}
    244 \caption{Benchmark code for external scheduling}
    245 \label{lst:ext-sched}
    246 \end{figure}
    247 
    248 \begin{figure}
    249 \begin{center}
    250 \begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
    251 \cline{2-4}
    252 \multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
    253 \hline
    254 \uC \code{Accept}                                       & 349           & 339.32        & 3.14  \\
    255 \CFA \code{waitfor}, 1 \code{monitor}   & 1155.5        & 1142.04       & 25.23 \\
    256 \CFA \code{waitfor}, 2 \code{monitor}   & 1361  & 1376.75       & 28.81 \\
    257 \CFA \code{waitfor}, 4 \code{monitor}   & 1941.5        & 1957.07       & 34.7  \\
    258 \hline
    259 \end{tabular}
    260 \end{center}
    261 \caption{External scheduling comparaison. All numbers are in nanoseconds(\si{\nano\second})}
     247\end{figure}
     248
     249\begin{table}
     250\begin{center}
     251\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
     252\cline{2-4}
     253\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
     254\hline
     255\uC \code{Accept}                                       & 350           & 350.61        & 3.11  \\
     256\CFA \code{waitfor}, 1 \code{monitor}   & 358.5 & 358.36        & 3.82  \\
     257\CFA \code{waitfor}, 2 \code{monitor}   & 422           & 426.79        & 7.95  \\
     258\CFA \code{waitfor}, 4 \code{monitor}   & 579.5 & 585.46        & 11.25 \\
     259\hline
     260\end{tabular}
     261\end{center}
     262\caption{External scheduling comparison. All numbers are in nanoseconds(\si{\nano\second})}
    262263\label{tab:ext-sched}
    263 \end{figure}
     264\end{table}
    264265
    265266\subsection{Object creation}
    266 Finaly, the last benchmark measured is the cost of creation for concurrent objects. Listing \ref{lst:creation} shows the code for pthreads and \CFA threads. The results can be shown in table \ref{tab:creation}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests. The only note here is that the callstacks of \CFA coroutines are lazily created, therefore without priming the coroutine, the creation cost is very low.
    267 
    268 \begin{figure}
    269 \begin{multicols}{2}
     267Finally, the last benchmark measurs the cost of creation for concurrent objects. Listing \ref{lst:creation} shows the code for pthreads and \CFA threads, with results shown in table \ref{tab:creation}. As with all other benchmarks, all omitted tests are functionally identical to one of these tests. The only note here is that the call-stacks of \CFA coroutines are lazily created, therefore without priming the coroutine, the creation cost is very low.
     268
     269\begin{figure}
     270\begin{center}
    270271pthread
    271 \begin{cfacode}
     272\begin{ccode}
    272273int main() {
    273274        BENCH(
    274275                for(size_t i=0; i<n; i++) {
    275276                        pthread_t thread;
    276                         if(pthread_create(
    277                                 &thread,
    278                                 NULL,
    279                                 foo,
    280                                 NULL
    281                         ) < 0) {
     277                        if(pthread_create(&thread,NULL,foo,NULL)<0) {
    282278                                perror( "failure" );
    283279                                return 1;
    284280                        }
    285281
    286                         if(pthread_join(
    287                                 thread,
    288                                 NULL
    289                         ) < 0) {
     282                        if(pthread_join(thread, NULL)<0) {
    290283                                perror( "failure" );
    291284                                return 1;
     
    296289        printf("%llu\n", result);
    297290}
    298 \end{cfacode}
    299 \columnbreak
     291\end{ccode}
     292
     293
     294
    300295\CFA Threads
    301296\begin{cfacode}
     
    307302                result
    308303        )
    309 
    310         printf("%llu\n", result);
    311 }
    312 \end{cfacode}
    313 \end{multicols}
    314 \caption{Bechmark code for pthreads and \CFA to measure object creation}
    315 \label{lst:creation}
    316 \end{figure}
    317 
    318 \begin{figure}
    319 \begin{center}
    320 \begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
    321 \cline{2-4}
    322 \multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
    323 \hline
    324 Pthreads                        & 26974.5       & 26977 & 124.12 \\
    325 \CFA Coroutines Lazy    & 5             & 5             & 0      \\
    326 \CFA Coroutines Eager   & 335.0 & 357.67        & 34.2   \\
    327 \CFA Threads            & 1122.5        & 1109.86       & 36.54  \\
    328 \uC Coroutines          & 106           & 107.04        & 1.61   \\
    329 \uC Threads                     & 525.5 & 533.04        & 11.14  \\
    330 \hline
    331 \end{tabular}
    332 \end{center}
    333 \caption{Creation comparaison. All numbers are in nanoseconds(\si{\nano\second})}
     304        printf("%llu\n", result);
     305}
     306\end{cfacode}
     307\end{center}
     308\begin{cfacode}[caption={Benchmark code for pthreads and \CFA to measure object creation},label={lst:creation}]
     309\end{cfacode}
     310\end{figure}
     311
     312\begin{table}
     313\begin{center}
     314\begin{tabular}{| l | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] | S[table-format=5.2,table-number-alignment=right] |}
     315\cline{2-4}
     316\multicolumn{1}{c |}{} & \multicolumn{1}{c |}{ Median } &\multicolumn{1}{c |}{ Average } & \multicolumn{1}{c |}{ Standard Deviation} \\
     317\hline
     318Pthreads                        & 26996 & 26984.71      & 156.6  \\
     319\CFA Coroutine Lazy     & 6             & 5.71  & 0.45   \\
     320\CFA Coroutine Eager    & 708           & 706.68        & 4.82   \\
     321\CFA Thread                     & 1173.5        & 1176.18       & 15.18  \\
     322\uC Coroutine           & 109           & 107.46        & 1.74   \\
     323\uC Thread                      & 526           & 530.89        & 9.73   \\
     324Goroutine                       & 2520.5        & 2530.93       & 61,56  \\
     325Java Thread                     & 91114.5       & 92272.79      & 961.58 \\
     326\hline
     327\end{tabular}
     328\end{center}
     329\caption{Creation comparison. All numbers are in nanoseconds(\si{\nano\second})}
    334330\label{tab:creation}
    335 \end{figure}
     331\end{table}
Note: See TracChangeset for help on using the changeset viewer.