Changeset 397edf7
- Timestamp:
- Jun 24, 2019, 1:25:54 PM (5 years ago)
- Branches:
- ADT, arm-eh, ast-experimental, enum, forall-pointer-decay, jacob/cs343-translation, jenkins-sandbox, master, new-ast, new-ast-unique-expr, pthread-emulation, qualifiedEnum
- Children:
- 3253c32
- Parents:
- 9e0a360
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
doc/papers/concurrency/Paper.tex
r9e0a360 r397edf7 2705 2705 \label{results} 2706 2706 2707 To verify the implementation of the \CFA runtime, a series of microbenchmarks are performed comparing \CFA with Java OpenJDK-9, Go 1.9.2 and \uC 7.0.0. 2708 The benchmark computer is an AMD Opteron\texttrademark\ 6380 NUMA 64-core, 8 socket, 2.5 GHz processor, running Ubuntu 16.04.6 LTS, and \uC/\CFA are compiled with gcc 6.5. 2709 2710 \begin{comment} 2711 \begin{table} 2712 \centering 2713 \caption{Experiment environment} 2714 \label{t:machine} 2715 2716 \begin{tabular}{|l|r||l|r|} 2717 \hline 2718 Architecture & x86\_64 & NUMA node(s) & 8 \\ 2719 \hline 2720 CPU op-mode(s) & 32-bit, 64-bit & Model name & AMD Opteron\texttrademark\ Processor 6380 \\ 2721 \hline 2722 Byte Order & Little Endian & CPU Freq & 2.5 GHz \\ 2723 \hline 2724 CPU(s) & 64 & L1d cache & 16 KiB \\ 2725 \hline 2726 Thread(s) per core & 2 & L1i cache & 64 KiB \\ 2727 \hline 2728 Core(s) per socket & 8 & L2 cache & 2048 KiB \\ 2729 \hline 2730 Socket(s) & 4 & L3 cache & 6144 KiB \\ 2731 \hline 2732 \hline 2733 Operating system & Ubuntu 16.04.3 LTS & Kernel & Linux 4.4-97-generic \\ 2734 \hline 2735 gcc & 6.3 & \CFA & 1.0.0 \\ 2736 \hline 2737 Java & OpenJDK-9 & Go & 1.9.2 \\ 2738 \hline 2739 \end{tabular} 2740 \end{table} 2741 \end{comment} 2707 To verify the implementation of the \CFA runtime, a series of microbenchmarks are performed comparing \CFA with pthreads, Java OpenJDK-9, Go 1.12.6 and \uC 7.0.0. 2708 For comparison, the package must be multi-processor (M:N), which excludes libdill/libmil~\cite{libdill} (M:1)), and use a shared-memory programming model, \eg not message passing. 2709 The benchmark computer is an AMD Opteron\texttrademark\ 6380 NUMA 64-core, 8 socket, 2.5 GHz processor, running Ubuntu 16.04.6 LTS, and \CFA/\uC are compiled with gcc 6.5. 2742 2710 2743 2711 All benchmarks are run using the following harness. … … 2749 2717 Each benchmark is performed @N@ times, where @N@ varies depending on the benchmark; 2750 2718 the total time is divided by @N@ to obtain the average time for a benchmark. 2719 Each benchmark experiment is run 31 times. 2751 2720 All omitted tests for other languages are functionally identical to the \CFA tests and available online~\cite{CforallBenchMarks}. 2752 2721 … … 2779 2748 \begin{tabular}[t]{@{}r*{3}{D{.}{.}{5.2}}@{}} 2780 2749 \multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} & \multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\ 2781 Pthreads & 28091 & 28073.39 & 163.1 \\ 2782 \CFA Coroutine Lazy & 6 & 6.07 & 0.26 \\ 2783 \CFA Coroutine Eager & 520 & 520.61 & 2.04 \\ 2784 \CFA Thread & 2032 & 2016.29 & 112.07 \\ 2785 \uC Coroutine & 106 & 107.36 & 1.47 \\ 2786 \uC Thread & 536.5 & 537.07 & 4.64 \\ 2787 Goroutine & 3103 & 3086.29 & 90.25 \\ 2788 Java Thread & 103416.5 & 103732.29 & 1137 \\ 2750 \CFA Coroutine Lazy & 14.3 & 14.3 & 0.32 \\ 2751 \CFA Coroutine Eager & 2203.7 & 2205.6 & 26.03 \\ 2752 \CFA Thread & 1257.8 & 1291.2 & 86.19 \\ 2753 \uC Coroutine & 92.2 & 91.4 & 1.58 \\ 2754 \uC Thread & 499.5 & 500.1 & 5.67 \\ 2755 Goroutine & 4397.0 & 4362.8 & 390.77 \\ 2756 Java Thread & 107405.0 & 107794.8 & 1601.33 \\ 2757 % Qthreads & 159.9 & 159.6 & 0.73 \\ 2758 Pthreads & 32920.9 & 32882.7 & 213.55 2759 \end{tabular} 2760 \end{multicols} 2761 2762 2763 \paragraph{Internal Scheduling} 2764 2765 Internal scheduling is measured using a cycle of two threads signalling and waiting. 2766 Figure~\ref{f:int-sched} shows the code for \CFA, with results in Table~\ref{tab:int-sched}. 2767 Note, the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects. 2768 Java scheduling is significantly greater because the benchmark explicitly creates multiple thread in order to prevent the JIT from making the program sequential, \ie removing all locking. 2769 2770 \begin{multicols}{2} 2771 \lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}} 2772 \begin{cfa} 2773 volatile int go = 0; 2774 @monitor@ M { @condition c;@ } m; 2775 void __attribute__((noinline)) 2776 do_call( M & @mutex@ a1 ) { @signal( c );@ } 2777 thread T {}; 2778 void main( T & this ) { 2779 while ( go == 0 ) { yield(); } 2780 while ( go == 1 ) { do_call( m ); } 2781 } 2782 int __attribute__((noinline)) 2783 do_wait( M & mutex m ) with(m) { 2784 go = 1; // continue other thread 2785 BENCH( for ( N ) { @wait( c );@ } ); 2786 go = 0; // stop other thread 2787 sout | result`ns; 2788 } 2789 int main() { 2790 T t; 2791 do_wait( m ); 2792 } 2793 \end{cfa} 2794 \captionof{figure}{\CFA Internal-scheduling benchmark} 2795 \label{f:int-sched} 2796 2797 \columnbreak 2798 2799 \vspace*{-16pt} 2800 \captionof{table}{Internal-scheduling comparison (nanoseconds)} 2801 \label{tab:int-sched} 2802 \bigskip 2803 2804 \begin{tabular}{@{}r*{3}{D{.}{.}{5.2}}@{}} 2805 \multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} & \multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\ 2806 \CFA @signal@, 1 @monitor@ & 367.0 & 371.5 & 17.34 \\ 2807 \CFA @signal@, 2 @monitor@ & 477.2 & 478.6 & 8.31 \\ 2808 \CFA @signal@, 4 @monitor@ & 725.8 & 734.0 & 17.98 \\ 2809 \uC @signal@ & 322.8 & 323.0 & 3.64 \\ 2810 Java @notify@ & 16520.0 & 20096.7 & 9378.53 \\ 2811 Pthreads Cond. Variable & 4931.3 & 5057.0 & 326.80 2812 \end{tabular} 2813 \end{multicols} 2814 2815 2816 \paragraph{External Scheduling} 2817 2818 External scheduling is measured using a cycle of two threads calling and accepting the call using the @waitfor@ statement. 2819 Figure~\ref{f:ext-sched} shows the code for \CFA, with results in Table~\ref{tab:ext-sched}. 2820 Note, the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects. 2821 2822 \begin{multicols}{2} 2823 \lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}} 2824 \vspace*{-16pt} 2825 \begin{cfa} 2826 volatile int go = 0; 2827 @monitor@ M {} m; 2828 thread T {}; 2829 void __attribute__((noinline)) 2830 do_call( M & @mutex@ ) {} 2831 void main( T & ) { 2832 while ( go == 0 ) { yield(); } 2833 while ( go == 1 ) { do_call( m ); } 2834 } 2835 int __attribute__((noinline)) 2836 do_wait( M & @mutex@ m ) { 2837 go = 1; // continue other thread 2838 BENCH( for ( N ) { @waitfor( do_call, m );@ } ) 2839 go = 0; // stop other thread 2840 sout | result`ns; 2841 } 2842 int main() { 2843 T t; 2844 do_wait( m ); 2845 } 2846 \end{cfa} 2847 \captionof{figure}{\CFA external-scheduling benchmark} 2848 \label{f:ext-sched} 2849 2850 \columnbreak 2851 2852 \vspace*{-16pt} 2853 \captionof{table}{External-scheduling comparison (nanoseconds)} 2854 \label{tab:ext-sched} 2855 \begin{tabular}{@{}r*{3}{D{.}{.}{3.2}}@{}} 2856 \multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\ 2857 \CFA @waitfor@, 1 @monitor@ & 366.7 & 369.5 & 7.52 \\ 2858 \CFA @waitfor@, 2 @monitor@ & 453.6 & 455.8 & 12.38 \\ 2859 \CFA @waitfor@, 4 @monitor@ & 671.6 & 672.4 & 14.16 \\ 2860 \uC @_Accept@ & 336.0 & 335.8 & 3.22 2789 2861 \end{tabular} 2790 2862 \end{multicols} … … 2825 2897 \begin{tabular}{@{}r*{3}{D{.}{.}{3.2}}@{}} 2826 2898 \multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\ 2827 C function & 2 & 2 & 0 \\ 2828 \CFA generator & 2 & 2 & 0 \\ 2829 \CFA Coroutine & 49 & 48.68 & 0.47 \\ 2830 \CFA Thread & 105 & 105.57 & 1.37 \\ 2831 \uC Coroutine & 44 & 44 & 0 \\ 2832 \uC Thread & 100 & 99.29 & 0.96 \\ 2833 Goroutine & 145 & 147.25 & 4.15 \\ 2834 Java Thread & 373.5 & 375.14 & 8.72 \\ 2835 Pthreads Thread & 333.5 & 332.96 & 4.1 2899 C function & 1.8 & 1.8 & 0 \\ 2900 \CFA generator & 2.7 & 2.4 & 0.27 \\ 2901 \CFA Coroutine & 37.8 & 37.7 & 0.22 \\ 2902 \CFA Thread & 93.6 & 93.8 & 1.46 \\ 2903 \uC Coroutine & 52.7 & 52.8 & 0.28 \\ 2904 \uC Thread & 93.4 & 93.7 & 1.04 \\ 2905 Goroutine & 140.0 & 139.7 & 2.93 \\ 2906 Java Thread & 374.0 & 375.8 & 10.38 \\ 2907 % Qthreads Thread & 159.5 & 159.3 & 0.71 \\ 2908 Pthreads Thread & 334.4 & 335.0 & 1.95 \\ 2836 2909 \end{tabular} 2837 2910 \end{multicols} … … 2869 2942 \begin{tabular}{@{}r*{3}{D{.}{.}{3.2}}@{}} 2870 2943 \multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\ 2871 test and test-and-test lock & 26 & 26 & 0 \\ 2872 Pthreads Mutex Lock & 31 & 31.71 & 0.97 \\ 2873 \uC @monitor@ member rtn. & 31 & 31 & 0 \\ 2874 \CFA @mutex@ function, 1 arg. & 46 & 46.68 & 0.93 \\ 2875 \CFA @mutex@ function, 2 arg. & 84 & 85.36 & 1.99 \\ 2876 \CFA @mutex@ function, 4 arg. & 158 & 161 & 4.22 \\ 2877 Java synchronized method & 27.5 & 29.79 & 2.93 2878 \end{tabular} 2879 \end{multicols} 2880 2881 2882 \paragraph{Internal Scheduling} 2883 2884 Internal scheduling is measured using a cycle of two threads signalling and waiting. 2885 Figure~\ref{f:int-sched} shows the code for \CFA, with results in Table~\ref{tab:int-sched}. 2886 Note, the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects. 2887 Java scheduling is significantly greater because the benchmark explicitly creates multiple thread in order to prevent the JIT from making the program sequential, \ie removing all locking. 2888 2889 \begin{multicols}{2} 2890 \lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}} 2891 \begin{cfa} 2892 volatile int go = 0; 2893 @monitor@ M { @condition c;@ } m; 2894 void __attribute__((noinline)) 2895 do_call( M & @mutex@ a1 ) { @signal( c );@ } 2896 thread T {}; 2897 void main( T & this ) { 2898 while ( go == 0 ) { yield(); } 2899 while ( go == 1 ) { do_call( m ); } 2900 } 2901 int __attribute__((noinline)) 2902 do_wait( M & mutex m ) with(m) { 2903 go = 1; // continue other thread 2904 BENCH( for ( N ) { @wait( c );@ } ); 2905 go = 0; // stop other thread 2906 sout | result`ns; 2907 } 2908 int main() { 2909 T t; 2910 do_wait( m ); 2911 } 2912 \end{cfa} 2913 \captionof{figure}{\CFA Internal-scheduling benchmark} 2914 \label{f:int-sched} 2915 2916 \columnbreak 2917 2918 \vspace*{-16pt} 2919 \captionof{table}{Internal-scheduling comparison (nanoseconds)} 2920 \label{tab:int-sched} 2921 \bigskip 2922 2923 \begin{tabular}{@{}r*{3}{D{.}{.}{5.2}}@{}} 2924 \multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} & \multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\ 2925 Pthreads Cond. Variable & 6005 & 5681.43 & 835.45 \\ 2926 \uC @signal@ & 324 & 325.54 & 3.02 \\ 2927 \CFA @signal@, 1 @monitor@ & 368.5 & 370.61 & 4.77 \\ 2928 \CFA @signal@, 2 @monitor@ & 467 & 470.5 & 6.79 \\ 2929 \CFA @signal@, 4 @monitor@ & 700.5 & 702.46 & 7.23 \\ 2930 Java @notify@ & 15471 & 172511 & 5689 2931 \end{tabular} 2932 \end{multicols} 2933 2934 2935 \paragraph{External Scheduling} 2936 2937 External scheduling is measured using a cycle of two threads calling and accepting the call using the @waitfor@ statement. 2938 Figure~\ref{f:ext-sched} shows the code for \CFA, with results in Table~\ref{tab:ext-sched}. 2939 Note, the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects. 2940 2941 \begin{multicols}{2} 2942 \lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}} 2943 \vspace*{-16pt} 2944 \begin{cfa} 2945 volatile int go = 0; 2946 @monitor@ M {} m; 2947 thread T {}; 2948 void __attribute__((noinline)) 2949 do_call( M & @mutex@ ) {} 2950 void main( T & ) { 2951 while ( go == 0 ) { yield(); } 2952 while ( go == 1 ) { do_call( m ); } 2953 } 2954 int __attribute__((noinline)) 2955 do_wait( M & @mutex@ m ) { 2956 go = 1; // continue other thread 2957 BENCH( for ( N ) { @waitfor( do_call, m );@ } ) 2958 go = 0; // stop other thread 2959 sout | result`ns; 2960 } 2961 int main() { 2962 T t; 2963 do_wait( m ); 2964 } 2965 \end{cfa} 2966 \captionof{figure}{\CFA external-scheduling benchmark} 2967 \label{f:ext-sched} 2968 2969 \columnbreak 2970 2971 \vspace*{-16pt} 2972 \captionof{table}{External-scheduling comparison (nanoseconds)} 2973 \label{tab:ext-sched} 2974 \begin{tabular}{@{}r*{3}{D{.}{.}{3.2}}@{}} 2975 \multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\ 2976 \uC @_Accept@ & 358 & 359.11 & 2.53 \\ 2977 \CFA @waitfor@, 1 @monitor@ & 359 & 360.93 & 4.07 \\ 2978 \CFA @waitfor@, 2 @monitor@ & 450 & 449.39 & 6.62 \\ 2979 \CFA @waitfor@, 4 @monitor@ & 652 & 655.64 & 7.73 2944 test and test-and-test lock & 19.1 & 19.0 & 0.36 \\ 2945 \CFA @mutex@ function, 1 arg. & 46.6 & 46.8 & 0.86 \\ 2946 \CFA @mutex@ function, 2 arg. & 84.1 & 85.3 & 1.86 \\ 2947 \CFA @mutex@ function, 4 arg. & 158.6 & 160.7 & 3.07 \\ 2948 \uC @monitor@ member rtn. & 54.0 & 53.7 & 0.83 \\ 2949 Java synchronized method & 27.0 & 27.1 & 0.25 \\ 2950 Pthreads Mutex Lock & 33.6 & 32.7 & 1.12 2980 2951 \end{tabular} 2981 2952 \end{multicols}
Note: See TracChangeset
for help on using the changeset viewer.