Changeset 64188cc for doc/papers/concurrency
- Timestamp:
- Jun 27, 2018, 10:46:07 PM (6 years ago)
- Branches:
- ADT, aaron-thesis, arm-eh, ast-experimental, cleanup-dtors, deferred_resn, demangler, enum, forall-pointer-decay, jacob/cs343-translation, jenkins-sandbox, master, new-ast, new-ast-unique-expr, no_list, persistent-indexer, pthread-emulation, qualifiedEnum
- Children:
- 287da46
- Parents:
- 6d6cf5a
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
doc/papers/concurrency/Paper.tex
r6d6cf5a r64188cc 22 22 \captionsetup{justification=raggedright,singlelinecheck=false} 23 23 \usepackage{dcolumn} % align decimal points in tables 24 \usepackage{capt-of} 24 25 25 26 \hypersetup{breaklinks=true} … … 2158 2159 \begin{cfa} 2159 2160 unsigned int N = 10_000_000; 2160 #define BENCH( run , result ) Time before = getTimeNsec(); run;result = (getTimeNsec() - before) / N;2161 #define BENCH( run ) Time before = getTimeNsec(); run; Duration result = (getTimeNsec() - before) / N; 2161 2162 \end{cfa} 2162 2163 The method used to get time is @clock_gettime( CLOCK_REALTIME )@. … … 2184 2185 void main( C & ) { for ( ;; ) { @suspend();@ } } 2185 2186 int main() { 2186 Duration result;2187 2187 BENCH( 2188 for ( size_t i = 0; i < N; i += 1 ) { @resume( c );@ }, 2189 result 2190 ) 2188 for ( size_t i = 0; i < N; i += 1 ) { @resume( c );@ } ) 2191 2189 sout | result`ns | endl; 2192 2190 } … … 2200 2198 2201 2199 int main() { 2202 Duration result;2203 2200 BENCH( 2204 for ( size_t i = 0; i < N; i += 1 ) { @yield();@ }, 2205 result 2206 ) 2201 for ( size_t i = 0; i < N; i += 1 ) { @yield();@ } ) 2207 2202 sout | result`ns | endl; 2208 2203 } … … 2213 2208 \quad 2214 2209 \subfloat[Thread]{\label{f:ExternalState}\usebox\myboxB} 2215 \caption {\CFA Context-switch benchmark}2210 \captionof{figure}{\CFA context-switch benchmark} 2216 2211 \label{f:ctx-switch} 2217 \end{figure} 2218 2219 \begin{table} 2212 2220 2213 \centering 2221 \caption{Context Switch comparison (nanoseconds)} 2214 2215 \captionof{table}{Context switch comparison (nanoseconds)} 2222 2216 \label{tab:ctx-switch} 2223 2217 \bigskip 2224 2218 \begin{tabular}{|r|*{3}{D{.}{.}{3.2}|}} 2225 2219 \cline{2-4} … … 2235 2229 \hline 2236 2230 \end{tabular} 2237 \end{table} 2238 2239 2240 \paragraph{Mutual-Exclusion} 2241 2242 Mutual exclusion is measured by entering/leaving a critical section. 2243 For monitors, entering and leaving a monitor routine is measured. 2244 Figure~\ref{f:mutex} shows the code for \CFA with all results in Table~\ref{tab:mutex}. 2245 To put the results in context, the cost of entering a non-inline routine and the cost of acquiring and releasing a @pthread_mutex@ lock is also measured. 2246 Note, the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects. 2247 2248 \begin{samepage} 2249 \begin{figure}[!p] 2231 2232 \bigskip 2233 \bigskip 2234 2250 2235 \lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}} 2251 2236 \begin{cfa} … … 2253 2238 void __attribute__((noinline)) do_call( M & mutex m/*, m2, m3, m4*/ ) {} 2254 2239 int main() { 2255 Duration result; 2256 BENCH( for( size_t i = 0; i < N; i += 1 ) { @do_call( m1/*, m2, m3, m4*/ );@ }, result ) 2240 BENCH( for( size_t i = 0; i < N; i += 1 ) { @do_call( m1/*, m2, m3, m4*/ );@ } ) 2257 2241 sout | result`ns | endl; 2258 2242 } 2259 2243 \end{cfa} 2260 \caption {\CFA benchmark code used to measure mutex routines.}2244 \captionof{figure}{\CFA acquire/release mutex benchmark} 2261 2245 \label{f:mutex} 2262 \end{figure} 2263 2264 \begin{table}[!p] 2246 2265 2247 \centering 2266 \caption{Mutex routine comparison (nanoseconds)} 2248 2249 \captionof{table}{Mutex comparison (nanoseconds)} 2267 2250 \label{tab:mutex} 2251 \bigskip 2268 2252 2269 2253 \begin{tabular}{|r|*{3}{D{.}{.}{3.2}|}} … … 2281 2265 \hline 2282 2266 \end{tabular} 2283 \end{table} 2284 \end{samepage} 2267 \end{figure} 2268 2269 2270 \paragraph{Mutual-Exclusion} 2271 2272 Mutual exclusion is measured by entering/leaving a critical section. 2273 For monitors, entering and leaving a monitor routine is measured. 2274 Figure~\ref{f:mutex} shows the code for \CFA with all results in Table~\ref{tab:mutex}. 2275 To put the results in context, the cost of entering a non-inline routine and the cost of acquiring and releasing a @pthread_mutex@ lock is also measured. 2276 Note, the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects. 2285 2277 2286 2278 … … 2291 2283 Note, the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects. 2292 2284 2293 \begin{samepage} 2294 \begin{figure}[!p] 2285 \begin{figure} 2295 2286 \lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}} 2296 2287 \begin{cfa} … … 2305 2296 } 2306 2297 int __attribute__((noinline)) do_wait( M & mutex m ) { 2307 Duration result;2308 2298 go = 1; // continue other thread 2309 BENCH( for ( size_t i = 0; i < N; i += 1 ) { @wait( c );@ } , result);2299 BENCH( for ( size_t i = 0; i < N; i += 1 ) { @wait( c );@ } ); 2310 2300 go = 0; // stop other thread 2311 2301 sout | result`ns | endl; … … 2316 2306 } 2317 2307 \end{cfa} 2318 \caption {Internal scheduling benchmark}2308 \captionof{figure}{\CFA Internal scheduling benchmark} 2319 2309 \label{f:int-sched} 2320 \end{figure} 2321 2322 \begin{table}[!p] 2310 2323 2311 \centering 2324 \caption {Internal scheduling comparison (nanoseconds)}2312 \captionof{table}{Internal scheduling comparison (nanoseconds)} 2325 2313 \label{tab:int-sched} 2314 \bigskip 2315 2326 2316 \begin{tabular}{|r|*{3}{D{.}{.}{5.2}|}} 2327 2317 \cline{2-4} … … 2336 2326 \hline 2337 2327 \end{tabular} 2338 \end{table} 2339 \end{samepage} 2328 \end{figure} 2340 2329 2341 2330 … … 2346 2335 Note, the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects. 2347 2336 2348 \begin{samepage}2349 2337 \begin{figure} 2350 2338 \lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}} … … 2359 2347 } 2360 2348 int __attribute__((noinline)) do_wait( M & mutex m ) { 2361 Duration result; 2362 go = 1; BENCH( for ( size_t i = 0; i < N; i += 1 ) { @waitfor( do_call, m );@ }, result ) go = 0; 2349 go = 1; // continue other thread 2350 BENCH( for ( size_t i = 0; i < N; i += 1 ) { @waitfor( do_call, m );@ } ) 2351 go = 0; // stop other thread 2363 2352 sout | result`ns | endl; 2364 2353 } … … 2368 2357 } 2369 2358 \end{cfa} 2370 \caption {Benchmark code for external scheduling}2359 \captionof{figure}{\CFA external scheduling benchmark} 2371 2360 \label{f:ext-sched} 2372 \end{figure} 2373 2374 \begin{table} 2361 2375 2362 \centering 2376 \caption{External scheduling comparison (nanoseconds)} 2363 2364 \captionof{table}{External scheduling comparison (nanoseconds)} 2377 2365 \label{tab:ext-sched} 2366 \bigskip 2378 2367 \begin{tabular}{|r|*{3}{D{.}{.}{3.2}|}} 2379 2368 \cline{2-4} 2380 2369 \multicolumn{1}{c|}{} & \multicolumn{1}{c|}{Median} &\multicolumn{1}{c|}{Average} & \multicolumn{1}{c|}{Std Dev} \\ 2381 2370 \hline 2382 \uC @ Accept@ & 350 & 350.61 & 3.11 \\2371 \uC @_Accept@ & 350 & 350.61 & 3.11 \\ 2383 2372 \CFA @waitfor@, 1 @monitor@ & 358.5 & 358.36 & 3.82 \\ 2384 2373 \CFA @waitfor@, 2 @monitor@ & 422 & 426.79 & 7.95 \\ … … 2386 2375 \hline 2387 2376 \end{tabular} 2388 \end{table} 2389 \end{samepage} 2390 2391 2392 \paragraph{Object Creation} 2393 2394 Object creation is measured by creating/deleting the specific kind of concurrent object. 2395 Figure~\ref{f:creation} shows the code for \CFA, with results in Table~\ref{tab:creation}. 2396 The only note here is that the call stacks of \CFA coroutines are lazily created, therefore without priming the coroutine to force stack creation, the creation cost is artificially low. 2397 2398 \begin{figure} 2399 \centering 2377 2378 \bigskip 2379 \medskip 2380 2400 2381 \lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}} 2401 2382 \begin{cfa} … … 2403 2384 void main( MyThread & ) {} 2404 2385 int main() { 2405 Duration result; 2406 BENCH( for ( size_t i = 0; i < N; i += 1 ) { @MyThread m;@ }, result ) 2386 BENCH( for ( size_t i = 0; i < N; i += 1 ) { @MyThread m;@ } ) 2407 2387 sout | result`ns | endl; 2408 2388 } 2409 2389 \end{cfa} 2410 \caption {Benchmark code for \CFA object creation}2390 \captionof{figure}{\CFA object creation benchmark} 2411 2391 \label{f:creation} 2412 \end{figure} 2413 2414 \begin{table} 2392 2415 2393 \centering 2416 \caption{Creation comparison (nanoseconds)} 2394 2395 \captionof{table}{Creation comparison (nanoseconds)} 2417 2396 \label{tab:creation} 2397 \bigskip 2398 2418 2399 \begin{tabular}{|r|*{3}{D{.}{.}{5.2}|}} 2419 2400 \cline{2-4} … … 2430 2411 \hline 2431 2412 \end{tabular} 2432 \end{table} 2413 \end{figure} 2414 2415 2416 \paragraph{Object Creation} 2417 2418 Object creation is measured by creating/deleting the specific kind of concurrent object. 2419 Figure~\ref{f:creation} shows the code for \CFA, with results in Table~\ref{tab:creation}. 2420 The only note here is that the call stacks of \CFA coroutines are lazily created, therefore without priming the coroutine to force stack creation, the creation cost is artificially low. 2433 2421 2434 2422
Note: See TracChangeset
for help on using the changeset viewer.