- Timestamp:
- Oct 7, 2020, 4:31:43 PM (5 years ago)
- Branches:
- ADT, arm-eh, ast-experimental, enum, forall-pointer-decay, jacob/cs343-translation, master, new-ast-unique-expr, pthread-emulation, qualifiedEnum
- Children:
- 848439f
- Parents:
- ae2c27a (diff), 597c5d18 (diff)
Note: this is a merge changeset, the changes displayed below correspond to the merge itself.
Use the(diff)links above to see all the changes relative to each parent. - Location:
- doc
- Files:
-
- 33 added
- 1 deleted
- 13 edited
- 23 moved
-
LaTeXmacros/common.tex (modified) (4 diffs)
-
LaTeXmacros/lstlang.sty (modified) (3 diffs)
-
bibliography/pl.bib (modified) (2 diffs)
-
papers/concurrency/Paper.tex (modified) (19 diffs)
-
papers/concurrency/annex/local.bib (modified) (1 diff)
-
papers/concurrency/mail2 (modified) (1 diff)
-
papers/concurrency/response3 (added)
-
proposals/ZeroCostPreemption.md (added)
-
proposals/function_type_change.md (added)
-
refrat/refrat.tex (modified) (3 diffs)
-
theses/andrew_beach_MMath/glossaries.tex (added)
-
theses/andrew_beach_MMath/thesis.tex (modified) (1 diff)
-
theses/fangren_yu_COOP_S20/Makefile (added)
-
theses/fangren_yu_COOP_S20/Report.tex (added)
-
theses/fangren_yu_COOP_S20/cfa_developer_reference.pdf (added)
-
theses/fangren_yu_COOP_S20/figures/DeepNodeSharing.fig (added)
-
theses/fangren_yu_COOP_S20/figures/DeepNodeSharing.fig.bak (added)
-
theses/thierry_delisle_PhD/.gitignore (modified) (1 diff)
-
theses/thierry_delisle_PhD/code/readQ_example/Makefile (added)
-
theses/thierry_delisle_PhD/code/readQ_example/proto-gui/main.cpp (added)
-
theses/thierry_delisle_PhD/code/readQ_example/thrdlib/Makefile (added)
-
theses/thierry_delisle_PhD/code/readQ_example/thrdlib/cforall.cpp (added)
-
theses/thierry_delisle_PhD/code/readQ_example/thrdlib/fibre.cpp (added)
-
theses/thierry_delisle_PhD/code/readQ_example/thrdlib/pthread.cpp (added)
-
theses/thierry_delisle_PhD/code/readQ_example/thrdlib/thread.cpp (added)
-
theses/thierry_delisle_PhD/code/readQ_example/thrdlib/thread.hpp (added)
-
theses/thierry_delisle_PhD/code/readyQ_proto/Makefile (moved) (moved from doc/theses/thierry_delisle_PhD/code/Makefile )
-
theses/thierry_delisle_PhD/code/readyQ_proto/assert.hpp (moved) (moved from doc/theses/thierry_delisle_PhD/code/assert.hpp )
-
theses/thierry_delisle_PhD/code/readyQ_proto/bitbench/select.cpp (moved) (moved from doc/theses/thierry_delisle_PhD/code/bitbench/select.cpp )
-
theses/thierry_delisle_PhD/code/readyQ_proto/bts.cpp (moved) (moved from doc/theses/thierry_delisle_PhD/code/bts.cpp )
-
theses/thierry_delisle_PhD/code/readyQ_proto/bts_test.cpp (moved) (moved from doc/theses/thierry_delisle_PhD/code/bts_test.cpp )
-
theses/thierry_delisle_PhD/code/readyQ_proto/links.hpp (moved) (moved from doc/theses/thierry_delisle_PhD/code/links.hpp )
-
theses/thierry_delisle_PhD/code/readyQ_proto/prefetch.cpp (moved) (moved from doc/theses/thierry_delisle_PhD/code/prefetch.cpp )
-
theses/thierry_delisle_PhD/code/readyQ_proto/process.sh (moved) (moved from doc/theses/thierry_delisle_PhD/code/process.sh )
-
theses/thierry_delisle_PhD/code/readyQ_proto/processor.hpp (moved) (moved from doc/theses/thierry_delisle_PhD/code/processor.hpp )
-
theses/thierry_delisle_PhD/code/readyQ_proto/processor_list.hpp (moved) (moved from doc/theses/thierry_delisle_PhD/code/processor_list.hpp )
-
theses/thierry_delisle_PhD/code/readyQ_proto/processor_list_fast.cpp (moved) (moved from doc/theses/thierry_delisle_PhD/code/processor_list_fast.cpp )
-
theses/thierry_delisle_PhD/code/readyQ_proto/processor_list_good.cpp (moved) (moved from doc/theses/thierry_delisle_PhD/code/processor_list_good.cpp )
-
theses/thierry_delisle_PhD/code/readyQ_proto/randbit.cpp (moved) (moved from doc/theses/thierry_delisle_PhD/code/randbit.cpp )
-
theses/thierry_delisle_PhD/code/readyQ_proto/relaxed_list.cpp (moved) (moved from doc/theses/thierry_delisle_PhD/code/relaxed_list.cpp )
-
theses/thierry_delisle_PhD/code/readyQ_proto/relaxed_list.hpp (moved) (moved from doc/theses/thierry_delisle_PhD/code/relaxed_list.hpp )
-
theses/thierry_delisle_PhD/code/readyQ_proto/relaxed_list_layout.cpp (moved) (moved from doc/theses/thierry_delisle_PhD/code/relaxed_list_layout.cpp )
-
theses/thierry_delisle_PhD/code/readyQ_proto/runperf.sh (moved) (moved from doc/theses/thierry_delisle_PhD/code/runperf.sh )
-
theses/thierry_delisle_PhD/code/readyQ_proto/scale.sh (moved) (moved from doc/theses/thierry_delisle_PhD/code/scale.sh )
-
theses/thierry_delisle_PhD/code/readyQ_proto/snzi-packed.hpp (moved) (moved from doc/theses/thierry_delisle_PhD/code/snzi-packed.hpp )
-
theses/thierry_delisle_PhD/code/readyQ_proto/snzi.hpp (moved) (moved from doc/theses/thierry_delisle_PhD/code/snzi.hpp )
-
theses/thierry_delisle_PhD/code/readyQ_proto/snzm.hpp (moved) (moved from doc/theses/thierry_delisle_PhD/code/snzm.hpp )
-
theses/thierry_delisle_PhD/code/readyQ_proto/utils.hpp (moved) (moved from doc/theses/thierry_delisle_PhD/code/utils.hpp )
-
theses/thierry_delisle_PhD/code/readyQ_proto/work_stealing.hpp (moved) (moved from doc/theses/thierry_delisle_PhD/code/work_stealing.hpp )
-
theses/thierry_delisle_PhD/comp_II/comp_II.tex (modified) (25 diffs)
-
theses/thierry_delisle_PhD/comp_II/img/system.fig (modified) (4 diffs)
-
theses/thierry_delisle_PhD/comp_II/presentation.pdf (deleted)
-
theses/thierry_delisle_PhD/thesis/Makefile (added)
-
theses/thierry_delisle_PhD/thesis/fig/base.fig (added)
-
theses/thierry_delisle_PhD/thesis/fig/empty.fig (added)
-
theses/thierry_delisle_PhD/thesis/fig/emptybit.fig (added)
-
theses/thierry_delisle_PhD/thesis/fig/emptytls.fig (added)
-
theses/thierry_delisle_PhD/thesis/fig/emptytree.fig (added)
-
theses/thierry_delisle_PhD/thesis/fig/resize.fig (added)
-
theses/thierry_delisle_PhD/thesis/fig/system.fig (added)
-
theses/thierry_delisle_PhD/thesis/glossary.tex (added)
-
theses/thierry_delisle_PhD/thesis/text/core.tex (added)
-
theses/thierry_delisle_PhD/thesis/text/front.tex (added)
-
theses/thierry_delisle_PhD/thesis/text/intro.tex (added)
-
theses/thierry_delisle_PhD/thesis/text/io.tex (added)
-
theses/thierry_delisle_PhD/thesis/text/practice.tex (added)
-
theses/thierry_delisle_PhD/thesis/text/runtime.tex (added)
-
theses/thierry_delisle_PhD/thesis/thesis.tex (added)
-
user/Makefile (modified) (1 diff)
-
user/user.tex (modified) (17 diffs)
Legend:
- Unmodified
- Added
- Removed
-
doc/LaTeXmacros/common.tex
rae2c27a rc76bd34 11 11 %% Created On : Sat Apr 9 10:06:17 2016 12 12 %% Last Modified By : Peter A. Buhr 13 %% Last Modified On : Fri Sep 4 13:56:52202014 %% Update Count : 38313 %% Last Modified On : Mon Oct 5 09:34:46 2020 14 %% Update Count : 464 15 15 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 16 16 … … 55 55 \newlength{\parindentlnth} 56 56 \setlength{\parindentlnth}{\parindent} 57 58 \newcommand{\LstBasicStyle}[1]{{\lst@basicstyle{#1}}}59 \newcommand{\LstKeywordStyle}[1]{{\lst@basicstyle{\lst@keywordstyle{#1}}}}60 \newcommand{\LstCommentStyle}[1]{{\lst@basicstyle{\lst@commentstyle{#1}}}}61 62 \newlength{\gcolumnposn} % temporary hack because lstlisting does not handle tabs correctly63 \newlength{\columnposn}64 \setlength{\gcolumnposn}{2.75in}65 \setlength{\columnposn}{\gcolumnposn}66 \newcommand{\C}[2][\@empty]{\ifx#1\@empty\else\global\setlength{\columnposn}{#1}\global\columnposn=\columnposn\fi\hfill\makebox[\textwidth-\columnposn][l]{\lst@basicstyle{\LstCommentStyle{#2}}}}67 \newcommand{\CRT}{\global\columnposn=\gcolumnposn}68 69 % allow escape sequence in lstinline70 %\usepackage{etoolbox}71 %\patchcmd{\lsthk@TextStyle}{\let\lst@DefEsc\@empty}{}{}{\errmessage{failed to patch}}72 57 73 58 \usepackage{pslatex} % reduce size of san serif font … … 244 229 \usepackage{listings} % format program code 245 230 \usepackage{lstlang} 246 247 \newcommand{\CFADefaults}{% 231 \makeatletter 232 233 \newcommand{\LstBasicStyle}[1]{{\lst@basicstyle{#1}}} 234 \newcommand{\LstKeywordStyle}[1]{{\lst@basicstyle{\lst@keywordstyle{#1}}}} 235 \newcommand{\LstCommentStyle}[1]{{\lst@basicstyle{\lst@commentstyle{#1}}}} 236 237 \newlength{\gcolumnposn} % temporary hack because lstlisting does not handle tabs correctly 238 \newlength{\columnposn} 239 \setlength{\gcolumnposn}{2.75in} 240 \setlength{\columnposn}{\gcolumnposn} 241 \newcommand{\C}[2][\@empty]{\ifx#1\@empty\else\global\setlength{\columnposn}{#1}\global\columnposn=\columnposn\fi\hfill\makebox[\textwidth-\columnposn][l]{\lst@basicstyle{\LstCommentStyle{#2}}}} 242 \newcommand{\CRT}{\global\columnposn=\gcolumnposn} 243 244 % allow escape sequence in lstinline 245 %\usepackage{etoolbox} 246 %\patchcmd{\lsthk@TextStyle}{\let\lst@DefEsc\@empty}{}{}{\errmessage{failed to patch}} 247 248 % allow adding to lst literate 249 \def\addToLiterate#1{\protect\edef\lst@literate{\unexpanded\expandafter{\lst@literate}\unexpanded{#1}}} 250 \lst@Key{add to literate}{}{\addToLiterate{#1}} 251 \makeatother 252 253 \newcommand{\CFAStyle}{% 248 254 \lstset{ 249 language=CFA,250 255 columns=fullflexible, 251 256 basicstyle=\linespread{0.9}\sf, % reduce line spacing and use sanserif font … … 262 267 belowskip=3pt, 263 268 % replace/adjust listing characters that look bad in sanserif 264 literate={-}{\makebox[1ex][c]{\raisebox{0.4ex}{\rule{0. 8ex}{0.1ex}}}}1 {^}{\raisebox{0.6ex}{$\scriptscriptstyle\land\,$}}1269 literate={-}{\makebox[1ex][c]{\raisebox{0.4ex}{\rule{0.75ex}{0.1ex}}}}1 {^}{\raisebox{0.6ex}{$\scriptscriptstyle\land\,$}}1 265 270 {~}{\raisebox{0.3ex}{$\scriptstyle\sim\,$}}1 {`}{\ttfamily\upshape\hspace*{-0.1ex}`}1 266 271 {<-}{$\leftarrow$}2 {=>}{$\Rightarrow$}2 {->}{\makebox[1ex][c]{\raisebox{0.4ex}{\rule{0.8ex}{0.075ex}}}\kern-0.2ex\textgreater}2, 272 }% lstset 273 }% CFAStyle 274 275 \ifdefined\CFALatin% extra Latin-1 escape characters 276 \lstnewenvironment{cfa}[1][]{ 277 \lstset{ 278 language=CFA, 267 279 moredelim=**[is][\color{red}]{®}{®}, % red highlighting ®...® (registered trademark symbol) emacs: C-q M-. 268 280 moredelim=**[is][\color{blue}]{ß}{ß}, % blue highlighting ß...ß (sharp s symbol) emacs: C-q M-_ 269 281 moredelim=**[is][\color{OliveGreen}]{¢}{¢}, % green highlighting ¢...¢ (cent symbol) emacs: C-q M-" 270 282 moredelim=[is][\lstset{keywords={}}]{¶}{¶}, % keyword escape ¶...¶ (pilcrow symbol) emacs: C-q M-^ 283 % replace/adjust listing characters that look bad in sanserif 284 add to literate={`}{\ttfamily\upshape\hspace*{-0.1ex}`}1 271 285 }% lstset 272 }% CFADefaults 273 \newcommand{\CFAStyle}{% 274 \CFADefaults 286 \lstset{#1} 287 }{} 275 288 % inline code ©...© (copyright symbol) emacs: C-q M-) 276 289 \lstMakeShortInline© % single-character for \lstinline 277 }% CFAStyle 278 279 \lstnewenvironment{cfa}[1][] 280 {\CFADefaults\lstset{#1}} 281 {} 290 \else% regular ASCI characters 291 \lstnewenvironment{cfa}[1][]{ 292 \lstset{ 293 language=CFA, 294 escapechar=\$, % LaTeX escape in CFA code 295 moredelim=**[is][\color{red}]{@}{@}, % red highlighting @...@ 296 }% lstset 297 \lstset{#1} 298 }{} 299 % inline code @...@ (at symbol) 300 \lstMakeShortInline@ % single-character for \lstinline 301 \fi% 282 302 283 303 % Local Variables: % -
doc/LaTeXmacros/lstlang.sty
rae2c27a rc76bd34 8 8 %% Created On : Sat May 13 16:34:42 2017 9 9 %% Last Modified By : Peter A. Buhr 10 %% Last Modified On : Tue Jan 8 14:40:33 201911 %% Update Count : 2 110 %% Last Modified On : Wed Sep 23 22:40:04 2020 11 %% Update Count : 24 12 12 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 13 13 … … 115 115 auto, _Bool, catch, catchResume, choose, _Complex, __complex, __complex__, __const, __const__, 116 116 coroutine, disable, dtype, enable, exception, __extension__, fallthrough, fallthru, finally, 117 __float80, float80, __float128, float128, forall, ftype, _Generic, _Imaginary, __imag, __imag__,117 __float80, float80, __float128, float128, forall, ftype, generator, _Generic, _Imaginary, __imag, __imag__, 118 118 inline, __inline, __inline__, __int128, int128, __label__, monitor, mutex, _Noreturn, one_t, or, 119 otype, restrict, __restrict, __restrict__, __signed, __signed__, _Static_assert, thread,119 otype, restrict, __restrict, __restrict__, __signed, __signed__, _Static_assert, suspend, thread, 120 120 _Thread_local, throw, throwResume, timeout, trait, try, ttype, typeof, __typeof, __typeof__, 121 121 virtual, __volatile, __volatile__, waitfor, when, with, zero_t, … … 125 125 126 126 % C++ programming language 127 \lstdefinelanguage{C++}[ANSI]{C++}{} 127 \lstdefinelanguage{C++}[ANSI]{C++}{ 128 morekeywords={nullptr,} 129 } 128 130 129 131 % uC++ programming language, based on ANSI C++ -
doc/bibliography/pl.bib
rae2c27a rc76bd34 1005 1005 key = {Cforall Benchmarks}, 1006 1006 author = {{\textsf{C}{$\mathbf{\forall}$} Benchmarks}}, 1007 howpublished= {\href{https:// plg.uwaterloo.ca/~cforall/doc/CforallConcurrentBenchmarks.tar}{https://\-plg.uwaterloo.ca/\-$\sim$cforall/\-doc/\-CforallConcurrentBenchmarks.tar}},1007 howpublished= {\href{https://github.com/cforall/ConcurrentBenchmarks_SPE20}{https://\-github.com/\-cforall/\-ConcurrentBenchmarks\_SPE20}}, 1008 1008 } 1009 1009 … … 1973 1973 title = {Cooperating Sequential Processes}, 1974 1974 institution = {Technological University}, 1975 address = {Eindhoven, Neth erlands},1975 address = {Eindhoven, Neth.}, 1976 1976 year = 1965, 1977 1977 note = {Reprinted in \cite{Genuys68} pp. 43--112.} -
doc/papers/concurrency/Paper.tex
rae2c27a rc76bd34 224 224 {} 225 225 \lstnewenvironment{C++}[1][] % use C++ style 226 {\lstset{language=C++,moredelim=**[is][\protect\color{red}]{`}{`} ,#1}\lstset{#1}}226 {\lstset{language=C++,moredelim=**[is][\protect\color{red}]{`}{`}}\lstset{#1}} 227 227 {} 228 228 \lstnewenvironment{uC++}[1][] 229 {\lstset{language=uC++,moredelim=**[is][\protect\color{red}]{`}{`} ,#1}\lstset{#1}}229 {\lstset{language=uC++,moredelim=**[is][\protect\color{red}]{`}{`}}\lstset{#1}} 230 230 {} 231 231 \lstnewenvironment{Go}[1][] 232 {\lstset{language=Golang,moredelim=**[is][\protect\color{red}]{`}{`} ,#1}\lstset{#1}}232 {\lstset{language=Golang,moredelim=**[is][\protect\color{red}]{`}{`}}\lstset{#1}} 233 233 {} 234 234 \lstnewenvironment{python}[1][] 235 {\lstset{language=python,moredelim=**[is][\protect\color{red}]{`}{`} ,#1}\lstset{#1}}235 {\lstset{language=python,moredelim=**[is][\protect\color{red}]{`}{`}}\lstset{#1}} 236 236 {} 237 237 \lstnewenvironment{java}[1][] 238 {\lstset{language=java,moredelim=**[is][\protect\color{red}]{`}{`} ,#1}\lstset{#1}}238 {\lstset{language=java,moredelim=**[is][\protect\color{red}]{`}{`}}\lstset{#1}} 239 239 {} 240 240 … … 284 284 285 285 \begin{document} 286 \linenumbers % comment out to turn off line numbering286 %\linenumbers % comment out to turn off line numbering 287 287 288 288 \maketitle … … 450 450 \hline 451 451 stateful & thread & \multicolumn{1}{c|}{No} & \multicolumn{1}{c}{Yes} \\ 452 \hline 453 \hline 452 \hline 453 \hline 454 454 No & No & \textbf{1}\ \ \ @struct@ & \textbf{2}\ \ \ @mutex@ @struct@ \\ 455 \hline 455 \hline 456 456 Yes (stackless) & No & \textbf{3}\ \ \ @generator@ & \textbf{4}\ \ \ @mutex@ @generator@ \\ 457 \hline 457 \hline 458 458 Yes (stackful) & No & \textbf{5}\ \ \ @coroutine@ & \textbf{6}\ \ \ @mutex@ @coroutine@ \\ 459 \hline 459 \hline 460 460 No & Yes & \textbf{7}\ \ \ {\color{red}rejected} & \textbf{8}\ \ \ {\color{red}rejected} \\ 461 \hline 461 \hline 462 462 Yes (stackless) & Yes & \textbf{9}\ \ \ {\color{red}rejected} & \textbf{10}\ \ \ {\color{red}rejected} \\ 463 \hline 463 \hline 464 464 Yes (stackful) & Yes & \textbf{11}\ \ \ @thread@ & \textbf{12}\ \ @mutex@ @thread@ \\ 465 465 \end{tabular} … … 2896 2896 \label{s:RuntimeStructureCluster} 2897 2897 2898 A \newterm{cluster} is a collection of user and kernel threads, where the kernel threads run the user threads from the cluster's ready queue, and the operating system runs the kernel threads on the processors from its ready queue .2898 A \newterm{cluster} is a collection of user and kernel threads, where the kernel threads run the user threads from the cluster's ready queue, and the operating system runs the kernel threads on the processors from its ready queue~\cite{Buhr90a}. 2899 2899 The term \newterm{virtual processor} is introduced as a synonym for kernel thread to disambiguate between user and kernel thread. 2900 2900 From the language perspective, a virtual processor is an actual processor (core). … … 2992 2992 \end{cfa} 2993 2993 where CPU time in nanoseconds is from the appropriate language clock. 2994 Each benchmark is performed @N@ times, where @N@ is selected so the benchmark runs in the range of 2--20 seconds for the specific programming language. 2994 Each benchmark is performed @N@ times, where @N@ is selected so the benchmark runs in the range of 2--20 seconds for the specific programming language; 2995 each @N@ appears after the experiment name in the following tables. 2995 2996 The total time is divided by @N@ to obtain the average time for a benchmark. 2996 2997 Each benchmark experiment is run 13 times and the average appears in the table. 2998 For languages with a runtime JIT (Java, Node.js, Python), a single half-hour long experiment is run to check stability; 2999 all long-experiment results are statistically equivalent, \ie median/average/standard-deviation correlate with the short-experiment results, indicating the short experiments reached a steady state. 2997 3000 All omitted tests for other languages are functionally identical to the \CFA tests and available online~\cite{CforallConcurrentBenchmarks}. 2998 % tar --exclude-ignore=exclude -cvhf benchmark.tar benchmark2999 % cp -p benchmark.tar /u/cforall/public_html/doc/concurrent_benchmark.tar3000 3001 3001 3002 \paragraph{Creation} … … 3006 3007 3007 3008 \begin{multicols}{2} 3008 \lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}} 3009 \begin{cfa} 3010 @coroutine@ MyCoroutine {}; 3009 \begin{cfa}[xleftmargin=0pt] 3010 `coroutine` MyCoroutine {}; 3011 3011 void ?{}( MyCoroutine & this ) { 3012 3012 #ifdef EAGER … … 3016 3016 void main( MyCoroutine & ) {} 3017 3017 int main() { 3018 BENCH( for ( N ) { @MyCoroutine c;@} )3018 BENCH( for ( N ) { `MyCoroutine c;` } ) 3019 3019 sout | result; 3020 3020 } … … 3030 3030 3031 3031 \begin{tabular}[t]{@{}r*{3}{D{.}{.}{5.2}}@{}} 3032 \multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} & \multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\ 3033 \CFA generator & 0.6 & 0.6 & 0.0 \\ 3034 \CFA coroutine lazy & 13.4 & 13.1 & 0.5 \\ 3035 \CFA coroutine eager & 144.7 & 143.9 & 1.5 \\ 3036 \CFA thread & 466.4 & 468.0 & 11.3 \\ 3037 \uC coroutine & 155.6 & 155.7 & 1.7 \\ 3038 \uC thread & 523.4 & 523.9 & 7.7 \\ 3039 Python generator & 123.2 & 124.3 & 4.1 \\ 3040 Node.js generator & 33.4 & 33.5 & 0.3 \\ 3041 Goroutine thread & 751.0 & 750.5 & 3.1 \\ 3042 Rust tokio thread & 1860.0 & 1881.1 & 37.6 \\ 3043 Rust thread & 53801.0 & 53896.8 & 274.9 \\ 3044 Java thread & 120274.0 & 120722.9 & 2356.7 \\ 3045 Pthreads thread & 31465.5 & 31419.5 & 140.4 3032 \multicolumn{1}{@{}r}{N\hspace*{10pt}} & \multicolumn{1}{c}{Median} & \multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\ 3033 \CFA generator (1B) & 0.6 & 0.6 & 0.0 \\ 3034 \CFA coroutine lazy (100M) & 13.4 & 13.1 & 0.5 \\ 3035 \CFA coroutine eager (10M) & 144.7 & 143.9 & 1.5 \\ 3036 \CFA thread (10M) & 466.4 & 468.0 & 11.3 \\ 3037 \uC coroutine (10M) & 155.6 & 155.7 & 1.7 \\ 3038 \uC thread (10M) & 523.4 & 523.9 & 7.7 \\ 3039 Python generator (10M) & 123.2 & 124.3 & 4.1 \\ 3040 Node.js generator (10M) & 33.4 & 33.5 & 0.3 \\ 3041 Goroutine thread (10M) & 751.0 & 750.5 & 3.1 \\ 3042 Rust tokio thread (10M) & 1860.0 & 1881.1 & 37.6 \\ 3043 Rust thread (250K) & 53801.0 & 53896.8 & 274.9 \\ 3044 Java thread (250K) & 119256.0 & 119679.2 & 2244.0 \\ 3045 % Java thread (1 000 000) & 123100.0 & 123052.5 & 751.6 \\ 3046 Pthreads thread (250K) & 31465.5 & 31419.5 & 140.4 3046 3047 \end{tabular} 3047 3048 \end{multicols} … … 3052 3053 Internal scheduling is measured using a cycle of two threads signalling and waiting. 3053 3054 Figure~\ref{f:schedint} shows the code for \CFA, with results in Table~\ref{t:schedint}. 3054 Note, the incremental cost of bulk acquire for \CFA, which is largely a fixed cost for small numbers of mutex objects. 3055 Java scheduling is significantly greater because the benchmark explicitly creates multiple threads in order to prevent the JIT from making the program sequential, \ie removing all locking. 3055 Note, the \CFA incremental cost for bulk acquire is a fixed cost for small numbers of mutex objects. 3056 User-level threading has one kernel thread, eliminating contention between the threads (direct handoff of the kernel thread). 3057 Kernel-level threading has two kernel threads allowing some contention. 3056 3058 3057 3059 \begin{multicols}{2} 3058 \ lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}}3059 \begin{cfa} 3060 \setlength{\tabcolsep}{3pt} 3061 \begin{cfa}[xleftmargin=0pt] 3060 3062 volatile int go = 0; 3061 @condition c;@ 3062 @monitor@M {} m1/*, m2, m3, m4*/;3063 void call( M & @mutex p1/*, p2, p3, p4*/@) {3064 @signal( c );@3065 } 3066 void wait( M & @mutex p1/*, p2, p3, p4*/@) {3063 `condition c;` 3064 `monitor` M {} m1/*, m2, m3, m4*/; 3065 void call( M & `mutex p1/*, p2, p3, p4*/` ) { 3066 `signal( c );` 3067 } 3068 void wait( M & `mutex p1/*, p2, p3, p4*/` ) { 3067 3069 go = 1; // continue other thread 3068 for ( N ) { @wait( c );@} );3070 for ( N ) { `wait( c );` } ); 3069 3071 } 3070 3072 thread T {}; … … 3091 3093 3092 3094 \begin{tabular}{@{}r*{3}{D{.}{.}{5.2}}@{}} 3093 \multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} & \multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\ 3094 \CFA @signal@, 1 monitor & 364.4 & 364.2 & 4.4 \\ 3095 \CFA @signal@, 2 monitor & 484.4 & 483.9 & 8.8 \\ 3096 \CFA @signal@, 4 monitor & 709.1 & 707.7 & 15.0 \\ 3097 \uC @signal@ monitor & 328.3 & 327.4 & 2.4 \\ 3098 Rust cond. variable & 7514.0 & 7437.4 & 397.2 \\ 3099 Java @notify@ monitor & 9623.0 & 9654.6 & 236.2 \\ 3100 Pthreads cond. variable & 5553.7 & 5576.1 & 345.6 3095 \multicolumn{1}{@{}r}{N\hspace*{10pt}} & \multicolumn{1}{c}{Median} & \multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\ 3096 \CFA @signal@, 1 monitor (10M) & 364.4 & 364.2 & 4.4 \\ 3097 \CFA @signal@, 2 monitor (10M) & 484.4 & 483.9 & 8.8 \\ 3098 \CFA @signal@, 4 monitor (10M) & 709.1 & 707.7 & 15.0 \\ 3099 \uC @signal@ monitor (10M) & 328.3 & 327.4 & 2.4 \\ 3100 Rust cond. variable (1M) & 7514.0 & 7437.4 & 397.2 \\ 3101 Java @notify@ monitor (1M) & 8717.0 & 8774.1 & 471.8 \\ 3102 % Java @notify@ monitor (100 000 000) & 8634.0 & 8683.5 & 330.5 \\ 3103 Pthreads cond. variable (1M) & 5553.7 & 5576.1 & 345.6 3101 3104 \end{tabular} 3102 3105 \end{multicols} … … 3107 3110 External scheduling is measured using a cycle of two threads calling and accepting the call using the @waitfor@ statement. 3108 3111 Figure~\ref{f:schedext} shows the code for \CFA with results in Table~\ref{t:schedext}. 3109 Note, the incremental cost of bulk acquire for \CFA, which is largelya fixed cost for small numbers of mutex objects.3112 Note, the \CFA incremental cost for bulk acquire is a fixed cost for small numbers of mutex objects. 3110 3113 3111 3114 \begin{multicols}{2} 3112 \ lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}}3115 \setlength{\tabcolsep}{5pt} 3113 3116 \vspace*{-16pt} 3114 \begin{cfa} 3115 @monitor@M {} m1/*, m2, m3, m4*/;3116 void call( M & @mutex p1/*, p2, p3, p4*/@) {}3117 void wait( M & @mutex p1/*, p2, p3, p4*/@) {3118 for ( N ) { @waitfor( call : p1/*, p2, p3, p4*/ );@}3117 \begin{cfa}[xleftmargin=0pt] 3118 `monitor` M {} m1/*, m2, m3, m4*/; 3119 void call( M & `mutex p1/*, p2, p3, p4*/` ) {} 3120 void wait( M & `mutex p1/*, p2, p3, p4*/` ) { 3121 for ( N ) { `waitfor( call : p1/*, p2, p3, p4*/ );` } 3119 3122 } 3120 3123 thread T {}; … … 3133 3136 \columnbreak 3134 3137 3135 \vspace*{-1 6pt}3138 \vspace*{-18pt} 3136 3139 \captionof{table}{External-scheduling comparison (nanoseconds)} 3137 3140 \label{t:schedext} 3138 3141 \begin{tabular}{@{}r*{3}{D{.}{.}{3.2}}@{}} 3139 \multicolumn{1}{@{} c}{} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\3140 \CFA @waitfor@, 1 monitor & 367.1 & 365.3 & 5.0 \\3141 \CFA @waitfor@, 2 monitor & 463.0 & 464.6 & 7.1 \\3142 \CFA @waitfor@, 4 monitor & 689.6 & 696.2 & 21.5 \\3143 \uC \lstinline[language=uC++]|_Accept| monitor & 328.2 & 329.1 & 3.4 \\3144 Go \lstinline[language=Golang]|select| channel & 365.0 & 365.5 & 1.23142 \multicolumn{1}{@{}r}{N\hspace*{10pt}} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\ 3143 \CFA @waitfor@, 1 monitor (10M) & 367.1 & 365.3 & 5.0 \\ 3144 \CFA @waitfor@, 2 monitor (10M) & 463.0 & 464.6 & 7.1 \\ 3145 \CFA @waitfor@, 4 monitor (10M) & 689.6 & 696.2 & 21.5 \\ 3146 \uC \lstinline[language=uC++]|_Accept| monitor (10M) & 328.2 & 329.1 & 3.4 \\ 3147 Go \lstinline[language=Golang]|select| channel (10M) & 365.0 & 365.5 & 1.2 3145 3148 \end{tabular} 3146 3149 \end{multicols} … … 3155 3158 3156 3159 \begin{multicols}{2} 3157 \ lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}}3158 \begin{cfa} 3159 @monitor@M {} m1/*, m2, m3, m4*/;3160 call( M & @mutex p1/*, p2, p3, p4*/@) {}3160 \setlength{\tabcolsep}{3pt} 3161 \begin{cfa}[xleftmargin=0pt] 3162 `monitor` M {} m1/*, m2, m3, m4*/; 3163 call( M & `mutex p1/*, p2, p3, p4*/` ) {} 3161 3164 int main() { 3162 3165 BENCH( for( N ) call( m1/*, m2, m3, m4*/ ); ) … … 3173 3176 \label{t:mutex} 3174 3177 \begin{tabular}{@{}r*{3}{D{.}{.}{3.2}}@{}} 3175 \multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\ 3176 test-and-test-set lock & 19.1 & 18.9 & 0.4 \\ 3177 \CFA @mutex@ function, 1 arg. & 48.3 & 47.8 & 0.9 \\ 3178 \CFA @mutex@ function, 2 arg. & 86.7 & 87.6 & 1.9 \\ 3179 \CFA @mutex@ function, 4 arg. & 173.4 & 169.4 & 5.9 \\ 3180 \uC @monitor@ member rtn. & 54.8 & 54.8 & 0.1 \\ 3181 Goroutine mutex lock & 34.0 & 34.0 & 0.0 \\ 3182 Rust mutex lock & 33.0 & 33.2 & 0.8 \\ 3183 Java synchronized method & 31.0 & 31.0 & 0.0 \\ 3184 Pthreads mutex Lock & 31.0 & 31.1 & 0.4 3178 \multicolumn{1}{@{}r}{N\hspace*{10pt}} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\ 3179 test-and-test-set lock (50M) & 19.1 & 18.9 & 0.4 \\ 3180 \CFA @mutex@ function, 1 arg. (50M) & 48.3 & 47.8 & 0.9 \\ 3181 \CFA @mutex@ function, 2 arg. (50M) & 86.7 & 87.6 & 1.9 \\ 3182 \CFA @mutex@ function, 4 arg. (50M) & 173.4 & 169.4 & 5.9 \\ 3183 \uC @monitor@ member rtn. (50M) & 54.8 & 54.8 & 0.1 \\ 3184 Goroutine mutex lock (50M) & 34.0 & 34.0 & 0.0 \\ 3185 Rust mutex lock (50M) & 33.0 & 33.2 & 0.8 \\ 3186 Java synchronized method (50M) & 31.0 & 30.9 & 0.5 \\ 3187 % Java synchronized method (10 000 000 000) & 31.0 & 30.2 & 0.9 \\ 3188 Pthreads mutex Lock (50M) & 31.0 & 31.1 & 0.4 3185 3189 \end{tabular} 3186 3190 \end{multicols} … … 3201 3205 % To: "Peter A. Buhr" <pabuhr@plg2.cs.uwaterloo.ca> 3202 3206 % Date: Fri, 24 Jan 2020 13:49:18 -0500 3203 % 3207 % 3204 3208 % I can also verify that the previous version, which just tied a bunch of promises together, *does not* go back to the 3205 3209 % event loop at all in the current version of Node. Presumably they're taking advantage of the fact that the ordering of … … 3211 3215 3212 3216 \begin{multicols}{2} 3213 \lstset{language=CFA,moredelim=**[is][\color{red}]{@}{@},deletedelim=**[is][]{`}{`}} 3214 \begin{cfa}[aboveskip=0pt,belowskip=0pt] 3215 @coroutine@ C {}; 3216 void main( C & ) { for () { @suspend;@ } } 3217 \begin{cfa}[xleftmargin=0pt] 3218 `coroutine` C {}; 3219 void main( C & ) { for () { `suspend;` } } 3217 3220 int main() { // coroutine test 3218 3221 C c; 3219 BENCH( for ( N ) { @resume( c );@} )3222 BENCH( for ( N ) { `resume( c );` } ) 3220 3223 sout | result; 3221 3224 } 3222 3225 int main() { // thread test 3223 BENCH( for ( N ) { @yield();@} )3226 BENCH( for ( N ) { `yield();` } ) 3224 3227 sout | result; 3225 3228 } … … 3234 3237 \label{t:ctx-switch} 3235 3238 \begin{tabular}{@{}r*{3}{D{.}{.}{3.2}}@{}} 3236 \multicolumn{1}{@{}c}{} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\ 3237 C function & 1.8 & 1.8 & 0.0 \\ 3238 \CFA generator & 1.8 & 2.0 & 0.3 \\ 3239 \CFA coroutine & 32.5 & 32.9 & 0.8 \\ 3240 \CFA thread & 93.8 & 93.6 & 2.2 \\ 3241 \uC coroutine & 50.3 & 50.3 & 0.2 \\ 3242 \uC thread & 97.3 & 97.4 & 1.0 \\ 3243 Python generator & 40.9 & 41.3 & 1.5 \\ 3244 Node.js await & 1852.2 & 1854.7 & 16.4 \\ 3245 Node.js generator & 33.3 & 33.4 & 0.3 \\ 3246 Goroutine thread & 143.0 & 143.3 & 1.1 \\ 3247 Rust async await & 32.0 & 32.0 & 0.0 \\ 3248 Rust tokio thread & 143.0 & 143.0 & 1.7 \\ 3249 Rust thread & 332.0 & 331.4 & 2.4 \\ 3250 Java thread & 405.0 & 415.0 & 17.6 \\ 3251 Pthreads thread & 334.3 & 335.2 & 3.9 3239 \multicolumn{1}{@{}r}{N\hspace*{10pt}} & \multicolumn{1}{c}{Median} &\multicolumn{1}{c}{Average} & \multicolumn{1}{c@{}}{Std Dev} \\ 3240 C function (10B) & 1.8 & 1.8 & 0.0 \\ 3241 \CFA generator (5B) & 1.8 & 2.0 & 0.3 \\ 3242 \CFA coroutine (100M) & 32.5 & 32.9 & 0.8 \\ 3243 \CFA thread (100M) & 93.8 & 93.6 & 2.2 \\ 3244 \uC coroutine (100M) & 50.3 & 50.3 & 0.2 \\ 3245 \uC thread (100M) & 97.3 & 97.4 & 1.0 \\ 3246 Python generator (100M) & 40.9 & 41.3 & 1.5 \\ 3247 Node.js await (5M) & 1852.2 & 1854.7 & 16.4 \\ 3248 Node.js generator (100M) & 33.3 & 33.4 & 0.3 \\ 3249 Goroutine thread (100M) & 143.0 & 143.3 & 1.1 \\ 3250 Rust async await (100M) & 32.0 & 32.0 & 0.0 \\ 3251 Rust tokio thread (100M) & 143.0 & 143.0 & 1.7 \\ 3252 Rust thread (25M) & 332.0 & 331.4 & 2.4 \\ 3253 Java thread (100M) & 405.0 & 415.0 & 17.6 \\ 3254 % Java thread ( 100 000 000) & 413.0 & 414.2 & 6.2 \\ 3255 % Java thread (5 000 000 000) & 415.0 & 415.2 & 6.1 \\ 3256 Pthreads thread (25M) & 334.3 & 335.2 & 3.9 3252 3257 \end{tabular} 3253 3258 \end{multicols} … … 3258 3263 Languages using 1:1 threading based on pthreads can at best meet or exceed, due to language overhead, the pthread results. 3259 3264 Note, pthreads has a fast zero-contention mutex lock checked in user space. 3260 Languages with M:N threading have better performance than 1:1 because there is no operating-system interactions. 3265 Languages with M:N threading have better performance than 1:1 because there is no operating-system interactions (context-switching or locking). 3266 As well, for locking experiments, M:N threading has less contention if only one kernel thread is used. 3261 3267 Languages with stackful coroutines have higher cost than stackless coroutines because of stack allocation and context switching; 3262 3268 however, stackful \uC and \CFA coroutines have approximately the same performance as stackless Python and Node.js generators. 3263 3269 The \CFA stackless generator is approximately 25 times faster for suspend/resume and 200 times faster for creation than stackless Python and Node.js generators. 3270 The Node.js context-switch is costly when asynchronous await must enter the event engine because a promise is not fulfilled. 3271 Finally, the benchmark results correlate across programming languages with and without JIT, indicating the JIT has completed any runtime optimizations. 3264 3272 3265 3273 … … 3319 3327 3320 3328 The authors recognize the design assistance of Aaron Moss, Rob Schluntz, Andrew Beach, and Michael Brooks; David Dice for commenting and helping with the Java benchmarks; and Gregor Richards for helping with the Node.js benchmarks. 3321 This research is funded by a grant fromWaterloo-Huawei (\url{http://www.huawei.com}) Joint Innovation Lab. %, and Peter Buhr is partially funded by the Natural Sciences and Engineering Research Council of Canada.3329 This research is funded by the NSERC/Waterloo-Huawei (\url{http://www.huawei.com}) Joint Innovation Lab. %, and Peter Buhr is partially funded by the Natural Sciences and Engineering Research Council of Canada. 3322 3330 3323 3331 {% -
doc/papers/concurrency/annex/local.bib
rae2c27a rc76bd34 59 59 @manual{Cpp-Transactions, 60 60 keywords = {C++, Transactional Memory}, 61 title = {Tech nical Specificationfor C++ Extensions for Transactional Memory},61 title = {Tech. Spec. for C++ Extensions for Transactional Memory}, 62 62 organization= {International Standard ISO/IEC TS 19841:2015 }, 63 63 publisher = {American National Standards Institute}, -
doc/papers/concurrency/mail2
rae2c27a rc76bd34 959 959 Software: Practice and Experience Editorial Office 960 960 961 962 963 Date: Wed, 2 Sep 2020 20:55:34 +0000 964 From: Richard Jones <onbehalfof@manuscriptcentral.com> 965 Reply-To: R.E.Jones@kent.ac.uk 966 To: tdelisle@uwaterloo.ca, pabuhr@uwaterloo.ca 967 Subject: Software: Practice and Experience - Decision on Manuscript ID 968 SPE-19-0219.R2 969 970 02-Sep-2020 971 972 Dear Dr Buhr, 973 974 Many thanks for submitting SPE-19-0219.R2 entitled "Advanced Control-flow and Concurrency in Cforall" to Software: Practice and Experience. The paper has now been reviewed and the comments of the referees are included at the bottom of this letter. I apologise for the length of time it has taken to get these. 975 976 Both reviewers consider this paper to be close to acceptance. However, before I can accept this paper, I would like you address the comments of Reviewer 2, particularly with regard to the description of the adaptation Java harness to deal with warmup. I would expect to see a convincing argument that the computation has reached a steady state. I would also like you to provide the values for N for each benchmark run. This should be very straightforward for you to do. There are a couple of papers on steady state that you may wish to consult (though I am certainly not pushing my own work). 977 978 1) Barrett, Edd; Bolz-Tereick, Carl Friedrich; Killick, Rebecca; Mount, Sarah and Tratt, Laurence. Virtual Machine Warmup Blows Hot and Cold. OOPSLA 2017. https://doi.org/10.1145/3133876 979 Virtual Machines (VMs) with Just-In-Time (JIT) compilers are traditionally thought to execute programs in two phases: the initial warmup phase determines which parts of a program would most benefit from dynamic compilation, before JIT compiling those parts into machine code; subsequently the program is said to be at a steady state of peak performance. Measurement methodologies almost always discard data collected during the warmup phase such that reported measurements focus entirely on peak performance. We introduce a fully automated statistical approach, based on changepoint analysis, which allows us to determine if a program has reached a steady state and, if so, whether that represents peak performance or not. Using this, we show that even when run in the most controlled of circumstances, small, deterministic, widely studied microbenchmarks often fail to reach a steady state of peak performance on a variety of common VMs. Repeating our experiment on 3 different machines, we found that at most 43.5% of pairs consistently reach a steady state of peak performance. 980 981 2) Kalibera, Tomas and Jones, Richard. Rigorous Benchmarking in Reasonable Time. ISMM 2013. https://doi.org/10.1145/2555670.2464160 982 Experimental evaluation is key to systems research. Because modern systems are complex and non-deterministic, good experimental methodology demands that researchers account for uncertainty. To obtain valid results, they are expected to run many iterations of benchmarks, invoke virtual machines (VMs) several times, or even rebuild VM or benchmark binaries more than once. All this repetition costs time to complete experiments. Currently, many evaluations give up on sufficient repetition or rigorous statistical methods, or even run benchmarks only in training sizes. The results reported often lack proper variation estimates and, when a small difference between two systems is reported, some are simply unreliable.In contrast, we provide a statistically rigorous methodology for repetition and summarising results that makes efficient use of experimentation time. Time efficiency comes from two key observations. First, a given benchmark on a given platform is typically prone to much less non-determinism than the common worst-case of published corner-case studies. Second, repetition is most needed where most uncertainty arises (whether between builds, between executions or between iterations). We capture experimentation cost with a novel mathematical model, which we use to identify the number of repetitions at each level of an experiment necessary and sufficient to obtain a given level of precision.We present our methodology as a cookbook that guides researchers on the number of repetitions they should run to obtain reliable results. We also show how to present results with an effect size confidence interval. As an example, we show how to use our methodology to conduct throughput experiments with the DaCapo and SPEC CPU benchmarks on three recent platforms. 983 984 You have 42 days from the date of this email to submit your revision. If you are unable to complete the revision within this time, please contact me to request a short extension. 985 986 You can upload your revised manuscript and submit it through your Author Center. Log into https://mc.manuscriptcentral.com/spe and enter your Author Center, where you will find your manuscript title listed under "Manuscripts with Decisions". 987 988 When submitting your revised manuscript, you will be able to respond to the comments made by the referee(s) in the space provided. You can use this space to document any changes you make to the original manuscript. 989 990 If you would like help with English language editing, or other article preparation support, Wiley Editing Services offers expert help with English Language Editing, as well as translation, manuscript formatting, and figure formatting at www.wileyauthors.com/eeo/preparation. You can also check out our resources for Preparing Your Article for general guidance about writing and preparing your manuscript at www.wileyauthors.com/eeo/prepresources. 991 992 Once again, thank you for submitting your manuscript to Software: Practice and Experience. I look forward to receiving your revision. 993 994 Sincerely, 995 Richard 996 997 Prof. Richard Jones 998 Editor, Software: Practice and Experience 999 R.E.Jones@kent.ac.uk 1000 1001 Referee(s)' Comments to Author: 1002 1003 Reviewing: 1 1004 1005 Comments to the Author 1006 Overall, I felt that this draft was an improvement on previous drafts and I don't have further changes to request. 1007 1008 I appreciated the new language to clarify the relationship of external and internal scheduling, for example, as well as the new measurements of Rust tokio. Also, while I still believe that the choice between thread/generator/coroutine and so forth could be made crisper and clearer, the current draft of Section 2 did seem adequate to me in terms of specifying the considerations that users would have to take into account to make the choice. 1009 1010 1011 Reviewing: 2 1012 1013 Comments to the Author 1014 First: let me apologise for the delay on this review. I'll blame the global pandemic combined with my institution's senior management's counterproductive decisions for taking up most of my time and all of my energy. 1015 1016 At this point, reading the responses, I think we've been around the course enough times that further iteration is unlikely to really improve the paper any further, so I'm happy to recommend acceptance. My main comments are that there were some good points in the responses to *all* the reviews and I strongly encourage the authors to incorporate those discursive responses into the final paper so they may benefit readers as well as reviewers. I agree with the recommendations of reviewer #2 that the paper could usefully be split in to two, which I think I made to a previous revision, but I'm happy to leave that decision to the Editor. 1017 1018 Finally, the paper needs to describe how the Java harness was adapted to deal with warmup; why the computation has warmed up and reached a steady state - similarly for js and Python. The tables should also give the "N" chosen for each benchmark run. 1019 1020 minor points 1021 * don't start sentences with "However" 1022 * most downloaded isn't an "Award" 1023 1024 1025 1026 Date: Thu, 1 Oct 2020 05:34:29 +0000 1027 From: Richard Jones <onbehalfof@manuscriptcentral.com> 1028 Reply-To: R.E.Jones@kent.ac.uk 1029 To: pabuhr@uwaterloo.ca 1030 Subject: Revision reminder - SPE-19-0219.R2 1031 1032 01-Oct-2020 1033 1034 Dear Dr Buhr 1035 1036 SPE-19-0219.R2 1037 1038 This is a reminder that your opportunity to revise and re-submit your manuscript will expire 14 days from now. If you require more time please contact me directly and I may grant an extension to this deadline, otherwise the option to submit a revision online, will not be available. 1039 1040 If your article is of potential interest to the general public, (which means it must be timely, groundbreaking, interesting and impact on everyday society) then please e-mail ejp@wiley.co.uk explaining the public interest side of the research. Wiley will then investigate the potential for undertaking a global press campaign on the article. 1041 1042 I look forward to receiving your revision. 1043 1044 Sincerely, 1045 1046 Prof. Richard Jones 1047 Editor, Software: Practice and Experience 1048 1049 https://mc.manuscriptcentral.com/spe 1050 1051 1052 1053 Date: Tue, 6 Oct 2020 15:29:41 +0000 1054 From: Mayank Roy Chowdhury <onbehalfof@manuscriptcentral.com> 1055 Reply-To: speoffice@wiley.com 1056 To: tdelisle@uwaterloo.ca, pabuhr@uwaterloo.ca 1057 Subject: SPE-19-0219.R3 successfully submitted 1058 1059 06-Oct-2020 1060 1061 Dear Dr Buhr, 1062 1063 Your manuscript entitled "Advanced Control-flow and Concurrency in Cforall" has been successfully submitted online and is presently being given full consideration for publication in Software: Practice and Experience. 1064 1065 Your manuscript number is SPE-19-0219.R3. Please mention this number in all future correspondence regarding this submission. 1066 1067 You can view the status of your manuscript at any time by checking your Author Center after logging into https://mc.manuscriptcentral.com/spe. If you have difficulty using this site, please click the 'Get Help Now' link at the top right corner of the site. 1068 1069 1070 Thank you for submitting your manuscript to Software: Practice and Experience. 1071 1072 Sincerely, 1073 1074 Software: Practice and Experience Editorial Office 1075 -
doc/refrat/refrat.tex
rae2c27a rc76bd34 11 11 %% Created On : Wed Apr 6 14:52:25 2016 12 12 %% Last Modified By : Peter A. Buhr 13 %% Last Modified On : Wed Jan 31 17:30:23 201814 %% Update Count : 1 0813 %% Last Modified On : Mon Oct 5 09:02:53 2020 14 %% Update Count : 110 15 15 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 16 16 … … 30 30 \usepackage{upquote} % switch curled `'" to straight 31 31 \usepackage{calc} 32 \usepackage{xspace}33 32 \usepackage{varioref} % extended references 34 \usepackage{listings} % format program code35 33 \usepackage[flushmargin]{footmisc} % support label/reference in footnote 36 34 \usepackage{latexsym} % \Box glyph 37 35 \usepackage{mathptmx} % better math font with "times" 38 36 \usepackage[usenames]{color} 39 \input{common} % common CFA document macros 40 \usepackage[dvips,plainpages=false,pdfpagelabels,pdfpagemode=UseNone,colorlinks=true,pagebackref=true,linkcolor=blue,citecolor=blue,urlcolor=blue,pagebackref=true,breaklinks=true]{hyperref} 41 \usepackage{breakurl} 42 \renewcommand{\UrlFont}{\small\sf} 43 44 \usepackage[pagewise]{lineno} 45 \renewcommand{\linenumberfont}{\scriptsize\sffamily} 46 \usepackage[firstpage]{draftwatermark} 47 \SetWatermarkLightness{0.9} 48 49 % Default underscore is too low and wide. Cannot use lstlisting "literate" as replacing underscore 50 % removes it as a variable-name character so keywords in variables are highlighted. MUST APPEAR 51 % AFTER HYPERREF. 52 \renewcommand{\textunderscore}{\leavevmode\makebox[1.2ex][c]{\rule{1ex}{0.075ex}}} 53 54 \setlength{\topmargin}{-0.45in} % move running title into header 55 \setlength{\headsep}{0.25in} 56 57 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 58 59 \CFAStyle % use default CFA format-style 60 \lstnewenvironment{C++}[1][] % use C++ style 61 {\lstset{language=C++,moredelim=**[is][\protect\color{red}]{®}{®}#1}} 62 {} 63 37 \newcommand{\CFALatin}{} 64 38 % inline code ©...© (copyright symbol) emacs: C-q M-) 65 39 % red highlighting ®...® (registered trademark symbol) emacs: C-q M-. … … 69 43 % keyword escape ¶...¶ (pilcrow symbol) emacs: C-q M-^ 70 44 % math escape $...$ (dollar symbol) 45 \input{common} % common CFA document macros 46 \usepackage[dvips,plainpages=false,pdfpagelabels,pdfpagemode=UseNone,colorlinks=true,pagebackref=true,linkcolor=blue,citecolor=blue,urlcolor=blue,pagebackref=true,breaklinks=true]{hyperref} 47 \usepackage{breakurl} 48 \renewcommand{\UrlFont}{\small\sf} 49 50 \usepackage[pagewise]{lineno} 51 \renewcommand{\linenumberfont}{\scriptsize\sffamily} 52 \usepackage[firstpage]{draftwatermark} 53 \SetWatermarkLightness{0.9} 54 55 % Default underscore is too low and wide. Cannot use lstlisting "literate" as replacing underscore 56 % removes it as a variable-name character so keywords in variables are highlighted. MUST APPEAR 57 % AFTER HYPERREF. 58 \renewcommand{\textunderscore}{\leavevmode\makebox[1.2ex][c]{\rule{1ex}{0.075ex}}} 59 60 \setlength{\topmargin}{-0.45in} % move running title into header 61 \setlength{\headsep}{0.25in} 71 62 72 63 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 73 64 65 \CFAStyle % use default CFA format-style 66 \lstnewenvironment{C++}[1][] % use C++ style 67 {\lstset{language=C++,moredelim=**[is][\protect\color{red}]{®}{®},#1}} 68 {} 69 70 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 71 74 72 % Names used in the document. 75 \newcommand{\Version}{\input{ ../../version}}73 \newcommand{\Version}{\input{build/version}} 76 74 \newcommand{\Textbf}[2][red]{{\color{#1}{\textbf{#2}}}} 77 75 \newcommand{\Emph}[2][red]{{\color{#1}\textbf{\emph{#2}}}} -
doc/theses/andrew_beach_MMath/thesis.tex
rae2c27a rc76bd34 34 34 \usepackage[toc,abbreviations]{glossaries-extra} 35 35 36 % Main glossary entries -- definitions of relevant terminology 37 \newglossaryentry{computer} 38 { 39 name=computer, 40 description={A programmable machine that receives input data, 41 stores and manipulates the data, and provides 42 formatted output} 43 } 44 45 % Nomenclature glossary entries -- New definitions, or unusual terminology 46 \newglossary*{nomenclature}{Nomenclature} 47 \newglossaryentry{dingledorf} 48 { 49 type=nomenclature, 50 name=dingledorf, 51 description={A person of supposed average intelligence who makes incredibly 52 brainless misjudgments} 53 } 54 55 % List of Abbreviations (abbreviations are from the glossaries-extra package) 56 \newabbreviation{aaaaz}{AAAAZ}{American Association of Amature Astronomers 57 and Zoologists} 58 59 % List of Symbols 60 \newglossary*{symbols}{List of Symbols} 61 \newglossaryentry{rvec} 62 { 63 name={$\mathbf{v}$}, 64 sort={label}, 65 type=symbols, 66 description={Random vector: a location in n-dimensional Cartesian space, where 67 each dimensional component is determined by a random process} 68 } 36 % Define all the glossaries. 37 \input{glossaries} 69 38 70 39 % Generate the glossaries defined above. -
doc/theses/thierry_delisle_PhD/.gitignore
rae2c27a rc76bd34 11 11 comp_II/comp_II.pdf 12 12 comp_II/comp_II.ps 13 comp_II/presentation.pdf 14 15 thesis/build/ 16 thesis/fig/*.fig.bak 17 thesis/thesis.pdf 18 thesis/thesis.ps 13 19 14 20 !Makefile -
doc/theses/thierry_delisle_PhD/comp_II/comp_II.tex
rae2c27a rc76bd34 60 60 \section{Introduction} 61 61 \subsection{\CFA and the \CFA concurrency package} 62 \CFA \cite{Moss18} is a modern, polymorphic, non-object-oriented, concurrent, backwards-compatible extension of the C programming language.62 \CFA~\cite{Moss18} is a modern, polymorphic, non-object-oriented, concurrent, backwards-compatible extension of the C programming language. 63 63 It aims to add high-productivity features while maintaining the predictable performance of C. 64 As such, concurrency in \CFA \cite{Delisle19} aims to offer simple and safe high-level tools while still allowing performant code.65 \CFA concurrent code is written in the synchronous programming paradigm but uses \glspl{uthrd} in orderto achieve the simplicity and maintainability of synchronous programming without sacrificing the efficiency of asynchronous programming.64 As such, concurrency in \CFA~\cite{Delisle19} aims to offer simple and safe high-level tools while still allowing performant code. 65 \CFA concurrent code is written in the synchronous programming paradigm but uses \glspl{uthrd} to achieve the simplicity and maintainability of synchronous programming without sacrificing the efficiency of asynchronous programming. 66 66 As such, the \CFA \newterm{scheduler} is a preemptive user-level scheduler that maps \glspl{uthrd} onto \glspl{kthrd}. 67 67 68 \subsection{Scheduling} 68 69 \newterm{Scheduling} occurs when execution switches from one thread to another, where the second thread is implicitly chosen by the scheduler. 69 This scheduling is an indirect handoff, as opposed to generators and coroutines whichexplicitly switch to the next generator and coroutine respectively.70 This scheduling is an indirect handoff, as opposed to generators and coroutines that explicitly switch to the next generator and coroutine respectively. 70 71 The cost of switching between two threads for an indirect handoff has two components: 71 72 \begin{enumerate} … … 75 76 and the cost of scheduling, \ie deciding which thread to run next among all the threads ready to run. 76 77 \end{enumerate} 77 The first cost is generally constant and fixed\footnote{Affecting the constant context-switch cost is whether it is done in one step, after the scheduling, or in two steps, context-switching to a third fixed thread before scheduling.}, while the scheduling cost can vary based on the system state.78 Adding multiple \glspl{kthrd} does not fundamentally change the scheduler semantics or requirements, it simply adds new correctness requirements, \ie \newterm{linearizability}\footnote{Meaning , however fast the CPU threads run, there is an equivalent sequential order that gives the same result.}, and a new dimension to performance: scalability, where scheduling cost nowalso depends on contention.78 The first cost is generally constant\footnote{Affecting the constant context-switch cost is whether it is done in one step, where the first thread schedules the second, or in two steps, where the first thread context switches to a third scheduler thread.}, while the scheduling cost can vary based on the system state. 79 Adding multiple \glspl{kthrd} does not fundamentally change the scheduler semantics or requirements, it simply adds new correctness requirements, \ie \newterm{linearizability}\footnote{Meaning however fast the CPU threads run, there is an equivalent sequential order that gives the same result.}, and a new dimension to performance: scalability, where scheduling cost also depends on contention. 79 80 The more threads switch, the more the administration cost of scheduling becomes noticeable. 80 81 It is therefore important to build a scheduler with the lowest possible cost and latency. 81 82 Another important consideration is \newterm{fairness}. 82 83 In principle, scheduling should give the illusion of perfect fairness, where all threads ready to run are running \emph{simultaneously}. 84 In practice, there can be advantages to unfair scheduling, similar to the express cash register at a grocery store. 83 85 While the illusion of simultaneity is easier to reason about, it can break down if the scheduler allows too much unfairness. 84 86 Therefore, the scheduler should offer as much fairness as needed to guarantee eventual progress, but use unfairness to help performance. 85 In practice, threads must wait in turn but there can be advantages to unfair scheduling, similar to the express cash register at a grocery store. 86 87 The goal of this research is to produce a scheduler that is simple for programmers to understand and offers good performance.87 88 \subsection{Research Goal} 89 The goal of this research is to produce a scheduler that is simple for programmers to understand and offers good general performance. 88 90 Here understandability does not refer to the API but to how much scheduling concerns programmers need to take into account when writing a \CFA concurrent package. 89 Therefore, the main goal of this proposal is :91 Therefore, the main consequence of this goal is : 90 92 \begin{quote} 91 93 The \CFA scheduler should be \emph{viable} for \emph{any} workload. 92 94 \end{quote} 93 95 94 For a general-purpose scheduler, it is impossible to produce an optimal algorithm as it would requireknowledge of the future behaviour of threads.95 As such, scheduling performance is generally either defined by the best-case scenario, \ie a workload to which the scheduler is tailored, or theworst-case scenario, \ie the scheduler behaves no worse than \emph{X}.96 For a general-purpose scheduler, it is impossible to produce an optimal algorithm as that requires knowledge of the future behaviour of threads. 97 As such, scheduling performance is generally either defined by a best-case scenario, \ie a workload to which the scheduler is tailored, or a worst-case scenario, \ie the scheduler behaves no worse than \emph{X}. 96 98 For this proposal, the performance is evaluated using the second approach to allow \CFA programmers to rely on scheduling performance. 97 99 Because there is no optimal scheduler, ultimately \CFA may allow programmers to write their own scheduler; but that is not the subject of this proposal, which considers only the default scheduler. … … 103 105 \item creating an abstraction layer over the operating system to handle kernel-threads spinning unnecessarily, 104 106 \item scheduling blocking I/O operations, 105 \item and writing sufficient library tools to allow developers to indirectly use the scheduler, either through tuning knobs or replacing the default scheduler.107 \item and writing sufficient library tools to allow developers to indirectly use the scheduler, either through tuning knobs in the default scheduler or replacing the default scheduler. 106 108 \end{enumerate} 107 109 … … 119 121 \paragraph{Performance} The performance of a scheduler can generally be measured in terms of scheduling cost, scalability and latency. 120 122 \newterm{Scheduling cost} is the cost to switch from one thread to another, as mentioned above. 121 For simple applications, where a single kernel thread does most of the scheduling, it is generally the dominating cost. 122 \newterm{Scalability} is the cost of adding multiple kernel threads because it increases the time for context switching because of contention by multiple threads accessing shared resources, \eg the ready queue. 123 For compute-bound concurrent applications with little context switching, the scheduling cost is negligible. 124 For applications with high context-switch rates, scheduling cost can begin to dominating the cost. 125 \newterm{Scalability} is the cost of adding multiple kernel threads. 126 It can increase the time for scheduling because of contention from the multiple threads accessing shared resources, \eg a single ready queue. 123 127 Finally, \newterm{tail latency} is service delay and relates to thread fairness. 124 Specifically, latency measures how long a thread waits to run once scheduled and is evaluated inthe worst case.128 Specifically, latency measures how long a thread waits to run once scheduled and is evaluated by the worst case. 125 129 The \CFA scheduler should offer good performance for all three metrics. 126 130 … … 128 132 \newterm{Eventual progress} guarantees every scheduled thread is eventually run, \ie prevent starvation. 129 133 As a hard requirement, the \CFA scheduler must guarantee eventual progress, otherwise the above-mentioned illusion of simultaneous execution is broken and the scheduler becomes much more complex to reason about. 130 \newterm{Predictability} and \newterm{reliability} mean similar workloads achieve similar performance andprogrammer execution intuition is respected.131 For example, a thread that yields aggressively should not run more often than other t asks.134 \newterm{Predictability} and \newterm{reliability} mean similar workloads achieve similar performance so programmer execution intuition is respected. 135 For example, a thread that yields aggressively should not run more often than other threads. 132 136 While this is intuitive, it does not hold true for many work-stealing or feedback based schedulers. 133 The \CFA scheduler must guarantee eventual progress and should be predictableand offer reliable performance.137 The \CFA scheduler must guarantee eventual progress, should be predictable, and offer reliable performance. 134 138 135 139 \paragraph{Efficiency} Finally, efficient usage of CPU resources is also an important requirement and is discussed in depth towards the end of the proposal. 136 \newterm{Efficiency} means avoiding using CPU cycles when there are no threads to run , and conversely, use all CPUs availablewhen the workload can benefit from it.140 \newterm{Efficiency} means avoiding using CPU cycles when there are no threads to run (to conserve energy), and conversely, using as many available CPU cycles when the workload can benefit from it. 137 141 Balancing these two states is where the complexity lies. 138 142 The \CFA scheduler should be efficient with respect to the underlying (shared) computer. … … 146 150 \begin{enumerate} 147 151 \item Threads live long enough for useful feedback information to be gathered. 148 \item Threads belong to multiple users so fairness across threads is insufficient.152 \item Threads belong to multiple users so fairness across users is important. 149 153 \end{enumerate} 150 154 … … 159 163 In the case of the \CFA scheduler, every thread runs in the same user space and is controlled by the same user. 160 164 Fairness across users is therefore a given and it is then possible to safely ignore the possibility that threads are malevolent. 161 This approach allows for a much simpler fairness metric and in this proposal \emph{fairness} is defined as: when multiple threads are cycling through the system, the total ordering of threads being scheduled, \ie pushed onto the ready queue, should not differ much from the total ordering of threads being executed, \ie popped from the ready queue. 165 This approach allows for a much simpler fairness metric, and in this proposal, \emph{fairness} is defined as: 166 \begin{quote} 167 When multiple threads are cycling through the system, the total ordering of threads being scheduled, \ie pushed onto the ready queue, should not differ much from the total ordering of threads being executed, \ie popped from the ready queue. 168 \end{quote} 162 169 163 170 Since feedback is not necessarily feasible within the lifetime of all threads and a simple fairness metric can be used, the scheduling strategy proposed for the \CFA runtime does not use per-threads feedback. … … 169 176 Threads with equal priority are scheduled using a secondary strategy, often something simple like round robin or FIFO. 170 177 A consequence of priority is that, as long as there is a thread with a higher priority that desires to run, a thread with a lower priority does not run. 171 Th is possible starving of threads can dramatically increaseprogramming complexity since starving threads and priority inversion (prioritizing a lower priority thread) can both lead to serious problems.178 The potential for thread starvation dramatically increases programming complexity since starving threads and priority inversion (prioritizing a lower priority thread) can both lead to serious problems. 172 179 173 180 An important observation is that threads do not need to have explicit priorities for problems to occur. 174 Indeed, any system with multiple ready queues that attempts to exhaust one queue before accessing the other queues, essentially provide implicit priority, which can encounter starvation problems.181 Indeed, any system with multiple ready queues that attempts to exhaust one queue before accessing the other queues, essentially provides implicit priority, which can encounter starvation problems. 175 182 For example, a popular scheduling strategy that suffers from implicit priorities is work stealing. 176 183 \newterm{Work stealing} is generally presented as follows: … … 180 187 \item If a processor's ready queue is empty, attempt to run threads from some other processor's ready queue. 181 188 \end{enumerate} 182 183 189 In a loaded system\footnote{A \newterm{loaded system} is a system where threads are being run at the same rate they are scheduled.}, if a thread does not yield, block, or preempt for an extended period of time, threads on the same processor's list starve if no other processors exhaust their list. 184 190 185 Since priorities can be complex for programmers to incorporate into their execution intuition, the scheduling strategy proposed for the \CFA runtime does not use a strategy with either implicit or explicit threadpriorities.191 Since priorities can be complex for programmers to incorporate into their execution intuition, the \CFA scheduling strategy does not provided explicit priorities and attempts to eliminate implicit priorities. 186 192 187 193 \subsection{Schedulers without feedback or priorities} … … 191 197 Thankfully, strict FIFO is not needed for sufficient fairness. 192 198 Since concurrency is inherently non-deterministic, fairness concerns in scheduling are only a problem if a thread repeatedly runs before another thread can run. 193 Some relaxation is possible because non-determinism means programmers already handle ordering problems to produce correct code and hence rely on weak guarantees, \eg that a specific thread will \emph{eventually} run.199 Some relaxation is possible because non-determinism means programmers already handle ordering problems to produce correct code and hence rely on weak guarantees, \eg that a thread \emph{eventually} runs. 194 200 Since some reordering does not break correctness, the FIFO fairness guarantee can be significantly relaxed without causing problems. 195 201 For this proposal, the target guarantee is that the \CFA scheduler provides \emph{probable} FIFO ordering, which allows reordering but makes it improbable that threads are reordered far from their position in total ordering. 196 202 197 203 The \CFA scheduler fairness is defined as follows: 198 \begin{ itemize}199 \itemGiven two threads $X$ and $Y$, the odds that thread $X$ runs $N$ times \emph{after} thread $Y$ is scheduled but \emph{before} it is run, decreases exponentially with regard to $N$.200 \end{ itemize}204 \begin{quote} 205 Given two threads $X$ and $Y$, the odds that thread $X$ runs $N$ times \emph{after} thread $Y$ is scheduled but \emph{before} it is run, decreases exponentially with regard to $N$. 206 \end{quote} 201 207 While this is not a bounded guarantee, the probability that unfairness persist for long periods of times decreases exponentially, making persisting unfairness virtually impossible. 202 208 … … 210 216 The described queue uses an array of underlying strictly FIFO queues as shown in Figure~\ref{fig:base}\footnote{For this section, the number of underlying queues is assumed to be constant. 211 217 Section~\ref{sec:resize} discusses resizing the array.}. 212 Pushing new data is done by selecting one of the se underlying queues at random, recording a timestamp for the operationand pushing to the selected queue.218 Pushing new data is done by selecting one of the underlying queues at random, recording a timestamp for the operation, and pushing to the selected queue. 213 219 Popping is done by selecting two queues at random and popping from the queue with the oldest timestamp. 214 A higher number of underlying queues lead to less contention on each queue and therefore better performance.215 In a loaded system, it is highly likely the queues are non-empty, \ie several t asks are on each of the underlying queues.216 This means thatselecting a queue at random to pop from is highly likely to yield a queue with available items.220 A higher number of underlying queues leads to less contention on each queue and therefore better performance. 221 In a loaded system, it is highly likely the queues are non-empty, \ie several threads are on each of the underlying queues. 222 For this case, selecting a queue at random to pop from is highly likely to yield a queue with available items. 217 223 In Figure~\ref{fig:base}, ignoring the ellipsis, the chances of getting an empty queue is 2/7 per pick, meaning two random picks yield an item approximately 9 times out of 10. 218 224 … … 221 227 \input{base.pstex_t} 222 228 \end{center} 223 \caption{ Relaxed FIFO list at the base of the scheduler:an array of strictly FIFO lists.224 The timestamp is in all nodes and cell arrays.}229 \caption{Loaded relaxed FIFO list base on an array of strictly FIFO lists. 230 A timestamp appears in each node and array cell.} 225 231 \label{fig:base} 226 232 \end{figure} … … 230 236 \input{empty.pstex_t} 231 237 \end{center} 232 \caption{ ``More empty'' state of the queue:the array contains many empty cells.}238 \caption{Underloaded relaxed FIFO list where the array contains many empty cells.} 233 239 \label{fig:empty} 234 240 \end{figure} 235 241 236 When the ready queue is \emph{more empty}, \ie several of the queues are empty,selecting a random queue for popping is less likely to yield a successful selection and more attempts are needed, resulting in a performance degradation.242 In an underloaded system, several of the queues are empty, so selecting a random queue for popping is less likely to yield a successful selection and more attempts are needed, resulting in a performance degradation. 237 243 Figure~\ref{fig:empty} shows an example with fewer elements, where the chances of getting an empty queue is 5/7 per pick, meaning two random picks yield an item only half the time. 238 244 Since the ready queue is not empty, the pop operation \emph{must} find an element before returning and therefore must retry. … … 262 268 \end{table} 263 269 264 Performance can be improved in case~D (Table~\ref{tab:perfcases})by adding information to help processors find which inner queues are used.270 Performance can be improved in Table~\ref{tab:perfcases} case~D by adding information to help processors find which inner queues are used. 265 271 This addition aims to avoid the cost of retrying the pop operation but does not affect contention on the underlying queues and can incur some management cost for both push and pop operations. 266 272 The approach used to encode this information can vary in density and be either global or local. … … 273 279 With a multi-word bitmask, this maximum limit can be increased arbitrarily, but it is not possible to check if the queue is empty by reading the bitmask atomically. 274 280 275 Finally, a dense bitmap, either single or multi-word, causes additional problems in case C (Table 1), because many processors are continuously scanning the bitmask to find the few available threads.281 Finally, a dense bitmap, either single or multi-word, causes additional problems in Table~\ref{tab:perfcases} case C, because many processors are continuously scanning the bitmask to find the few available threads. 276 282 This increased contention on the bitmask(s) reduces performance because of cache misses after updates and the bitmask is updated more frequently by the scanning processors racing to read and/or update that information. 277 283 This increased update frequency means the information in the bitmask is more often stale before a processor can use it to find an item, \ie mask read says there are available user threads but none on queue. … … 279 285 \begin{figure} 280 286 \begin{center} 281 {\resizebox{0.8\textwidth}{!}{\input{emptybit}}} 282 \end{center} 283 \caption{``More empty'' queue with added bitmask to indicate which array cells have items.} 287 {\resizebox{0.73\textwidth}{!}{\input{emptybit}}} 288 \end{center} 289 \vspace*{-5pt} 290 \caption{Underloaded queue with added bitmask to indicate which array cells have items.} 284 291 \label{fig:emptybit} 292 \begin{center} 293 {\resizebox{0.73\textwidth}{!}{\input{emptytree}}} 294 \end{center} 295 \vspace*{-5pt} 296 \caption{Underloaded queue with added binary search tree indicate which array cells have items.} 297 \label{fig:emptytree} 298 \begin{center} 299 {\resizebox{0.9\textwidth}{!}{\input{emptytls}}} 300 \end{center} 301 \vspace*{-5pt} 302 \caption{Underloaded queue with added per processor bitmask to indicate which array cells have items.} 303 \label{fig:emptytls} 285 304 \end{figure} 286 305 287 Figure~\ref{fig:emptytree} shows another approach using a hierarchical tree data-structure to reduce contention and has been shown to work in similar cases~\cite{ellen2007snzi}\footnote{This particular paper seems to be patented in the US. 288 How does that affect \CFA? Can I use it in my work?}. 289 However, this approach may lead to poorer performance in case~B (Table~\ref{tab:perfcases}) due to the inherent pointer chasing cost and already low contention cost in that case. 290 291 \begin{figure} 292 \begin{center} 293 {\resizebox{0.8\textwidth}{!}{\input{emptytree}}} 294 \end{center} 295 \caption{``More empty'' queue with added binary search tree indicate which array cells have items.} 296 \label{fig:emptytree} 297 \end{figure} 298 299 Finally, a third approach is to use dense information, similar to the bitmap, but have each thread keep its own independent copy of it. 306 Figure~\ref{fig:emptytree} shows an approach using a hierarchical tree data-structure to reduce contention and has been shown to work in similar cases~\cite{ellen2007snzi}. 307 However, this approach may lead to poorer performance in Table~\ref{tab:perfcases} case~B due to the inherent pointer chasing cost and already low contention cost in that case. 308 309 Figure~\ref{fig:emptytls} shows an approach using dense information, similar to the bitmap, but have each thread keep its own independent copy of it. 300 310 While this approach can offer good scalability \emph{and} low latency, the liveliness of the information can become a problem. 301 In the simple cases, local copies of which underlying queues are emptycan become stale and end-up not being useful for the pop operation.311 In the simple cases, local copies can become stale and end-up not being useful for the pop operation. 302 312 A more serious problem is that reliable information is necessary for some parts of this algorithm to be correct. 303 313 As mentioned in this section, processors must know \emph{reliably} whether the list is empty or not to decide if they can return \texttt{NULL} or if they must keep looking during a pop operation. 304 314 Section~\ref{sec:sleep} discusses another case where reliable information is required for the algorithm to be correct. 305 315 306 \begin{figure}307 \begin{center}308 \input{emptytls}309 \end{center}310 \caption{``More empty'' queue with added per processor bitmask to indicate which array cells have items.}311 \label{fig:emptytls}312 \end{figure}313 314 316 There is a fundamental tradeoff among these approach. 315 Dense global information about empty underlying queues helps zero-contention cases at the cost of high-contention case.316 Sparse global information helps high-contention cases but increases latency in zero-contention -cases,to read and ``aggregate'' the information\footnote{Hierarchical structures, \eg binary search tree, effectively aggregate information but follow pointer chains, learning information at each node.317 Dense global information about empty underlying queues helps zero-contention cases at the cost of the high-contention case. 318 Sparse global information helps high-contention cases but increases latency in zero-contention cases to read and ``aggregate'' the information\footnote{Hierarchical structures, \eg binary search tree, effectively aggregate information but follow pointer chains, learning information at each node. 317 319 Similarly, other sparse schemes need to read multiple cachelines to acquire all the information needed.}. 318 Finally, dense local information has both the advantages of low latency in zero-contention cases and scalability in high-contention cases. However the information can become stale making it difficult to use to ensure correctness. 320 Finally, dense local information has both the advantages of low latency in zero-contention cases and scalability in high-contention cases. 321 However, the information can become stale making it difficult to use to ensure correctness. 319 322 The fact that these solutions have these fundamental limits suggest to me a better solution that attempts to combine these properties in an interesting way. 320 323 Also, the lock discussed in Section~\ref{sec:resize} allows for solutions that adapt to the number of processors, which could also prove useful. … … 323 326 324 327 How much scalability is actually needed is highly debatable. 325 \emph{libfibre} \cite{libfibre} has compared favourably to other schedulers in webserver tests\cite{Karsten20} and uses a single atomic counter in its scheduling algorithm similarly to the proposed bitmask.328 \emph{libfibre}~\cite{libfibre} has compared favourably to other schedulers in webserver tests~\cite{Karsten20} and uses a single atomic counter in its scheduling algorithm similarly to the proposed bitmask. 326 329 As such, the single atomic instruction on a shared cacheline may be sufficiently performant. 327 330 328 I have built a prototype of this ready queue in the shape of a data queue, \ie nodes on the queue are structures with a single intrepresenting a thread and intrusive data fields.329 Using this prototype, I ran preliminary performance experiments thatconfirm the expected performance in Table~\ref{tab:perfcases}.330 However, these experiments only offer a hint at the actual performance of the scheduler since threads form more complex operations than simple integer nodes, \eg threads are not independent of each other,when a thread blocks some other thread must intervene to wake it.331 I have built a prototype of this ready queue in the shape of a data queue, \ie nodes on the queue are structures with a single $int$ representing a thread and intrusive data fields. 332 Using this prototype, preliminary performance experiments confirm the expected performance in Table~\ref{tab:perfcases}. 333 However, these experiments only offer a hint at the actual performance of the scheduler since threads are involved in more complex operations, \eg threads are not independent of each other: when a thread blocks some other thread must intervene to wake it. 331 334 332 335 I have also integrated this prototype into the \CFA runtime, but have not yet created performance experiments to compare results, as creating one-to-one comparisons between the prototype and the \CFA runtime will be complex. … … 345 348 Threads on a cluster are always scheduled on one of the processors of the cluster. 346 349 Currently, the runtime handles dynamically adding and removing processors from clusters at any time. 347 Since this is part of the existing design, the proposed scheduler must also support this behaviour.350 Since this feature is part of the existing design, the proposed scheduler must also support this behaviour. 348 351 However, dynamically resizing a cluster is considered a rare event associated with setup, tear down and major configuration changes. 349 352 This assumption is made both in the design of the proposed scheduler as well as in the original design of the \CFA runtime system. 350 353 As such, the proposed scheduler must honour the correctness of this behaviour but does not have any performance objectives with regard to resizing a cluster. 351 How long adding or removing processors takeand how much this disrupts the performance of other threads is considered a secondary concern since it should be amortized over long periods of times.354 That is, the time to add or remove processors and how much this disrupts the performance of other threads is considered a secondary concern since it should be amortized over long periods of times. 352 355 However, as mentioned in Section~\ref{sec:queue}, contention on the underlying queues can have a direct impact on performance. 353 356 The number of underlying queues must therefore be adjusted as the number of processors grows or shrinks. … … 371 374 372 375 There are possible alternatives to the reader-writer lock solution. 373 This problem is effectively a memory reclamation problem and as such there is a large body of research on the subject \cite{michael2004hazard, brown2015reclaiming}.376 This problem is effectively a memory reclamation problem and as such there is a large body of research on the subject~\cite{brown2015reclaiming, michael2004hazard}. 374 377 However, the reader-write lock-solution is simple and can be leveraged to solve other problems (\eg processor ordering and memory reclamation of threads), which makes it an attractive solution. 375 378 … … 401 404 Individual processors always finish scheduling user threads before looking for new work, which means that the last processor to go to sleep cannot miss threads scheduled from inside the cluster (if they do, that demonstrates the ready queue is not linearizable). 402 405 However, this guarantee does not hold if threads are scheduled from outside the cluster, either due to an external event like timers and I/O, or due to a user (or kernel) thread migrating from a different cluster. 403 In this case, missed signals can lead to the cluster deadlocking\footnote{Clusters should only deadlock in cases where a \CFA programmer \emph{actually} write \CFA code that leads to a deadlock.}.406 In this case, missed signals can lead to the cluster deadlocking\footnote{Clusters should only deadlock in cases where a \CFA programmer \emph{actually} writes \CFA code that leads to a deadlock.}. 404 407 Therefore, it is important that the scheduling of threads include a mechanism where signals \emph{cannot} be missed. 405 408 For performance reasons, it can be advantageous to have a secondary mechanism that allows signals to be missed in cases where it cannot lead to a deadlock. 406 To be safe, this process must include a ``handshake'' where it is guaranteed that either~: the sleeping processor notices that a user thread is scheduled after the sleeping processor signalled its intent to block or code scheduling threads sees the intent to sleep before scheduling and be able to wake-up the processor. 409 To be safe, this process must include a ``handshake'' where it is guaranteed that either: 410 \begin{enumerate} 411 \item 412 the sleeping processor notices that a user thread is scheduled after the sleeping processor signalled its intent to block or 413 \item 414 code scheduling threads sees the intent to sleep before scheduling and be able to wake-up the processor. 415 \end{enumerate} 407 416 This matter is complicated by the fact that pthreads and Linux offer few tools to implement this solution and no guarantee of ordering of threads waking up for most of these tools. 408 417 409 418 Another important issue is avoiding kernel threads sleeping and waking frequently because there is a significant operating-system cost. 410 This scenario happens when a program oscillates between high and low activity, needing most and then few erprocessors.419 This scenario happens when a program oscillates between high and low activity, needing most and then few processors. 411 420 A possible partial solution is to order the processors so that the one which most recently went to sleep is woken up. 412 421 This allows other sleeping processors to reach deeper sleep state (when these are available) while keeping ``hot'' processors warmer. … … 417 426 Processors that are unnecessarily unblocked lead to unnecessary contention, CPU usage, and power consumption, while too many sleeping processors can lead to suboptimal throughput. 418 427 Furthermore, transitions from sleeping to awake and vice versa also add unnecessary latency. 419 There is already a wealth of research on the subject \cite{schillings1996engineering, wiki:thunderherd} and I may use an existing approach for the idle-sleep heuristic in this project, \eg\cite{Karsten20}.428 There is already a wealth of research on the subject~\cite{schillings1996engineering, wiki:thunderherd} and I may use an existing approach for the idle-sleep heuristic in this project, \eg~\cite{Karsten20}. 420 429 421 430 \subsection{Asynchronous I/O} … … 432 441 an event-engine to (de)multiplex the operations, 433 442 \item 434 and a synchronous interface for users to use.443 and a synchronous interface for users. 435 444 \end{enumerate} 436 445 None of these components currently exist in \CFA and I will need to build all three for this project. 437 446 438 \paragraph{OS A bstraction}439 One fundamental part for converting blocking I/O operations into non-blocking onesis having an underlying asynchronous I/O interface to direct the I/O operations.447 \paragraph{OS Asynchronous Abstraction} 448 One fundamental part for converting blocking I/O operations into non-blocking is having an underlying asynchronous I/O interface to direct the I/O operations. 440 449 While there exists many different APIs for asynchronous I/O, it is not part of this proposal to create a novel API. 441 450 It is sufficient to make one work in the complex context of the \CFA runtime. 442 \uC uses the $select$ \cite{select} as its interface, which handles ttys, pipes and sockets, but not disk.451 \uC uses the $select$~\cite{select} as its interface, which handles ttys, pipes and sockets, but not disk. 443 452 $select$ entails significant complexity and is being replaced in UNIX operating systems, which make it a less interesting alternative. 444 Another popular interface is $epoll$ \cite{epoll}, which is supposed to be cheaper than $select$.445 However, $epoll$ also does not handle the file system and anecdotal evidence suggest it has problems with Linux pipes and $TTY$s.446 A popular cross-platform alternative is $libuv$ \cite{libuv}, which offers asynchronous sockets and asynchronous file system operations (among other features).453 Another popular interface is $epoll$~\cite{epoll}, which is supposed to be cheaper than $select$. 454 However, $epoll$ also does not handle the file system and anecdotal evidence suggest it has problems with Linux pipes and ttys. 455 A popular cross-platform alternative is $libuv$~\cite{libuv}, which offers asynchronous sockets and asynchronous file system operations (among other features). 447 456 However, as a full-featured library it includes much more than I need and could conflict with other features of \CFA unless significant effort is made to merge them together. 448 A very recent alternative that I am investigating is $io_uring$ \cite{io_uring}.457 A very recent alternative that I am investigating is $io_uring$~\cite{io_uring}. 449 458 It claims to address some of the issues with $epoll$ and my early investigating suggests that the claim is accurate. 450 $io_uring$ uses a much more general approach where system calls are registered to a queue and later executed by the kernel, rather than relying on system calls to return an error instead of blocking and subsequently waiting for changes on file descriptors.451 I believe this approach allows for fewer problems, \eg the manpage for $open$ \cite{open} states:459 $io_uring$ uses a much more general approach where system calls are registered to a queue and later executed by the kernel, rather than relying on system calls to support returning an error instead of blocking. 460 I believe this approach allows for fewer problems, \eg the manpage for $open$~\cite{open} states: 452 461 \begin{quote} 453 462 Note that [the $O_NONBLOCK$ flag] has no effect for regular files and block devices; … … 455 464 Since $O_NONBLOCK$ semantics might eventually be implemented, applications should not depend upon blocking behaviour when specifying this flag for regular files and block devices. 456 465 \end{quote} 457 This makes approach based on $epoll$/$select$ less reliable since they may not work for every file descriptors.458 For this reason, I plan to use $io_uring$ as the OS abstraction for the \CFA runtime unless further work shows problems I haven't encountered yet.459 However, only a small subset of the features are available in Ubuntu as of April 2020 \cite{wiki:ubuntu-linux}, which will limit performance comparisons.466 This makes approaches based on $select$/$epoll$ less reliable since they may not work for every file descriptors. 467 For this reason, I plan to use $io_uring$ as the OS abstraction for the \CFA runtime unless further work encounters a fatal problem. 468 However, only a small subset of the features are available in Ubuntu as of April 2020~\cite{wiki:ubuntu-linux}, which will limit performance comparisons. 460 469 I do not believe this will affect the comparison result. 461 470 462 471 \paragraph{Event Engine} 463 Laying on top of the asynchronous interface layeris the event engine.472 Above the OS asynchronous abstraction is the event engine. 464 473 This engine is responsible for multiplexing (batching) the synchronous I/O requests into asynchronous I/O requests and demultiplexing the results to appropriate blocked user threads. 465 474 This step can be straightforward for simple cases, but becomes quite complex when there are thousands of user threads performing both reads and writes, possibly on overlapping file descriptors. … … 478 487 The interface can be novel but it is preferable to match the existing POSIX interface when possible to be compatible with existing code. 479 488 Matching allows C programs written using this interface to be transparently converted to \CFA with minimal effort. 480 Where new functionality is needed, I will create a novel interfaceto fill gaps and provide advanced features.489 Where new functionality is needed, I will add novel interface extensions to fill gaps and provide advanced features. 481 490 482 491 … … 485 494 \section{Discussion} 486 495 I believe that runtime system and scheduling are still open topics. 487 Many ``state of the art'' production frameworks still use single-threaded event loops because of performance considerations, \eg \cite{nginx-design}, and, to my knowledge, no widely available system language offers modern threading facilities.496 Many ``state of the art'' production frameworks still use single-threaded event loops because of performance considerations, \eg~\cite{nginx-design}, and, to my knowledge, no widely available system language offers modern threading facilities. 488 497 I believe the proposed work offers a novel runtime and scheduling package, where existing work only offers fragments that users must assemble themselves when possible. 489 498 -
doc/theses/thierry_delisle_PhD/comp_II/img/system.fig
rae2c27a rc76bd34 1 #FIG 3.2 Produced by xfig version 3.2. 5c1 #FIG 3.2 Produced by xfig version 3.2.7b 2 2 Landscape 3 3 Center … … 36 36 1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 4500 3600 15 15 4500 3600 4515 3615 37 37 -6 38 6 3225 4125 4650 442539 6 4350 4200 4650 435040 1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 4425 4275 15 15 4425 4275 4440 429041 1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 4500 4275 15 15 4500 4275 4515 429042 1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 4575 4275 15 15 4575 4275 4590 429043 -644 1 1 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3450 4275 225 150 3450 4275 3675 442545 1 1 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4050 4275 225 150 4050 4275 4275 442546 -647 6 6675 4125 7500 442548 6 7200 4200 7500 435049 1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 7275 4275 15 15 7275 4275 7290 429050 1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 7350 4275 15 15 7350 4275 7365 429051 1 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 7425 4275 15 15 7425 4275 7440 429052 -653 1 1 0 1 -1 -1 0 0 -1 0.000 1 0.0000 6900 4275 225 150 6900 4275 7125 442554 -655 38 6 6675 3525 8025 3975 56 39 2 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 1 0 2 … … 79 62 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3975 2850 150 150 3975 2850 4125 2850 80 63 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 7200 2775 150 150 7200 2775 7350 2775 81 1 3 0 1 0 0 0 0 0 0.000 1 0.0000 2250 4830 30 30 2250 4830 2280 48 6064 1 3 0 1 0 0 0 0 0 0.000 1 0.0000 2250 4830 30 30 2250 4830 2280 4830 82 65 1 3 0 1 0 0 0 0 0 0.000 1 0.0000 7200 2775 30 30 7200 2775 7230 2805 83 66 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3525 3600 150 150 3525 3600 3675 3600 84 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3875 4800 100 100 3875 4800 3975 4800 85 1 1 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4650 4800 150 75 4650 4800 4800 4875 67 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4625 4838 100 100 4625 4838 4725 4838 86 68 2 2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 5 87 69 2400 4200 2400 3750 1950 3750 1950 4200 2400 4200 … … 153 135 1 1 1.00 45.00 90.00 154 136 7875 3750 7875 2325 7200 2325 7200 2550 137 2 2 1 1 -1 -1 0 0 -1 3.000 0 0 0 0 0 5 138 6975 4950 6750 4950 6750 4725 6975 4725 6975 4950 155 139 2 2 0 1 -1 -1 0 0 -1 0.000 0 0 0 0 0 5 156 140 5850 4950 5850 4725 5625 4725 5625 4950 5850 4950 157 2 2 1 1 -1 -1 0 0 -1 3.000 0 0 0 0 0 5 158 6975 4950 6750 4950 6750 4725 6975 4725 6975 4950 159 4 1 -1 0 0 0 10 0.0000 2 105 720 5550 4425 Processors\001 160 4 1 -1 0 0 0 10 0.0000 2 120 1005 4200 3225 Blocked Tasks\001 161 4 1 -1 0 0 0 10 0.0000 2 150 870 4200 3975 Ready Tasks\001 162 4 1 -1 0 0 0 10 0.0000 2 135 1095 7350 1725 Other Cluster(s)\001 163 4 1 -1 0 0 0 10 0.0000 2 105 840 4650 1725 User Cluster\001 164 4 1 -1 0 0 0 10 0.0000 2 150 615 2175 3675 Manager\001 165 4 1 -1 0 0 0 10 0.0000 2 105 990 2175 3525 Discrete-event\001 166 4 1 -1 0 0 0 10 0.0000 2 135 795 2175 4350 preemption\001 167 4 0 -1 0 0 0 10 0.0000 2 150 1290 2325 4875 generator/coroutine\001 168 4 0 -1 0 0 0 10 0.0000 2 120 270 4050 4875 task\001 169 4 0 -1 0 0 0 10 0.0000 2 105 450 7050 4875 cluster\001 170 4 0 -1 0 0 0 10 0.0000 2 105 660 5925 4875 processor\001 171 4 0 -1 0 0 0 10 0.0000 2 105 555 4875 4875 monitor\001 141 4 1 -1 0 0 0 10 0.0000 2 135 900 5550 4425 Processors\001 142 4 1 -1 0 0 0 10 0.0000 2 165 1170 4200 3975 Ready Threads\001 143 4 1 -1 0 0 0 10 0.0000 2 165 1440 7350 1725 Other Cluster(s)\001 144 4 1 -1 0 0 0 10 0.0000 2 135 1080 4650 1725 User Cluster\001 145 4 1 -1 0 0 0 10 0.0000 2 165 630 2175 3675 Manager\001 146 4 1 -1 0 0 0 10 0.0000 2 135 1260 2175 3525 Discrete-event\001 147 4 1 -1 0 0 0 10 0.0000 2 150 900 2175 4350 preemption\001 148 4 0 -1 0 0 0 10 0.0000 2 135 630 7050 4875 cluster\001 149 4 1 -1 0 0 0 10 0.0000 2 135 1350 4200 3225 Blocked Threads\001 150 4 0 -1 0 0 0 10 0.0000 2 135 540 4800 4875 thread\001 151 4 0 -1 0 0 0 10 0.0000 2 120 810 5925 4875 processor\001 152 4 0 -1 0 0 0 10 0.0000 2 165 1710 2325 4875 generator/coroutine\001 -
doc/user/Makefile
rae2c27a rc76bd34 55 55 56 56 ${DOCUMENT} : ${BASE}.ps 57 ps2pdf $<57 ps2pdf -dPDFSETTINGS=/prepress $< 58 58 59 59 ${BASE}.ps : ${BASE}.dvi -
doc/user/user.tex
rae2c27a rc76bd34 11 11 %% Created On : Wed Apr 6 14:53:29 2016 12 12 %% Last Modified By : Peter A. Buhr 13 %% Last Modified On : Fri Mar 6 13:34:52202014 %% Update Count : 39 2413 %% Last Modified On : Mon Oct 5 08:57:29 2020 14 %% Update Count : 3998 15 15 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 16 16 … … 30 30 \usepackage{upquote} % switch curled `'" to straight 31 31 \usepackage{calc} 32 \usepackage{xspace}33 32 \usepackage{varioref} % extended references 34 \usepackage{listings} % format program code 33 \usepackage[labelformat=simple,aboveskip=0pt,farskip=0pt]{subfig} 34 \renewcommand{\thesubfigure}{\alph{subfigure})} 35 35 \usepackage[flushmargin]{footmisc} % support label/reference in footnote 36 36 \usepackage{latexsym} % \Box glyph 37 37 \usepackage{mathptmx} % better math font with "times" 38 38 \usepackage[usenames]{color} 39 \input{common} % common CFA document macros 40 \usepackage[dvips,plainpages=false,pdfpagelabels,pdfpagemode=UseNone,colorlinks=true,pagebackref=true,linkcolor=blue,citecolor=blue,urlcolor=blue,pagebackref=true,breaklinks=true]{hyperref} 41 \usepackage{breakurl} 42 43 \usepackage[pagewise]{lineno} 44 \renewcommand{\linenumberfont}{\scriptsize\sffamily} 45 \usepackage[firstpage]{draftwatermark} 46 \SetWatermarkLightness{0.9} 47 48 % Default underscore is too low and wide. Cannot use lstlisting "literate" as replacing underscore 49 % removes it as a variable-name character so keywords in variables are highlighted. MUST APPEAR 50 % AFTER HYPERREF. 51 \renewcommand{\textunderscore}{\leavevmode\makebox[1.2ex][c]{\rule{1ex}{0.075ex}}} 52 53 \setlength{\topmargin}{-0.45in} % move running title into header 54 \setlength{\headsep}{0.25in} 55 56 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 57 58 \CFAStyle % use default CFA format-style 59 \lstnewenvironment{C++}[1][] % use C++ style 60 {\lstset{language=C++,moredelim=**[is][\protect\color{red}]{®}{®},#1}} 61 {} 62 39 \newcommand{\CFALatin}{} 63 40 % inline code ©...© (copyright symbol) emacs: C-q M-) 64 41 % red highlighting ®...® (registered trademark symbol) emacs: C-q M-. … … 68 45 % keyword escape ¶...¶ (pilcrow symbol) emacs: C-q M-^ 69 46 % math escape $...$ (dollar symbol) 47 \input{common} % common CFA document macros 48 \usepackage[dvips,plainpages=false,pdfpagelabels,pdfpagemode=UseNone,colorlinks=true,pagebackref=true,linkcolor=blue,citecolor=blue,urlcolor=blue,pagebackref=true,breaklinks=true]{hyperref} 49 \usepackage{breakurl} 50 51 \renewcommand\footnoterule{\kern -3pt\rule{0.3\linewidth}{0.15pt}\kern 2pt} 52 53 \usepackage[pagewise]{lineno} 54 \renewcommand{\linenumberfont}{\scriptsize\sffamily} 55 \usepackage[firstpage]{draftwatermark} 56 \SetWatermarkLightness{0.9} 57 58 % Default underscore is too low and wide. Cannot use lstlisting "literate" as replacing underscore 59 % removes it as a variable-name character so keywords in variables are highlighted. MUST APPEAR 60 % AFTER HYPERREF. 61 \renewcommand{\textunderscore}{\leavevmode\makebox[1.2ex][c]{\rule{1ex}{0.075ex}}} 62 63 \setlength{\topmargin}{-0.45in} % move running title into header 64 \setlength{\headsep}{0.25in} 65 66 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 67 68 \CFAStyle % use default CFA format-style 69 \lstnewenvironment{C++}[1][] % use C++ style 70 {\lstset{language=C++,moredelim=**[is][\protect\color{red}]{®}{®},#1}} 71 {} 72 73 \newsavebox{\myboxA} 74 \newsavebox{\myboxB} 70 75 71 76 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% … … 79 84 \newcommand{\G}[1]{{\Textbf[OliveGreen]{#1}}} 80 85 \newcommand{\KWC}{K-W C\xspace} 81 82 \newsavebox{\LstBox}83 86 84 87 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% … … 253 256 254 257 The signature feature of \CFA is \emph{\Index{overload}able} \Index{parametric-polymorphic} functions~\cite{forceone:impl,Cormack90,Duggan96} with functions generalized using a ©forall© clause (giving the language its name): 255 \begin{ lstlisting}258 \begin{cfa} 256 259 ®forall( otype T )® T identity( T val ) { return val; } 257 260 int forty_two = identity( 42 ); §\C{// T is bound to int, forty\_two == 42}§ 258 \end{ lstlisting}261 \end{cfa} 259 262 % extending the C type system with parametric polymorphism and overloading, as opposed to the \Index*[C++]{\CC{}} approach of object-oriented extensions. 260 263 \CFA{}\hspace{1pt}'s polymorphism was originally formalized by \Index*{Glen Ditchfield}\index{Ditchfield, Glen}~\cite{Ditchfield92}, and first implemented by \Index*{Richard Bilson}\index{Bilson, Richard}~\cite{Bilson03}. … … 275 278 \begin{comment} 276 279 A simple example is leveraging the existing type-unsafe (©void *©) C ©bsearch© to binary search a sorted floating array: 277 \begin{ lstlisting}280 \begin{cfa} 278 281 void * bsearch( const void * key, const void * base, size_t dim, size_t size, 279 282 int (* compar)( const void *, const void * )); … … 284 287 double key = 5.0, vals[10] = { /* 10 sorted floating values */ }; 285 288 double * val = (double *)bsearch( &key, vals, 10, sizeof(vals[0]), comp ); §\C{// search sorted array}§ 286 \end{ lstlisting}289 \end{cfa} 287 290 which can be augmented simply with a polymorphic, type-safe, \CFA-overloaded wrappers: 288 \begin{ lstlisting}291 \begin{cfa} 289 292 forall( otype T | { int ?<?( T, T ); } ) T * bsearch( T key, const T * arr, size_t size ) { 290 293 int comp( const void * t1, const void * t2 ) { /* as above with double changed to T */ } … … 297 300 double * val = bsearch( 5.0, vals, 10 ); §\C{// selection based on return type}§ 298 301 int posn = bsearch( 5.0, vals, 10 ); 299 \end{ lstlisting}302 \end{cfa} 300 303 The nested function ©comp© provides the hidden interface from typed \CFA to untyped (©void *©) C, plus the cast of the result. 301 304 Providing a hidden ©comp© function in \CC is awkward as lambdas do not use C calling-conventions and template declarations cannot appear at block scope. … … 305 308 \CFA has replacement libraries condensing hundreds of existing C functions into tens of \CFA overloaded functions, all without rewriting the actual computations. 306 309 For example, it is possible to write a type-safe \CFA wrapper ©malloc© based on the C ©malloc©: 307 \begin{ lstlisting}310 \begin{cfa} 308 311 forall( dtype T | sized(T) ) T * malloc( void ) { return (T *)malloc( sizeof(T) ); } 309 312 int * ip = malloc(); §\C{// select type and size from left-hand side}§ 310 313 double * dp = malloc(); 311 314 struct S {...} * sp = malloc(); 312 \end{ lstlisting}315 \end{cfa} 313 316 where the return type supplies the type/size of the allocation, which is impossible in most type systems. 314 317 \end{comment} … … 943 946 the same level as a ©case© clause; the target label may be case ©default©, but only associated 944 947 with the current ©switch©/©choose© statement. 945 946 947 \subsection{Loop Control}948 949 The ©for©/©while©/©do-while© loop-control allows empty or simplified ranges (see Figure~\ref{f:LoopControlExamples}).950 \begin{itemize}951 \item952 The loop index is polymorphic in the type of the comparison value N (when the start value is implicit) or the start value M.953 \item954 An empty conditional implies comparison value of ©1© (true).955 \item956 A comparison N is implicit up-to exclusive range [0,N©®)®©.957 \item958 A comparison ©=© N is implicit up-to inclusive range [0,N©®]®©.959 \item960 The up-to range M ©~©\index{~@©~©} N means exclusive range [M,N©®)®©.961 \item962 The up-to range M ©~=©\index{~=@©~=©} N means inclusive range [M,N©®]®©.963 \item964 The down-to range M ©-~©\index{-~@©-~©} N means exclusive range [N,M©®)®©.965 \item966 The down-to range M ©-~=©\index{-~=@©-~=©} N means inclusive range [N,M©®]®©.967 \item968 ©0© is the implicit start value;969 \item970 ©1© is the implicit increment value.971 \item972 The up-to range uses operator ©+=© for increment;973 \item974 The down-to range uses operator ©-=© for decrement.975 \item976 ©@© means put nothing in this field.977 \item978 ©:© means start another index.979 \end{itemize}980 948 981 949 \begin{figure} … … 1086 1054 1087 1055 1056 \subsection{Loop Control} 1057 1058 The ©for©/©while©/©do-while© loop-control allows empty or simplified ranges (see Figure~\ref{f:LoopControlExamples}). 1059 \begin{itemize} 1060 \item 1061 The loop index is polymorphic in the type of the comparison value N (when the start value is implicit) or the start value M. 1062 \item 1063 An empty conditional implies comparison value of ©1© (true). 1064 \item 1065 A comparison N is implicit up-to exclusive range [0,N©®)®©. 1066 \item 1067 A comparison ©=© N is implicit up-to inclusive range [0,N©®]®©. 1068 \item 1069 The up-to range M ©~©\index{~@©~©} N means exclusive range [M,N©®)®©. 1070 \item 1071 The up-to range M ©~=©\index{~=@©~=©} N means inclusive range [M,N©®]®©. 1072 \item 1073 The down-to range M ©-~©\index{-~@©-~©} N means exclusive range [N,M©®)®©. 1074 \item 1075 The down-to range M ©-~=©\index{-~=@©-~=©} N means inclusive range [N,M©®]®©. 1076 \item 1077 ©0© is the implicit start value; 1078 \item 1079 ©1© is the implicit increment value. 1080 \item 1081 The up-to range uses operator ©+=© for increment; 1082 \item 1083 The down-to range uses operator ©-=© for decrement. 1084 \item 1085 ©@© means put nothing in this field. 1086 \item 1087 ©:© means start another index. 1088 \end{itemize} 1089 1090 1088 1091 %\subsection{\texorpdfstring{Labelled \protect\lstinline@continue@ / \protect\lstinline@break@}{Labelled continue / break}} 1089 1092 \subsection{\texorpdfstring{Labelled \LstKeywordStyle{continue} / \LstKeywordStyle{break} Statement}{Labelled continue / break Statement}} … … 1095 1098 for ©break©, the target label can also be associated with a ©switch©, ©if© or compound (©{}©) statement. 1096 1099 \VRef[Figure]{f:MultiLevelExit} shows ©continue© and ©break© indicating the specific control structure, and the corresponding C program using only ©goto© and labels. 1097 The innermost loop has 7exit points, which cause continuation or termination of one or more of the 7 \Index{nested control-structure}s.1100 The innermost loop has 8 exit points, which cause continuation or termination of one or more of the 7 \Index{nested control-structure}s. 1098 1101 1099 1102 \begin{figure} 1100 \begin{tabular}{@{\hspace{\parindentlnth}}l@{\hspace{\parindentlnth}}l@{\hspace{\parindentlnth}}l@{}} 1101 \multicolumn{1}{@{\hspace{\parindentlnth}}c@{\hspace{\parindentlnth}}}{\textbf{\CFA}} & \multicolumn{1}{@{\hspace{\parindentlnth}}c}{\textbf{C}} \\ 1102 \begin{cfa} 1103 ®LC:® { 1104 ... §declarations§ ... 1105 ®LS:® switch ( ... ) { 1106 case 3: 1107 ®LIF:® if ( ... ) { 1108 ®LF:® for ( ... ) { 1109 ®LW:® while ( ... ) { 1110 ... break ®LC®; ... 1111 ... break ®LS®; ... 1112 ... break ®LIF®; ... 1113 ... continue ®LF;® ... 1114 ... break ®LF®; ... 1115 ... continue ®LW®; ... 1116 ... break ®LW®; ... 1117 } // while 1118 } // for 1119 } else { 1120 ... break ®LIF®; ... 1121 } // if 1122 } // switch 1103 \centering 1104 \begin{lrbox}{\myboxA} 1105 \begin{cfa}[tabsize=3] 1106 ®Compound:® { 1107 ®Try:® try { 1108 ®For:® for ( ... ) { 1109 ®While:® while ( ... ) { 1110 ®Do:® do { 1111 ®If:® if ( ... ) { 1112 ®Switch:® switch ( ... ) { 1113 case 3: 1114 ®break Compound®; 1115 ®break Try®; 1116 ®break For®; /* or */ ®continue For®; 1117 ®break While®; /* or */ ®continue While®; 1118 ®break Do®; /* or */ ®continue Do®; 1119 ®break If®; 1120 ®break Switch®; 1121 } // switch 1122 } else { 1123 ... ®break If®; ... // terminate if 1124 } // if 1125 } while ( ... ); // do 1126 } // while 1127 } // for 1128 } ®finally® { // always executed 1129 } // try 1123 1130 } // compound 1124 1131 \end{cfa} 1125 & 1126 \begin{cfa} 1132 \end{lrbox} 1133 1134 \begin{lrbox}{\myboxB} 1135 \begin{cfa}[tabsize=3] 1127 1136 { 1128 ... §declarations§ ... 1129 switch ( ... ) { 1130 case 3: 1131 if ( ... ) { 1132 for ( ... ) { 1133 while ( ... ) { 1134 ... goto ®LC®; ... 1135 ... goto ®LS®; ... 1136 ... goto ®LIF®; ... 1137 ... goto ®LFC®; ... 1138 ... goto ®LFB®; ... 1139 ... goto ®LWC®; ... 1140 ... goto ®LWB®; ... 1141 ®LWC®: ; } ®LWB:® ; 1142 ®LFC:® ; } ®LFB:® ; 1143 } else { 1144 ... goto ®LIF®; ... 1145 } ®L3:® ; 1146 } ®LS:® ; 1147 } ®LC:® ; 1148 \end{cfa} 1149 & 1150 \begin{cfa} 1151 1152 1153 1154 1155 1156 1157 1158 // terminate compound 1159 // terminate switch 1160 // terminate if 1161 // continue loop 1162 // terminate loop 1163 // continue loop 1164 // terminate loop 1165 1166 1167 1168 // terminate if 1169 1170 1171 1172 \end{cfa} 1173 \end{tabular} 1137 1138 ®ForC:® for ( ... ) { 1139 ®WhileC:® while ( ... ) { 1140 ®DoC:® do { 1141 if ( ... ) { 1142 switch ( ... ) { 1143 case 3: 1144 ®goto Compound®; 1145 ®goto Try®; 1146 ®goto ForB®; /* or */ ®goto ForC®; 1147 ®goto WhileB®; /* or */ ®goto WhileC®; 1148 ®goto DoB®; /* or */ ®goto DoC®; 1149 ®goto If®; 1150 ®goto Switch®; 1151 } ®Switch:® ; 1152 } else { 1153 ... ®goto If®; ... // terminate if 1154 } ®If:®; 1155 } while ( ... ); ®DoB:® ; 1156 } ®WhileB:® ; 1157 } ®ForB:® ; 1158 1159 1160 } ®Compound:® ; 1161 \end{cfa} 1162 \end{lrbox} 1163 1164 \subfloat[\CFA]{\label{f:CFibonacci}\usebox\myboxA} 1165 \hspace{2pt} 1166 \vrule 1167 \hspace{2pt} 1168 \subfloat[C]{\label{f:CFAFibonacciGen}\usebox\myboxB} 1174 1169 \caption{Multi-level Exit} 1175 1170 \label{f:MultiLevelExit} … … 1426 1421 try { 1427 1422 f(...); 1428 } catch( E e ; §boolean-predicate§ ) { §\C [8cm]{// termination handler}§1423 } catch( E e ; §boolean-predicate§ ) { §\C{// termination handler}§ 1429 1424 // recover and continue 1430 } catchResume( E e ; §boolean-predicate§ ) { §\C{// resumption handler} \CRT§1425 } catchResume( E e ; §boolean-predicate§ ) { §\C{// resumption handler}§ 1431 1426 // repair and return 1432 1427 } finally { … … 3491 3486 For implicit formatted input, the common case is reading a sequence of values separated by whitespace, where the type of an input constant must match with the type of the input variable. 3492 3487 \begin{cquote} 3493 \begin{lrbox}{\ LstBox}3488 \begin{lrbox}{\myboxA} 3494 3489 \begin{cfa}[aboveskip=0pt,belowskip=0pt] 3495 3490 int x; double y char z; … … 3497 3492 \end{lrbox} 3498 3493 \begin{tabular}{@{}l@{\hspace{3em}}l@{\hspace{3em}}l@{}} 3499 \multicolumn{1}{@{}l@{}}{\usebox\ LstBox} \\3494 \multicolumn{1}{@{}l@{}}{\usebox\myboxA} \\ 3500 3495 \multicolumn{1}{c@{\hspace{2em}}}{\textbf{\CFA}} & \multicolumn{1}{c@{\hspace{2em}}}{\textbf{\CC}} & \multicolumn{1}{c}{\textbf{Python}} \\ 3501 3496 \begin{cfa}[aboveskip=0pt,belowskip=0pt] … … 6672 6667 For example, an initial alignment and fill capability are preserved during a resize copy so the copy has the same alignment and extended storage is filled. 6673 6668 Without sticky properties it is dangerous to use ©realloc©, resulting in an idiom of manually performing the reallocation to maintain correctness. 6669 \begin{cfa} 6670 6671 \end{cfa} 6674 6672 6675 6673 \CFA memory management extends allocation to support constructors for initialization of allocated storage, \eg in … … 6721 6719 6722 6720 // §\CFA§ safe general allocation, fill, resize, alignment, array 6723 T * alloc( void );§\indexc{alloc}§ 6724 T * alloc( size_t dim ); 6725 T * alloc( T ptr[], size_t dim ); 6726 T * alloc_set( char fill );§\indexc{alloc_set}§ 6727 T * alloc_set( T fill ); 6728 T * alloc_set( size_t dim, char fill ); 6729 T * alloc_set( size_t dim, T fill ); 6730 T * alloc_set( size_t dim, const T fill[] ); 6731 T * alloc_set( T ptr[], size_t dim, char fill ); 6732 6733 T * alloc_align( size_t align ); 6734 T * alloc_align( size_t align, size_t dim ); 6735 T * alloc_align( T ptr[], size_t align ); // aligned realloc array 6736 T * alloc_align( T ptr[], size_t align, size_t dim ); // aligned realloc array 6737 T * alloc_align_set( size_t align, char fill ); 6738 T * alloc_align_set( size_t align, T fill ); 6739 T * alloc_align_set( size_t align, size_t dim, char fill ); 6740 T * alloc_align_set( size_t align, size_t dim, T fill ); 6741 T * alloc_align_set( size_t align, size_t dim, const T fill[] ); 6742 T * alloc_align_set( T ptr[], size_t align, size_t dim, char fill ); 6721 T * alloc( void );§\indexc{alloc}§ §\C[3.5in]{// variable, T size}§ 6722 T * alloc( size_t dim ); §\C{// array[dim], T size elements}§ 6723 T * alloc( T ptr[], size_t dim ); §\C{// realloc array[dim], T size elements}§ 6724 6725 T * alloc_set( char fill );§\indexc{alloc_set}§ §\C{// variable, T size, fill bytes with value}§ 6726 T * alloc_set( T fill ); §\C{// variable, T size, fill with value}§ 6727 T * alloc_set( size_t dim, char fill ); §\C{// array[dim], T size elements, fill bytes with value}§ 6728 T * alloc_set( size_t dim, T fill ); §\C{// array[dim], T size elements, fill elements with value}§ 6729 T * alloc_set( size_t dim, const T fill[] ); §\C{// array[dim], T size elements, fill elements with array}§ 6730 T * alloc_set( T ptr[], size_t dim, char fill ); §\C{// realloc array[dim], T size elements, fill bytes with value}§ 6731 6732 T * alloc_align( size_t align ); §\C{// aligned variable, T size}§ 6733 T * alloc_align( size_t align, size_t dim ); §\C{// aligned array[dim], T size elements}§ 6734 T * alloc_align( T ptr[], size_t align ); §\C{// realloc new aligned array}§ 6735 T * alloc_align( T ptr[], size_t align, size_t dim ); §\C{// realloc new aligned array[dim]}§ 6736 6737 T * alloc_align_set( size_t align, char fill ); §\C{// aligned variable, T size, fill bytes with value}§ 6738 T * alloc_align_set( size_t align, T fill ); §\C{// aligned variable, T size, fill with value}§ 6739 T * alloc_align_set( size_t align, size_t dim, char fill ); §\C{// aligned array[dim], T size elements, fill bytes with value}§ 6740 T * alloc_align_set( size_t align, size_t dim, T fill ); §\C{// aligned array[dim], T size elements, fill elements with value}§ 6741 T * alloc_align_set( size_t align, size_t dim, const T fill[] ); §\C{// aligned array[dim], T size elements, fill elements with array}§ 6742 T * alloc_align_set( T ptr[], size_t align, size_t dim, char fill ); §\C{// realloc new aligned array[dim], fill new bytes with value}§ 6743 6743 6744 6744 // §\CFA§ safe initialization/copy, i.e., implicit size specification
Note:
See TracChangeset
for help on using the changeset viewer.