Changeset 53bb8f1 for doc


Ignore:
Timestamp:
Mar 12, 2019, 3:00:54 PM (3 years ago)
Author:
Aaron Moss <a3moss@…>
Branches:
aaron-thesis, arm-eh, cleanup-dtors, jacob/cs343-translation, jenkins-sandbox, master, new-ast, new-ast-unique-expr
Children:
30e32b2, a2545593
Parents:
9d9a451 (diff), 91d6584 (diff)
Note: this is a merge changeset, the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.
Message:

Merge second draft of Aaron's thesis

Location:
doc
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • doc/LaTeXmacros/lstlang.sty

    r9d9a451 r53bb8f1  
    88%% Created On       : Sat May 13 16:34:42 2017
    99%% Last Modified By : Peter A. Buhr
    10 %% Last Modified On : Fri Apr  6 23:44:50 2018
    11 %% Update Count     : 20
     10%% Last Modified On : Tue Jan  8 14:40:33 2019
     11%% Update Count     : 21
    1212%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    1313
     
    114114                _Alignas, _Alignof, __alignof, __alignof__, asm, __asm, __asm__, __attribute, __attribute__,
    115115                auto, _Bool, catch, catchResume, choose, _Complex, __complex, __complex__, __const, __const__,
    116                 coroutine, disable, dtype, enable, __extension__, exception, fallthrough, fallthru, finally,
     116                coroutine, disable, dtype, enable, exception, __extension__, fallthrough, fallthru, finally,
    117117                __float80, float80, __float128, float128, forall, ftype, _Generic, _Imaginary, __imag, __imag__,
    118118                inline, __inline, __inline__, __int128, int128, __label__, monitor, mutex, _Noreturn, one_t, or,
  • doc/bibliography/pl.bib

    r9d9a451 r53bb8f1  
    330330    contributer = {pabuhr@plg},
    331331    author      = {Nissim Francez},
    332     title       = {Another Advantage of Key word Notation for Parameter Communication with Subprograms},
     332    title       = {Another Advantage of Keyword Notation for Parameter Communication with Subprograms},
    333333    journal     = cacm,
    334334    volume      = 20,
     
    831831    year        = 2015,
    832832    howpublished= {\href{http://www.boost.org/doc/libs/1_61_0/libs/coroutine/doc/html/index.html}
    833                   {{http://www.boost.org/\-doc/\-libs/1\_61\_0/\-libs/\-coroutine/\-doc/\-html/\-index.html}}},
    834     optnote     = {Accessed: 2016-09},
     833                  {http://www.boost.org/\-doc/\-libs/1\_61\_0/\-libs/\-coroutine/\-doc/\-html/\-index.html}},
     834}
     835
     836@misc{BoostThreads,
     837    keywords    = {Boost Thread Library},
     838    contributer = {pabuhr@plg},
     839    author      = {Anthony Williams and Vicente J. Botet Escriba},
     840    title       = {Boost Thread Library},
     841    year        = 2015,
     842    howpublished= {\href{https://www.boost.org/doc/libs/1_61_0/doc/html/thread.html}
     843                  {https://\-www.boost.org/\-doc/\-libs/\-1\_61\_0/\-doc/\-html/\-thread.html}},
    835844}
    836845
     
    939948    author      = {{\textsf{C}{$\mathbf{\forall}$} Features}},
    940949    howpublished= {\href{https://plg.uwaterloo.ca/~cforall/features}{https://\-plg.uwaterloo.ca/\-$\sim$cforall/\-features}},
    941     optnote     = {Accessed: 2018-01-01},
    942950}
    943951
     
    959967    year        = 2018,
    960968    howpublished= {\href{https://cforall.uwaterloo.ca/CFAStackEvaluation.zip}{https://cforall.uwaterloo.ca/\-CFAStackEvaluation.zip}},
    961     optnote     = {[Accessed May 2018]},
    962969}
    963970
     
    11341141}
    11351142
    1136 @Inproceedings{Tarditi18,
    1137     keywords = {Checked C},
    1138     contributer = {a3moss@uwaterloo.ca},
    1139     author = {Tarditi, David and Elliott, Archibald Samuel and Ruef, Andrew and Hicks, Michael},
    1140     title = {Checked C: Making C Safe by Extension},
     1143@inproceedings{Tarditi18,
     1144    keywords    = {Checked C},
     1145    contributer = {a3moss@uwaterloo.ca},
     1146    author      = {Tarditi, David and Elliott, Archibald Samuel and Ruef, Andrew and Hicks, Michael},
     1147    title       = {Checked C: Making C Safe by Extension},
     1148    booktitle   = {2018 IEEE Cybersecurity Development (SecDev)}
    11411149    year = {2018},
    11421150    month = {September},
     1151    pages = {53-60},
    11431152    publisher = {IEEE},
    11441153    url = {https://www.microsoft.com/en-us/research/publication/checkedc-making-c-safe-by-extension/},
    1145     pages = {53-60},
    11461154}
    11471155
     
    13141322    journal     = sigplan,
    13151323    year        = 1986,
    1316     month       = oct, volume = 21, number = 10, pages = {19-28},
     1324    month       = oct,
     1325    volume      = 21,
     1326    number      = 10,
     1327    pages       = {19-28},
    13171328    note        = {Object Oriented Programming Workshop}
    13181329}
     
    14791490    title       = {concurrent-locking},
    14801491    howpublished= {\href{https://github.com/pabuhr/concurrent-locking}{https://\-github.com/\-pabuhr/\-concurrent-locking}},
    1481     optnote     = {[Accessed April 2017]},
    14821492}
    14831493
     
    17671777    howpublished= {\href{https://www.airs.com/blog/archives/428}
    17681778                  {https://www.airs.com/\-blog/\-archives/\-428}},
    1769     optnote     = {Accessed: 2018-05},
    17701779}
    17711780
     
    29562965    year        = 2014,
    29572966    howpublished= {\href{https://gcc.gnu.org/onlinedocs/gcc-4.7.2/gcc/C-Extensions.html}{https://\-gcc.gnu.org/\-onlinedocs/\-gcc-4.7.2/\-gcc/\-C\-Extensions.html}},
    2958     optnote     = {Accessed: 2017-04-02},
    29592967}
    29602968
     
    30403048}
    30413049
     3050@manual{WindowsFibers,
     3051    keywords    = {threads, fibers},
     3052    contributer = {pabuhr@plg},
     3053    author      = {Windows},
     3054    title       = {Fibers},
     3055    organization= {Microsoft, Windows Development Center},
     3056    address     = {\href{https://docs.microsoft.com/en-us/windows/desktop/ProcThread/fibers}{https://\-docs.microsoft.com/\-en-us/\-windows/\-desktop/\-ProcThread/\-fibers}},
     3057    year        = 2018,
     3058}
     3059
    30423060@inproceedings{F-bound,
    30433061    keywords    = {},
     
    30873105}
    30883106
     3107@manual{Folly,
     3108    keywords    = {Folly},
     3109    contributer = {pabuhr@plg},
     3110    author      = {Folly},
     3111    title       = {Facebook Open-source Library},
     3112    organization= {Facebook},
     3113    address     = {\href{https://github.com/facebook/folly}{https://\-github.com/\-facebook/\-folly}},
     3114    year        = 2018,
     3115}
     3116
    30893117@manual{Fortran95,
    30903118    keywords    = {Fortran 95},
     
    31073135    address     = {\href{https://www.iso.org/standard/50459.html}{https://\-www.iso.org/\-standard/\-50459.html}},
    31083136    year        = 2010,
     3137}
     3138
     3139@manual{Fortran18,
     3140    keywords    = {ISO/IEC Fortran 10},
     3141    contributer = {pabuhr@plg},
     3142    author      = {Fortran18},
     3143    title       = {Programming Languages -- {Fortran} Part 1:Base Language ISO/IEC 1539-1:2018},
     3144    edition     = {4rd},
     3145    publisher   = {International Standard Organization},
     3146    address     = {\href{https://www.iso.org/standard/72320.html}{https://\-www.iso.org/\-standard/\-72320.html}},
     3147    year        = 2018,
    31093148}
    31103149
     
    33563395    year        = 2014,
    33573396    howpublished= {https://developer.gnome.org/gobject/stable/},
    3358     optnote     = {Accessed: 2017-04},
    33593397}
    33603398
     
    36713709    year        = {1964},
    36723710    publisher   = {ACM}
     3711}
     3712
     3713@phdthesis{Barghi18,
     3714    keywords    = {concurrency, user threads, actors},
     3715    contributer = {pabuhr@plg},
     3716    author      = {Saman Barghi},
     3717    title       = {Improving the Performance of User-level Runtime Systems for Concurrent Applications},
     3718    school      = {School of Computer Science, University of Waterloo},
     3719    year        = 2018,
     3720    month       = sep,
     3721    optaddress  = {Waterloo, Ontario, Canada, N2L 3G1},
     3722    note        = {\href{https://uwspace.uwaterloo.ca/handle/10012/13935}{https://\-uwspace.uwaterloo.ca/\-handle/\-10012/\-13935}},
    36733723}
    36743724
     
    39984048    year        = 2015,
    39994049    edition     = {{J}ava {SE} 8},
     4050}
     4051
     4052@manual{Java11,
     4053    keywords    = {Java SE 11},
     4054    contributer = {pabuhr@plg},
     4055    author      = {James Gosling and Bill Joy and Guy Steele and Gilad Bracha and Alex Buckley and Daniel Smith},
     4056    title       = {{Java} Language Specification},
     4057    publisher   = {Oracle},
     4058    month       = sep,
     4059    year        = 2018,
     4060    edition     = {{J}ava {SE} 11},
     4061}
     4062
     4063@manual{JDK1.1,
     4064    keywords    = {JDK 1.1},
     4065    contributer = {pabuhr@plg},
     4066    author      = {{Multithreading Models}},
     4067    title       = {JDK 1.1 for Solaris Developer's Guide},
     4068    publisher   = {Oracle},
     4069    address     = {\href{https://docs.oracle.com/cd/E19455-01/806-3461/6jck06gqk/index.html#ch2mt-41}{https://\-docs.oracle.com/\-cd/\-E19455-01/\-806-3461/\-6jck06gqk/\-index.html\#ch2mt-41}},
     4070    year        = 2010,
    40004071}
    40014072
     
    41794250}
    41804251
     4252@manual{libmill,
     4253    keywords    = {libmill},
     4254    contributer = {pabuhr@plg},
     4255    author      = {libmill},
     4256    title       = {{G}o-style concurrency in {C}, Version 1.18},
     4257    organization= {libmill},
     4258    address     = {\href{http://libmill.org/documentation.html}{http://\-libmill.org/\-documentation.html}},
     4259    month       = jan,
     4260    year        = 2017,
     4261}
     4262
    41814263@book{Weissman67,
    41824264    keywords    = {lisp},
     
    42244306    pages       = {161-169},
    42254307    note        = {Proceedings of the {SIGPLAN}~'89 Conference on Programming Language Design and Implementation}
     4308}
     4309
     4310@manual{Lua,
     4311    keywords    = {Lua},
     4312    contributer = {pabuhr@plg},
     4313    author      = {Lua},
     4314    title       = {Lua 5.3 Reference Manual},
     4315    address     = {\href{https://www.lua.org/manual/5.3}{https://\-www.lua.org/\-manual/\-5.3}},
     4316    year        = 2018,
    42264317}
    42274318
     
    45664657}
    45674658%    editor     = {Allen Kent and James G. Williams},
     4659
     4660@incollection{MPC,
     4661    keywords    = {user-level threading},
     4662    contributer = {pabuhr@plg},
     4663    author      = {Marc P\'erache and Herv\'e Jourdren and Raymond Namyst},
     4664    title       = {MPC: A Unified Parallel Runtime for Clusters of {NUMA} Machines},
     4665    booktitle   = {Euro-Par 2008},
     4666    pages       = {329-342},
     4667    publisher   = {Springer},
     4668    address     = {Berlin, Heidelberg},
     4669    year        = 2008,
     4670    volume      = 5168,
     4671    series      = {Lecture Notes in Computer Science},
     4672}
    45684673
    45694674@manual{MPI,
     
    49925097    year        = 2014,
    49935098    howpublished= {\href{https://developer.apple.com/library/archive/documentation/Cocoa/Conceptual/ProgrammingWithObjectiveC}{https://\-developer.apple.com/\-library/archive/\-documentation/\-Cocoa/\-Conceptual/\-ProgrammingWithObjectiveC}},
    4994     optnote     = {Accessed: 2018-03}
    49955099}
    49965100
     
    50025106    year        = 2015,
    50035107    howpublished= {\href{https://developer.apple.com/library/content/documentation/Xcode/Conceptual/RN-Xcode-Archive/Chapters/xc7_release_notes.html}{https://\-developer.apple.com/\-library/\-content/\-documentation/\-Xcode/\-Conceptual/\-RN-Xcode-Archive/\-Chapters/\-xc7\_release\_notes.html}},
    5004     optnote     = {Accessed: 2017-04}
    50055108}
    50065109
     
    55155618    year        = 2012,
    55165619    howpublished= {\href{http://cs.brown.edu/research/pubs/theses/masters/2012/verch.pdf}{http://cs.brown.edu/\-research/\-pubs/\-theses/\-masters/\-2012/\-verch.pdf}},
    5517     optnote     = {Accessed: 2013-10-4}
    55185620}
    55195621
     
    58395941    address     = {\href{https://www.iso.org/standard/64029.html}{https://\-www.iso.org/\-standard/\-64029.html}},
    58405942    year        = 2014,
     5943}
     5944
     5945@manual{C++17,
     5946    keywords    = {ISO/IEC C++ 17},
     5947    contributer = {pabuhr@plg},
     5948    key         = {C++17},
     5949    title       = {{C}{\kern-.1em\hbox{\large\texttt{+\kern-.25em+}}} Programming Language ISO/IEC 14882:2017},
     5950    edition     = {5th},
     5951    publisher   = {International Standard Organization},
     5952    address     = {\href{https://www.iso.org/standard/68564.html}{https://\-www.iso.org/\-standard/\-68564.html}},
     5953    year        = 2017,
    58415954}
    58425955
     
    59926105    institution = {Carnegie Mellon University},
    59936106    year        = 1991,
    5994     month       = feb, number = "CMU-CS-91-106",
     6107    month       = feb,
     6108    number      = {CMU-CS-91-106},
    59956109    annote      = {
    59966110        Discusses a typed lambda calculus with
     
    60496163    journal     = sigplan,
    60506164    year        = 1988,
    6051     month       = jul, volume = 23, number = 7, pages = {260-267},
    6052     note        = {Proceedings of the SIGPLAN '88 Conference on Programming Language
    6053          Design and Implementation},
     6165    month       = jul,
     6166    volume      = 23,
     6167    number      = 7,
     6168    pages       = {260-267},
     6169    note        = {Proceedings of the SIGPLAN '88 Conference on Programming Language Design and Implementation},
    60546170    abstract    = {
    60556171        This paper deals with the integration of an efficient asynchronous
     
    61016217}
    61026218
     6219@misc{Pthreads,
     6220    keywords    = {pthreads, C concurrency},
     6221    contributer = {pabuhr@plg},
     6222    key         = {pthreads},
     6223    title       = {{Pthread}.h, Specifications Issue 7, {IEEE} Std 1003.1-2017},
     6224    author      = {IEEE and {The Open Group}},
     6225    year        = 2018,
     6226    howpublished= {\href{http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/pthread.h.html}
     6227                  {http://\-pubs.opengroup.org/\-onlinepubs/\-9699919799/\-basedefs/\-pthread.h.html}},
     6228}
     6229
    61036230@manual{Python,
    61046231    keywords    = {Python},
    61056232    contributer = {pabuhr@plg},
    6106     title       = {Python Reference Manual, Release 2.5},
    6107     author      = {Guido van Rossum},
     6233    author      = {Python},
     6234    title       = {Python Language Reference, Release 3.7.2},
    61086235    organization= {Python Software Foundation},
    6109     month       = sep,
    6110     year        = 2006,
    6111     note        = {Fred L. Drake, Jr., editor},
     6236    address     = {\href{https://docs.python.org/3/reference/index.html}{https://\-docs.python.org/\-3/\-reference/\-index.html}},
     6237    year        = 2018,
    61126238}
    61136239
    61146240% Q
     6241
     6242@inproceedings{Qthreads,
     6243    keywords    = {user-level threading},
     6244    author      = {Kyle B. Wheeler and Richard C. Murphy and Douglas Thain},
     6245    title       = {Qthreads: An API for Programming with Millions of Lightweight Threads},
     6246    booktitle   = {International Symposium on Parallel and Distributed Processing},
     6247    organization= {IEEE},
     6248    address     = {Miami, FL, USA},
     6249    month       = apr,
     6250    year        = 2008,
     6251}
    61156252
    61166253@article{Grossman06,
     
    61496286}
    61506287
     6288@manual{Quasar,
     6289    keywords    = {Quasar},
     6290    contributer = {pabuhr@plg},
     6291    author      = {Quasar},
     6292    title       = {Quasar Documentation, Release 0.8.0},
     6293    organization= {Parallel Universe},
     6294    address     = {\href{http://docs.paralleluniverse.co/quasar}{http://\-docs.paralleluniverse.co/\-quasar}},
     6295    year        = 2018,
     6296}
     6297
    61516298% R
    61526299
     
    62626409    number      = 10,
    62636410    pages       = {27-32},
     6411}
     6412
     6413@article{Hesselink06,
     6414    author      = {Wim H. Hesselink},
     6415    title       = {Refinement Verification of the Lazy Caching Algorithm},
     6416    journal     = acta,
     6417    year        = 2006,
     6418    month       = oct,
     6419    volume      = 43,
     6420    number      = 3,
     6421    pages       = {195--222},
    62646422}
    62656423
     
    64006558}
    64016559
     6560@manual{Ruby,
     6561    keywords    = {Ruby},
     6562    contributer = {pabuhr@plg},
     6563    author      = {Ruby},
     6564    title       = {Ruby Documentation, Release 2.6.0},
     6565    organization= {Python Software Foundation},
     6566    address     = {\href{https://www.ruby-lang.org/en/documentation}{https://\-www.ruby-lang.org/\-en/\-documentation}},
     6567    year        = 2018,
     6568}
     6569
    64026570% S
    64036571
     
    71907358    author      = {{TIOBE Index}},
    71917359    howpublished= {\href{http://www.tiobe.com/tiobe_index}{http://\-www.tiobe.com/\-tiobe\_index}},
    7192     optnote     = {Accessed: 2018-09},
     7360}
     7361
     7362@misc{ThreadModel,
     7363    contributer = {pabuhr@plg},
     7364    key         = {ThreadModel},
     7365    title       = {Thread (computing)},
     7366    author      = {{Threading Model}},
     7367    howpublished= {\href{https://en.wikipedia.org/wiki/Thread_(computing)}{https://\-en.wikipedia.org/\-wiki/\-Thread\_(computing)}},
    71937368}
    71947369
     
    75227697    year        = 2017,
    75237698    howpublished= {\url{https://wiki.gnome.org/Projects/Vala/Manual}},
    7524     optnote     = {Accessed: 2017-04}
    75257699}
    75267700
     
    76967870% Y
    76977871
     7872@article{Boehm12,
     7873    keywords    = {memory model, race condition},
     7874    contributer = {pabuhr@plg},
     7875    author      = {Boehm, Hans-J. and Adve, Sarita V.},
     7876    title       = {You Don'T Know Jack About Shared Variables or Memory Models},
     7877    journal     = cacm,
     7878    volume      = 55,
     7879    number      = 2,
     7880    month       = feb,
     7881    year        = 2012,
     7882    pages       = {48--54},
     7883    publisher   = {ACM},
     7884    address     = {New York, NY, USA},
     7885}
     7886
    76987887% Z
    76997888
  • doc/papers/concurrency/Paper.tex

    r9d9a451 r53bb8f1  
    228228}
    229229
    230 \title{\texorpdfstring{Concurrency in \protect\CFA}{Concurrency in Cforall}}
     230\title{\texorpdfstring{Advanced Control-flow in \protect\CFA}{Advanced Control-flow in Cforall}}
    231231
    232232\author[1]{Thierry Delisle}
     
    241241
    242242\abstract[Summary]{
    243 \CFA is a modern, polymorphic, \emph{non-object-oriented} extension of the C programming language.
    244 This paper discusses the design of the concurrency and parallelism features in \CFA, and its concurrent runtime-system.
    245 These features are created from scratch as ISO C lacks concurrency, relying largely on the pthreads library for concurrency.
    246 Coroutines and lightweight (user) threads are introduced into \CFA;
    247 as well, monitors are added as a high-level mechanism for mutual exclusion and synchronization.
    248 A unique contribution of this work is allowing multiple monitors to be safely acquired \emph{simultaneously}.
     243\CFA is a modern, polymorphic, non-object-oriented, backwards-compatible extension of the C programming language.
     244This paper discusses the advanced control-flow features in \CFA, which include concurrency and parallelism, and its supporting runtime system.
     245These features are created from scratch as ISO C's concurrency is low-level and unimplemented, so C programmers continue to rely on the C pthreads library.
     246\CFA provides high-level control-flow mechanisms, like coroutines and user-level threads, and monitors for mutual exclusion and synchronization.
     247A unique contribution of this work is allowing multiple monitors to be safely acquired \emph{simultaneously} (deadlock free), while integrating this capability with all monitor synchronization mechanisms.
    249248All features respect the expectations of C programmers, while being fully integrate with the \CFA polymorphic type-system and other language features.
    250249Experimental results show comparable performance of the new features with similar mechanisms in other concurrent programming-languages.
    251250}%
    252251
    253 \keywords{concurrency, parallelism, coroutines, threads, monitors, runtime, C, Cforall}
     252\keywords{coroutines, concurrency, parallelism, threads, monitors, runtime, C, \CFA (Cforall)}
    254253
    255254
     
    262261\section{Introduction}
    263262
     263This paper discusses the design of advanced, high-level control-flow extensions (especially concurrency and parallelism) in \CFA and its runtime.
     264\CFA is a modern, polymorphic, non-object-oriented\footnote{
     265\CFA has features often associated with object-oriented programming languages, such as constructors, destructors, virtuals and simple inheritance.
     266However, functions \emph{cannot} be nested in structures, so there is no lexical binding between a structure and set of functions (member/method) implemented by an implicit \lstinline@this@ (receiver) parameter.},
     267backwards-compatible extension of the C programming language~\cite{Moss18}.
     268Within the \CFA framework, new control-flow features were created from scratch.
     269ISO \Celeven defines only a subset of the \CFA extensions, and with respect to concurrency~\cite[\S~7.26]{C11}, the features are largely wrappers for a subset of the pthreads library~\cite{Butenhof97,Pthreads}.
     270Furthermore, \Celeven and pthreads concurrency is basic, based on thread fork/join in a function and a few locks, which is low-level and error prone;
     271no high-level language concurrency features exist.
     272Interestingly, almost a decade after publication of the \Celeven standard, neither gcc-8, clang-8 nor msvc-19 (most recent versions) support the \Celeven include @threads.h@, indicating little interest in the C concurrency approach.
     273Finally, while the \Celeven standard does not state a concurrent threading-model, the historical association with pthreads suggests the threading model is kernel-level threading (1:1)~\cite{ThreadModel}.
     274
     275In contrast, there has been a renewed interest during the past decade in user-level (M:N, green) threading in old and new programming languages.
     276As multi-core hardware became available in the 1980/90s, both user and kernel threading were examined.
     277Kernel threading was chosen, largely because of its simplicity and fit with the simpler operating systems and hardware architectures at the time, which gave it a performance advantage~\cite{Drepper03}.
     278Libraries like pthreads were developed for C, and the Solaris operating-system switched from user (JDK 1.1~\cite{JDK1.1}) to kernel threads.
     279As a result, languages like Java, Scala~\cite{Scala}, Objective-C~\cite{obj-c-book}, \CCeleven~\cite{C11}, and C\#~\cite{Csharp} adopted the 1:1 kernel-threading model, with a variety of presentation mechanisms.
     280From 2000 onwards, languages like Go~\cite{Go}, Erlang~\cite{Erlang}, Haskell~\cite{Haskell}, D~\cite{D}, and \uC~\cite{uC++,uC++book} have championed the M:N user-threading model, and many user-threading libraries have appeared~\cite{Qthreads,MPC,BoostThreads}, including putting green threads back into Java~\cite{Quasar}.
     281The main argument for user-level threading is that they are lighter weight than kernel threads (locking and context switching do not cross the kernel boundary), so there is less restriction on programming styles that encourage large numbers of threads performing smaller work-units to facilitate load balancing by the runtime~\cite{Verch12}.
     282As well, user-threading facilitates a simpler concurrency approach using thread objects that leverage sequential patterns versus events with call-backs~\cite{vonBehren03}.
     283Finally, performant user-threading implementations (both time and space) are largely competitive with direct kernel-threading implementations, while achieving the programming advantages of high concurrency levels and safety.
     284
     285A further effort over the past decade is the development of language memory-models to deal with the conflict between certain language features and compiler/hardware optimizations.
     286This issue can be rephrased as some features are pervasive (language and runtime) and cannot be safely added via a library to prevent invalidation by sequential optimizations~\cite{Buhr95a,Boehm05}.
     287The consequence is that a language must be cognizant of these features and provide sufficient tools to program around any safety issues.
     288For example, C created the @volatile@ qualifier to provide correct execution for @setjmp@/@logjmp@ (concurrency came later).
     289The simplest solution is to provide a handful of complex qualifiers and functions (e.g., @volatile@ and atomics) allowing programmers to write consistent/race-free programs, often in the sequentially-consistent memory-model~\cite{Boehm12}.
     290
     291While having a sufficient memory-model allows sound libraries to be constructed, writing these libraries can quickly become awkward and error prone, and using these low-level libraries has the same issues.
     292Essentially, using low-level explicit locks is the concurrent equivalent of assembler programming.
     293Just as most assembler programming is replaced with programming in a high-level language, explicit locks can be replaced with high-level concurrency constructs in a programming language.
     294The goal is to get the compiler to check for correct usage and follow any complex coding conventions implicitly.
     295The drawback is that language constructs may preclude certain specialized techniques, therefore introducing inefficiency or inhibiting concurrency.
     296For most concurrent programs, these drawbacks are insignificant in comparison to the speed of composition, and subsequent reliability and maintainability of the high-level concurrent program.
     297(The same is true for high-level programming versus assembler programming.)
     298Only very rarely should it be necessary to drop down to races and/or explicit locks to apply a specialized technique to achieve maximum speed or concurrency.
     299As stated, this observation applies to non-concurrent forms of complex control-flow, like exception handling and coroutines.
     300
     301Adapting the programming language allows matching the control-flow model with the programming-language style, versus adapting to one general (sound) library/paradigm.
     302For example, it is possible to provide exceptions, coroutines, monitors, and tasks as specialized types in an object-oriented language, integrating these constructs to allow leveraging the type-system (static type-checking) and all other object-oriented capabilities~\cite{uC++}.
     303It is also possible to leverage call/return for blocking communication via new control structures, versus switching to alternative communication paradigms, like channels or message passing.
     304As well, user threading is often a complementary feature, allowing light-weight threading to match with low-cost objects, while hiding the application/kernel boundary.
     305User threading also allows layering of implicit concurrency models (no explicit thread creation), such executors, data-flow, actors, into a single language, so programmers can chose the model that best fits an algorithm.\footnote{
     306All implicit concurrency models have explicit threading in their implementation, and hence, can be build from explicit threading;
     307however, the reverse is seldom true, i.e., given implicit concurrency, e.g., actors, it is virtually impossible to create explicit concurrency, e.g., blocking thread objects.}
     308Finally, with extended language features and user-level threading it is possible to discretely fold locking and non-blocking I/O multiplexing into the language's I/O libraries, so threading implicitly dovetails with the I/O subsystem.
     309
     310\CFA embraces language extensions and user-level threading to provide advanced control-flow and concurrency.
     311We attempt to show the \CFA extensions and runtime are demonstrably better than those proposed for \CC and other concurrent, imperative programming languages.
     312The contributions of this work are:
     313\begin{itemize}
     314\item
     315allowing multiple monitors to be safely acquired \emph{simultaneously} (deadlock free), while seamlessly integrating this capability with all monitor synchronization mechanisms.
     316\item
     317all control-flow features respect the expectations of C programmers, with statically type-safe interfaces that integrate with the \CFA polymorphic type-system and other language features.
     318\item
     319experimental results show comparable performance of the new features with similar mechanisms in other concurrent programming-languages.
     320\end{itemize}
     321
     322\begin{comment}
    264323This paper provides a minimal concurrency \newterm{Application Program Interface} (API) that is simple, efficient and can be used to build other concurrency features.
    265324While the simplest concurrency system is a thread and a lock, this low-level approach is hard to master.
     
    281340The proposed concurrency API is implemented in a dialect of C, called \CFA (pronounced C-for-all).
    282341The paper discusses how the language features are added to the \CFA translator with respect to parsing, semantics, and type checking, and the corresponding high-performance runtime-library to implement the concurrent features.
    283 
    284 
     342\end{comment}
     343
     344
     345\begin{comment}
    285346\section{\CFA Overview}
    286347
     
    551612\end{cfa}
    552613where the return type supplies the type/size of the allocation, which is impossible in most type systems.
    553 
    554 
    555 \section{Concurrency}
    556 \label{s:Concurrency}
    557 
    558 At its core, concurrency is based on multiple call-stacks and scheduling threads executing on these stacks.
    559 Multiple call stacks (or contexts) and a single thread of execution, called \newterm{coroutining}~\cite{Conway63,Marlin80}, does \emph{not} imply concurrency~\cite[\S~2]{Buhr05a}.
    560 In coroutining, the single thread is self-scheduling across the stacks, so execution is deterministic, \ie the execution path from input to output is fixed and predictable.
    561 A \newterm{stackless} coroutine executes on the caller's stack~\cite{Python} but this approach is restrictive, \eg preventing modularization and supporting only iterator/generator-style programming;
    562 a \newterm{stackful} coroutine executes on its own stack, allowing full generality.
    563 Only stackful coroutines are a stepping stone to concurrency.
    564 
    565 The transition to concurrency, even for execution with a single thread and multiple stacks, occurs when coroutines also context switch to a \newterm{scheduling oracle}, introducing non-determinism from the coroutine perspective~\cite[\S~3]{Buhr05a}.
    566 Therefore, a minimal concurrency system is possible using coroutines (see Section \ref{coroutine}) in conjunction with a scheduler to decide where to context switch next.
    567 The resulting execution system now follows a cooperative threading-model, called \newterm{non-preemptive scheduling}.
    568 
    569 Because the scheduler is special, it can either be a stackless or stackful coroutine.
    570 For stackless, the scheduler performs scheduling on the stack of the current coroutine and switches directly to the next coroutine, so there is one context switch.
    571 For stackful, the current coroutine switches to the scheduler, which performs scheduling, and it then switches to the next coroutine, so there are two context switches.
    572 A stackful scheduler is often used for simplicity and security.
    573 
    574 Regardless of the approach used, a subset of concurrency related challenges start to appear.
    575 For the complete set of concurrency challenges to occur, the missing feature is \newterm{preemption}, where context switching occurs randomly between any two instructions, often based on a timer interrupt, called \newterm{preemptive scheduling}.
    576 While a scheduler introduces uncertainty in the order of execution, preemption introduces uncertainty about where context switches occur.
    577 Interestingly, uncertainty is necessary for the runtime (operating) system to give the illusion of parallelism on a single processor and increase performance on multiple processors.
     578 The reason is that only the runtime has complete knowledge about resources and how to best utilize them.
    579 However, the introduction of unrestricted non-determinism results in the need for \newterm{mutual exclusion} and \newterm{synchronization} to restrict non-determinism for correctness;
    580 otherwise, it is impossible to write meaningful programs.
    581 Optimal performance in concurrent applications is often obtained by having as much non-determinism as correctness allows.
    582 
    583 An important missing feature in C is threading\footnote{While the C11 standard defines a \protect\lstinline@threads.h@ header, it is minimal and defined as optional.
    584 As such, library support for threading is far from widespread.
    585 At the time of writing the paper, neither \protect\lstinline@gcc@ nor \protect\lstinline@clang@ support \protect\lstinline@threads.h@ in their standard libraries.}.
    586 In modern programming languages, a lack of threading is unacceptable~\cite{Sutter05, Sutter05b}, and therefore existing and new programming languages must have tools for writing efficient concurrent programs to take advantage of parallelism.
    587 As an extension of C, \CFA needs to express these concepts in a way that is as natural as possible to programmers familiar with imperative languages.
    588 Furthermore, because C is a system-level language, programmers expect to choose precisely which features they need and which cost they are willing to pay.
    589 Hence, concurrent programs should be written using high-level mechanisms, and only step down to lower-level mechanisms when performance bottlenecks are encountered.
    590 
    591 
    592 \subsection{Coroutines: A Stepping Stone}\label{coroutine}
    593 
    594 While the focus of this discussion is concurrency and parallelism, it is important to address coroutines, which are a significant building block of a concurrency system (but not concurrent among themselves).
     614\end{comment}
     615
     616
     617\section{Coroutines: A Stepping Stone}\label{coroutine}
     618
      619While the focus of this discussion is concurrency and parallelism, it is important to address coroutines, which are a significant building block of a concurrency system (but not concurrent among themselves).
    595620Coroutines are generalized routines allowing execution to be temporarily suspended and later resumed.
    596621Hence, unlike a normal routine, a coroutine may not terminate when it returns to its caller, allowing it to be restarted with the values and execution location present at the point of suspension.
     
    10601085\end{cquote}
    10611086The combination of these two approaches allows an easy and concise specification to coroutining (and concurrency) for normal users, while more advanced users have tighter control on memory layout and initialization.
     1087
     1088
     1089\section{Concurrency}
     1090\label{s:Concurrency}
     1091
     1092At its core, concurrency is based on multiple call-stacks and scheduling threads executing on these stacks.
     1093Multiple call stacks (or contexts) and a single thread of execution, called \newterm{coroutining}~\cite{Conway63,Marlin80}, does \emph{not} imply concurrency~\cite[\S~2]{Buhr05a}.
     1094In coroutining, the single thread is self-scheduling across the stacks, so execution is deterministic, \ie the execution path from input to output is fixed and predictable.
     1095A \newterm{stackless} coroutine executes on the caller's stack~\cite{Python} but this approach is restrictive, \eg preventing modularization and supporting only iterator/generator-style programming;
     1096a \newterm{stackful} coroutine executes on its own stack, allowing full generality.
     1097Only stackful coroutines are a stepping stone to concurrency.
     1098
     1099The transition to concurrency, even for execution with a single thread and multiple stacks, occurs when coroutines also context switch to a \newterm{scheduling oracle}, introducing non-determinism from the coroutine perspective~\cite[\S~3]{Buhr05a}.
     1100Therefore, a minimal concurrency system is possible using coroutines (see Section \ref{coroutine}) in conjunction with a scheduler to decide where to context switch next.
     1101The resulting execution system now follows a cooperative threading-model, called \newterm{non-preemptive scheduling}.
     1102
     1103Because the scheduler is special, it can either be a stackless or stackful coroutine.
     1104For stackless, the scheduler performs scheduling on the stack of the current coroutine and switches directly to the next coroutine, so there is one context switch.
     1105For stackful, the current coroutine switches to the scheduler, which performs scheduling, and it then switches to the next coroutine, so there are two context switches.
     1106A stackful scheduler is often used for simplicity and security.
     1107
     1108Regardless of the approach used, a subset of concurrency related challenges start to appear.
     1109For the complete set of concurrency challenges to occur, the missing feature is \newterm{preemption}, where context switching occurs randomly between any two instructions, often based on a timer interrupt, called \newterm{preemptive scheduling}.
     1110While a scheduler introduces uncertainty in the order of execution, preemption introduces uncertainty about where context switches occur.
     1111Interestingly, uncertainty is necessary for the runtime (operating) system to give the illusion of parallelism on a single processor and increase performance on multiple processors.
      1112The reason is that only the runtime has complete knowledge about resources and how to best utilize them.
     1113However, the introduction of unrestricted non-determinism results in the need for \newterm{mutual exclusion} and \newterm{synchronization} to restrict non-determinism for correctness;
     1114otherwise, it is impossible to write meaningful programs.
     1115Optimal performance in concurrent applications is often obtained by having as much non-determinism as correctness allows.
     1116
     1117An important missing feature in C is threading\footnote{While the C11 standard defines a \protect\lstinline@threads.h@ header, it is minimal and defined as optional.
     1118As such, library support for threading is far from widespread.
     1119At the time of writing the paper, neither \protect\lstinline@gcc@ nor \protect\lstinline@clang@ support \protect\lstinline@threads.h@ in their standard libraries.}.
     1120In modern programming languages, a lack of threading is unacceptable~\cite{Sutter05, Sutter05b}, and therefore existing and new programming languages must have tools for writing efficient concurrent programs to take advantage of parallelism.
     1121As an extension of C, \CFA needs to express these concepts in a way that is as natural as possible to programmers familiar with imperative languages.
     1122Furthermore, because C is a system-level language, programmers expect to choose precisely which features they need and which cost they are willing to pay.
     1123Hence, concurrent programs should be written using high-level mechanisms, and only step down to lower-level mechanisms when performance bottlenecks are encountered.
    10621124
    10631125
  • doc/papers/concurrency/mail

    r9d9a451 r53bb8f1  
    2727
    2828Software: Practice and Experience Editorial Office
     29
     30
     31
     32Date: Wed, 3 Oct 2018 21:25:28 +0000
     33From: Richard Jones <onbehalfof@manuscriptcentral.com>
     34Reply-To: R.E.Jones@kent.ac.uk
     35To: tdelisle@uwaterloo.ca, pabuhr@uwaterloo.ca
     36Subject: Software: Practice and Experience - Decision on Manuscript ID
     37 SPE-18-0205
     38
     3903-Oct-2018
     40
     41Dear Dr Buhr,
     42
     43Many thanks for submitting SPE-18-0205 entitled "Concurrency in C∀" to Software: Practice and Experience.
     44
     45In view of the comments of the referees found at the bottom of this letter, I cannot accept your paper for publication in Software: Practice and Experience. I hope that you find the referees' very detailed comments helpful.
     46
     47Thank you for considering Software: Practice and Experience for the publication of your research.  I hope the outcome of this specific submission will not discourage you from submitting future manuscripts.
     48
     49Yours sincerely,
     50
     51
     52Prof. Richard Jones
     53Editor, Software: Practice and Experience
     54R.E.Jones@kent.ac.uk
     55
     56Referee(s)' Comments to Author:
     57
     58Reviewing: 1
     59
     60Comments to the Author
     61"Concurrency in Cforall" presents a design and implementation of a set of standard concurrency features, including coroutines, user-space and kernel-space threads, mutexes, monitors, and a scheduler, for a polymorphic derivation of C called Cforall.
     62
     63Section 2 is an overview of sequential Cforall that does not materially contribute to the paper. A brief syntax explanation where necessary in examples would be plenty.
     64
      65Section 3 begins with an extensive discussion of concurrency that also does not materially contribute to the paper. A brief mention of whether a particular approach implements cooperative or preemptive scheduling would be sufficient. Section 3 also makes some unfortunate claims, such as C not having threads -- C does in fact define threads, and this is noted as being true in a footnote, immediately after claiming that it does not. The question remains why the C11 parallelism design is insufficient and in what way this paper proposes to augment it. While I am personally a proponent of parallel programming languages, backing the assertion that all modern languages must have threading with citations from 2005 ignores the massive popularity of modern non-parallel languages (Javascript, node.js, Typescript, Python, Ruby, etc.) and parallel languages that are not thread based, although the authors are clearly aware of such approaches.
     66
      67Sections 3.1 and 3.2 discuss asymmetric and symmetric coroutines. This also does not seem to materially contribute to a paper that is ostensibly about concurrency in a modern systems programming language. The area of coroutines, continuations, and generators is already well explored in the context of systems languages, including compilation techniques for these constructs that are more advanced than the stack instantiation model discussed in the paper.
     68
      69Section 3.3 describes threads in Cforall, briefly touching on user-space vs. kernel-space thread implementations without detailing the extensive practical differences. It is unclear how the described interface differs from C++11 threads, as the description seems to center on an RAII style approach to joining in the destructor.
     70
     71Section 4 briefly touches on a collection of well known synchronisation primitives. Again, this discussion does not materially contribute to the paper.
     72
     73Section 5 describes monitors, which are a well known and well researched technique. The Cforall implementation is unsurprising. The "multi-acquire semantics" described are not a contribution of this paper, as establishing a stable order for lock acquisition is a well known technique, one example of which is the C++ std::scoped_lock.
     74
     75Section 6 is a discussion of scheduling that does not appear to be informed by the literature. There is no discussion of work-stealing vs. work-scheduling, static vs. dynamic priorities, priority inversion, or fairness. There is a claim in secion 6.1 for a novel technique, partial signalling, that appears to be a form of dynamic priority, but no comparison is made. In section 6.6, a very brief mention of other synchronisation techniques is made, without reference to current techniques such as array-based locks, CLH or MCS queue locks, RCU and other epoch-based mechanisms, etc. Perhaps these are considered out of scope.
     76
     77Section 7 discusses parallelism, but does not materially contribute to the paper. It is claimed that preemption is necessary to implement spinning, which is not correct, since two cores can implement a spinning based approach without preemption. It is claimed that with thread pools "concurrency errors return", but no approach to removing concurrency errors with either preemptive or cooperatively scheduled user threads has been proposed in the paper that would not also apply to thread pools.
     78
     79Section 8 is intended to describe the Cforall runtime structure, but does so in a way that uses terminology in an unfamiliar way. The word cluster is more usually used in distributed systems, but here refers to a process. The term virtual processor is more usually used in hardware virtualisation, but here refers to a kernel thread. The term debug kernel is more usually used in operating systems to refer to kernels that have both debug info and a method for using a debugger in kernel space, but here refers to a debug build of a user-space process. This section does not materially contribute to the paper.
     80
     81Section 9 is intended to describe the Cforall runtime implementation. It makes some unusual claims, such as C libraries migrating to stack chaining (stack chaining was an experimental GCC feature that has been abandoned, much as it has been abandoned in both Go and Rust).
     82
     83The performance measurements in section 10 are difficult to evaluate. While I appreciate that comparable concurrency benchmarks are very difficult to write, and the corpus of existing benchmarks primarily boils down to the parallel programs in the Computer Language Benchmark Game, the lack of detail as to what is being measured in these benchmarks (particularly when implemented in other languages) is unfortunate. For example, in table 3, the benchmark appears to measure uncontended lock access, which is not a useful micro-benchmark.
     84
     85It is not clear what the contributions of this paper are intended to be. A concise listing of the intended contributions would be helpful. Currently, it appears that the paper makes neither PL contributions in terms of novel features in Cforall, nor does it make systems contributions in terms of novel features in the runtime.
     86
     87
     88Reviewing: 2
     89
     90Comments to the Author
     91This article presents the design and rationale behind the concurrency
     92features of C-forall, a new low-level programming language.  After an
     93introduction that defines a selection of standard terminology, section
     942 gives crucial background on the design of the C-forall language.
     95Section 3 then starts the core of the article, discussing the
     96language's support for "concurrency" which in this case means
     97coroutines and threads; a very brief Section 4 builds on section 3
     98with a discussion of lower level synchronizations.  Section 5 the
     99presents the main features of concurrency control in C-forall:
     100monitors and mutexes. Section 6 then extends monitors with condition
      101variables to support scheduling, and a very brief section 7
     102discusses preemption and pooling. Section 8 discusses the runtime
     103conceptual model, section 9 gives implementation detail, and section
     10410 briefly evaluates C-forall's performance via five concurrent
     105micro benchmarks. Finally section 11 concludes the article, and then
     106section 12 presents some future work. 
     107
     108
     109At the start of section 7, article lays out its rationale: that while
     110"historically, computer performance was about processor speeds" but
     111"Now, high-performance applications must care about parallelism,
     112which requires concurrency". The doomsayers trumpeting the death of
     113Moore's law have been proved correct at last, with CPUs sequential
     114performance increasing much more slowly than the number of cores
     115within each die. This means programmers --- especially low-level,
     116systems programmers --- must somehow manage the essential complexity
     117of writing concurrent programs to run in parallel in multiple threads
     118across multiple cores. Unfortunately, the most venerable widely used
      119systems programming language, C, supports parallelism only via an external library,
     120e.g. the threads library.  This article aims to integrate concurrent
     121programming mechanisms more closely into a novel low-level C-based
     122programming language, C-forall. The article gives an outline of much of
     123C-forall, presents a series of concurrency mechanisms, and finally
     124some microbenchmark results.  The article is detailed, comprehensive,
     125and generally well written in understandable English.
     126
     127My main concern about the article are indicated by the fact that the
     128best summary of the problem the design of concurrent C-forall sets
     129out to solve is buried more than halfway through the article in section
     1307, as above, and then the best overview of the proposed solution is
     131given in the 2nd, 4th and 5th sentence of the conclusion:
     132
     133   "The approach provides concurrency based on a preemptive M:N
     134    user-level threading-system, executing in clusters, which
     135    encapsulate scheduling of work on multiple kernel threads
     136    providing parallelism... High-level objects (monitor/task) are the
     137    core mechanism for mutual exclusion and synchronization. A novel
     138    aspect is allowing multiple mutex-objects to be accessed
     139    simultaneously reducing the potential for deadlock for this
     140    complex scenario."
     141
     142That is, in my reading of the article, it proceeds bottom up rather
     143than top down, and so my main recommendation is to essentially reverse
     144the order of the article, proceeding from the problem to be solved,
     145the high level architecture of the proposed solutions, and then going
     146down to the low-level mechanisms.  My biggest problem reading the
     147article was for explanations of why a particular decision was taken,
     148or why a particular mechanism may be used --- often this description
     149is actually later in the article, but at that point it's too late for
     150the reader.  I have tried to point out most of these places in the
     151detailed comments below.
     152
     153My second concern is that the article makes several claims that are
     154not really justified by the design or implementation in the article.
     155These include claims that this approach meets the expectations of C
     156programmers, is minimal, is implemented in itself, etc.  The article
     157doesn't generally offer evidence to support these assertions (for many
     158of them, that would require empirical studies of programmers, or at
     159least corpus studies). The solution here is to talk about motivations
     160for the design choices "we made these decisions hoping that C
     161programmers would be comfortable" rather than claims of fact "C
     162programmers are comfortable".  Again I attempt to point these out below.
     163
     164* abstract: needs to characterize the work top down, and not make
     165  claims "features respect the expectations of C programmers" that
     166  are not supported empirically.
     167
     168* p1 line 14 "integrated"
     169
     170* introduction needs to introduce the big ideas and scope of the
     171  article, not define terms.  Some of the terms / distinctions are
     172  non-standard (e.g. the distinction between "concurrency" and
     173  "parallelism") and can be avoided by using more specific terms
     174  (mutual exclusion, synchronization, parallel execution. etc).
     175
     176* to me this article introduces novel language features, not just an
     177  API.  Similarly, it doesn't talk about any additions "to the
     178  language translator" - i.e compiler changes! - rather about language
     179  features.
     180
     181
     182* section 2 lines 6-9 why buy this fight against object-orientation?
     183  this article doesn't need to make this argument, but needs to do a
     184  better job of it if it does (see other comments below)
     185
     186* sec 2.1 - are these the same as C++. IF so, say so, if not, say why
     187  not.
     188
     189* 2.2 calling it a "with statement" was confusing, given that a with
     190  clause can appear in a routine declaration with a shorthand syntax.
     191
     192* 2.3 again compare with C++ and Java (as well as Ada)
     193
     194* line 9 "as we will see in section 3"
     195
     196* 2.4 I really quite like this syntax for operators, destructors not
     197  so much.
     198
     199* 2.5 and many places elsewhere. Always first describe the semantics
     200  of your language constructs, then describe their properties, then
     201  compare with e.g. related languages (mostly C++ & Java?).  E.g in
     202  this case, something like:
     203
     204  "C-forall includes constructors, which are called to initialize
     205  newly allocated objects, and constructors, which are called when
     206  objects are deallocated. Constructors and destructors are written as
     207  functions returning void, under the special names "?{}" for
     208  constructors and "^{}" for destructors: constructors may be
     209  overridden, but destructors may not be.  The semantics of C-forall's
     210  constructors and destructors are essentially those of C++."
     211
     212  this problem repeats many times throughout the article and should be
     213  fixed everywhere.
     214
     215
     216* 2.6 again, first describe then properties then comparison.
     217   in this case, compare e.g. with C++ templates, Java/Ada generics
     218   etc.
     219
     220* why special case forward declarations? It's not 1970 any more.
     221
     222* what are traits?  structural interfaces (like Go interfaces) or
     223  nominal bindings?
     224
     225* section 3 - lines 2-30, also making very specific global definitions
     226  as in the introduction. The article does not need to take on this
     227  fight either, rather make clear that this is the conceptual model in
     228  C-forall. (If the article starts at the top and works down, that may
     229  well follow anyway).
     230
     231* "in modern programming languages... unacceptable"; "in a
     232  system-level language.. concurrent programs should be written with
     233  high-level features" - again, no need to take on these fights.
     234
     235* 3.1 onwards; I found all this "building" up hard to follow.
     236  also it's not clear a "minimal" API must separately support
     237  coroutines, threads, fibres, etc
     238
     239* FIG 2B - where's the output?
     240  syntax "sout | next(f1) | next(f2) | endl" nowhere explained
     241    why not use C++s' << and >>
     242
     243* FIG 3 be clearer, earlier about the coroutine" constructor syntax
     244
     245** ensure all figures are placed *after* their first mention in the
     246   text. consider interleaving smaller snippets of text rather than
     247   just referring to large figures
     248
     249* sec 3.1 p7 etc,. need more context / comparison e.g. Python
     250  generators etc.
     251
     252* FIGURE 4 is this right?  should there a constructor for Cons taking
     253  a Prod?
     254
     255
     256* sec 3.2 order of constructors depends on the language.  more
     257  generally, if the article is going to make arguments against OO
     258  (e.g. section 2) then the article needs to explain, in detail, why
     259  e.g. coroutine, thread, etc *cannot* be classes / objects.
     260
     261* "type coroutine_t must be an abstract handle.. descriptor and is
     262  stack are non-copyable" - too many assumptions in here (and other
     263  similar passages) that are not really spelled out in detail.
     264
     265* p10 line 4 introduces "coroutine" keyword. needs to give its
     266  semantics. also needs to introduce and define properties and compare
     267  before all the examples using coroutines.
     268
     269* p10 again, trait semantics need to be better defined
     270
     271* 3.3 should be an introduction to this section. Note that section
     272  titles are not part of the text of the article.
     273
     274* what's the difference between "coroutines" and "user threads" (and
     275  "fibres?")
     276
     277* what's a "task type" or an "interface routine"  or "underlying
     278  thread"
     279
     280* section 4 - "... meaningless". nope some semantics are possible
     281  e.g. if there's a memory model.
     282
     283* whatare "call/return based languages"
     284
     285* p12 - what if a programmer wants to join e.g. "1st of N" or "1st 3 of N"
     286  threads rather than all threads in order
     287
     288* 4.1 p12 13-25, again it's not clear where this is going.  presenting the model
     289  top down may hopefully resolve this
     290
     291* section 4 should be merged e.g. into sec 3 (or 5)
     292
     293
     294
     295* section 5 p13 what's "routine" scope. "call/return paradigm"
     296
     297* thread/ coroutine declarations, traits etc, all look pretty close to
     298  inheritance. why wouldn't inheritance work?
     299
     300* open/closed locks = free/acquired free locks?
     301
     302* testability?
     303
     304* p14 lines 14-20 I had trouble following this.  e.g/. what's the
     305  difference between "a type that is a monitor" and "a type that looks
     306  like a monitor"?  why?
     307
     308* line 39 - what's an "object-oriented monitor"?    Java?
     309    there is no one OO model of such things.
     310
     311* line 47 significant asset - how do you know?
     312
     313* how could this e.g. build a reader/writer lock
     314
     315* *p15 what's the "bank account transfer problem"
     316
     317*p16 lines6-10  why? explain?
     318
     319*p17 semantics of arrays of conditions is unclear
     320     given e.g. previous comments about arrays of mutexes.
     321
     322*p18 define "spurious wakeup"
     323
     324*p18 line 44 - "a number of approaches were examined"?  which
     325 approaches? examined by whom?  if this is a novel contribution, needs
     326 rather more there, and more comparison with related work
     327
     328* FIG 8 consider e.g. sequence diagrams rather than code to show these
     329  cases
     330
     331* 6.2 p19 line 5 "similarly, monitor routines can be added at any
     332  time" really?  I thought C-forall was compiled? there's a big
     333  difference between "static" and "dynamic" inheritance. which is this
     334  closer to?
     335
     336* line 25 "FIgure 9 (B) shows the monitor implementation"
     337   I didn't understand this, especially not as an implementation.
     338
     339* section 6.6 - if the article is to make claims about completeness,
     340  about supporting low and high level operations, then this must be
     341  expanded to give enough detail to support that argument
     342
     343* "truest realization" huh?
     344
     345* section 7 should be merged into 6 or 8.
     346  it's not clear if this is exploring rejected alternatives,
     347  out outlining different features offered by C-forall, or what.
     348
     349
     350* sec 7.2 how do the other threads in sections 5 & 6 relate to the
     351  user threads, fibres, etc here;
     352
     353* sec 8.1 I found these sections hard to follow. how is a cluster a
     354  "collection of threads and virtual processors... like a virtual
     355  machine"? Where do the thread pools from 7.3 fit in?
     356
     357*  sec 8.3 is out of place, probably unneeded in the paper
     358
     359* section 9 dives straight into details with no overview.  Section 9
     360  seems very detailed, and depends on assumptions or details that are
     361  not in the article.
     362
     363* section 10 covers only microbenchmarks. are there any moderate sized
     364  macrobenchmarks that can compare across the different systems?
     365  (e.g the Erlang Ring?)
     366
     367* sec 11 claims that "the entire C-forall runtime system are written
     368  in C-forall". The article doesn't
     369
     370
     371* future work should precede conclusion, not follow it
     372
     373* the article should have a related work section (2-3 pages) comparing
     374  the design overall with various competing designs (C++, Java, go,
     375  Rust,...)
     376
     377To encourage accountability, I'm signing my reviews in 2018. For the record, I am James Noble, kjx@ecs.vuw.ac.nz.
     378
     379Reviewing: 3
     380
     381Comments to the Author
     382This paper describes the design and implementation of coroutine- and thread-based concurrency in the C-for-all (I will write "C\/") system, a considerably extended form of the C language with many concurrency features.
     383
     384It first provides an overview of the non-concurrency-related aspects of the host language (references, operator overloading, generics, etc.), then addresses several technical issues around concurrency, including the multi-monitor design, bulk acquiring of locks (including deadlock-avoiding management of acquisition order), solutions to difficult scheduling problems around these, and implementation of monitors in the presence of separate compilation. It also presents empirical data showing the execution times of several microbenchmarks in comparison with other threaded concurrency systems, in support of the claim that the implementation is competitive with them.
     385
     386Overall the impression I gained is that this is a substantial system into which have gone much thought and effort.
     387
     388However, the present paper is not written so as to communicate sufficiently clearly the novel practices or experiences that emerged from that effort. This manifests itself in several ways.
     389
     390The system is described in general, rather than with a focus on novel insights or experiences. It was not until page 18 that I found a statement that hinted at a possible core contribution: "Supporting barging prevention as well as extending internal scheduling to multiple monitors is the main source of complexity in design and implementation of C\/ concurrency." Even then, it is unclear whether such challenges have already been surmounted in prior systems, or what other challenges the paper may also be covering. The most complete list of claims appears to be in the Conclusion (section 11; oddly not the last section), although not everything listed is a novel feature of the work (e.g. N:M threading models are an old idea). This presentation needs to be completely inverted, to focus from the outset on the claimed novel/noteworthy experiences that the work embodies.
     391
     392The text describing the system's motivation is unconvincing on one point: the claim that library support for threading in C is "far from widespread" (p5, footnote A). The pthreads library API is standardised, albeit not in the C language specification but rather in POSIX -- a widespread standard indeed. (With systems languages, even if the language does not define a feature, it of course does not follow that that feature is not available -- since such languages permit extension of their own runtime and/or toolchain.) Of course, the combination of C and pthreads does not provide close to the full complement of C\/-supported features, so it is easy to make a case for C\/'s targeted "gap in the market". But again, a presentation focused on novel aspects would bring this out and enable the reader to learn from the authors' efforts much more readily.
     393
     394Certain sections of the text read like a tutorial on concurrency... which is potentially valuable, but does not seem to belong here. For example, much effort is spent introducing the notions of "synchronization" and "mutual exclusion", including the whole of Section 4.2. Presently it is unclear how this content supports the findings/experiences that the paper is detailing.
     395
     396Similarly, section 8 reads mostly as a basic introduction to user versus kernel threading implementations (including hybrid models such as N:M scheduling), and appears superfluous to this paper. Mixed into this are details of C\/'s specific approach. These could instead be stated directly, with references to handle the unlikely case where the reader is unfamiliar.
     397
     398I also found the definitions of certain terms throughout the paper a bit non-standard, for unclear reasons. For example, why "condition lock" rather than the standard "condition variable" (if indeed that is what is intended)? To say that "synchronisation" is about "timing" strikes me as potentially confusing, since in truth synchronisation concerns only relative timing, i.e. ordering. (Even ordering is something of a derived concept -- since of course, most commonly, control over ordering is built atop synchronisation primitives, rather than being provided directly by them.)
     399
     400The empirical data presented is a reasonable start at characterising the implementation's performance. However, it currently suffers certain flaws.
     401
     402Firstly, it is not clear what is being claimed. The data cannot really be said to "verify the implementation" (section 10). Presumably the claim is that the system is competitive with other systems offering reasonably high-level concurrency constructs (Java monitors, Go channels, etc.) and/or on low-level facilities (mutexes, coroutines). A claim of this form, emphasising the latter, does eventually appear in the Conclusion, but it needs to be made explicitly during the presentation of the experiments. Shifting the focus towards higher-level features may be a better target, since this appears to be C\/'s main advance over pthreads and similar libraries.
     403
     404It appears some additional or alternative competitor systems might be a better match. For example, many green-thread or N:M libraries for C exist (libdill/libmill, Marcel, even GNU Pth). It would be instructive to compare with these.
     405
     406It would help greatly if the "functionally identical" benchmark code that was run on the competing systems were made available somewhere. Omitting it from the main text of the paper is understandable, since it would take too much space, but its details may still have a critical bearing on the results.
     407
     408In some cases it simply wasn't clear what is being compared. In Table 3, what are "FetchAdd + FetchSub"? I'm guessing this is some open-coded mutex using C++ atomics, but (unless I'm missing something) I cannot see an explanation in the text.
     409
     410The reports of variance (or, rather, standard deviation) are not always plausible. Is there really no observable variation in three of Table 3's cases? At the least, I would appreciate more detail on the measures taken to reduce run-time variance (e.g. disabling CPU throttling perhaps?).
     411
     412The text habitually asserts the benefits of C\/'s design without convincing argument. For example, in 2.1, do C\/'s references really reduce "syntactic noise"? I am sympathetic to the problem here, because many design trade-offs simply cannot be evaluated without very large-scale or long-term studies. However, the authors could easily refrain from extrapolating to a grand claim that cannot be substantiated. For example, instead of saying C\/ is "expressive" or "flexible" or "natural", or (say) that fork/join concurrency is "awkward and unnecessary" (p11), it would be preferable simply to give examples of the cases that are captured well in the C\/ design (ideally together with any less favourable examples that illustrate the design trade-off in question) and let them speak for themselves.
     413
     414One thing I found confusing in the presentation of coroutines is that it elides the distinction between "coroutines" (i.e. their definitions) and activations thereof. It would be helpful to make this clearer, since at present this makes some claims/statements hard to understand. For example, much of 3.2 talks about "adding fields", which implies that a coroutine's activation state exists as fields in a structured object -- as, indeed, it does in C\/. This is non-obvious because in a more classical presentation of coroutines, their state would live not in "fields" but in local variables. Similarly, the text also talks about composition of "coroutines" as fields within other "coroutines", and so on, whereas if I understand correctly, these are also activations. (By later on in the text, the "C\/ style" of such constructs is clear, but not at first.)
     415
     416I was expecting a reference to Adya et al's 2002 Usenix ATC paper, on the topic of "fibers" and cooperative threading generally but also for its illustrative examples of stack ripping (maybe around "linearized code is the bane of device drivers", p7, which seems to be making a similar observation).
     417
     418Minor comments:
     419
     420The writing is rather patchy. It has many typos, and also some cases of "not meaning what is said", unclear allusions, etc. The following is a non-exhaustive list.
     421
     422- p2 line 7: "C has a notion of objects" -- true, but this is not intended as "object" in anything like the same sense as "object-oriented", so raising it here is somewhere between confusing and meaningless.
     423
     424- lots of extraneous hyphenation e.g. "inheritance-relationships", "critical-section", "mutual-exclusion", "shared-state" (as a general rule, only hyphenate noun phrases when making an adjective out of them)
     425
     426- p4 "impossible in most type systems" -- this is not a property of the "type system" as usually understood, merely the wider language design
     427
     428- p17: "release all acquired mutex types in the parameter list" should just say "release all acquired mutexes that are designated in the parameter list" (it is not "types" that are being released or acquired);
     429
     430- p19: "a class includes an exhaustive list of operations" -- except it is definitively *not* exhaustive, for the reasons given immediately afterwards. I do see the problem here, about separate compilation meaning that the space of functions using a particular type is not bounded at compile time, but that needs to be identified clearly as the problem. (Incidentally, one idea is that perhaps this mapping onto a dense space could be solved at link- or load-time, in preference to run-time indirection.)
     431
     432- p22: in 6.5, the significance of this design decision ("threads... are monitors") was still not clear to me.
     433
     434- p22: [user threads are] "the truest realization of concurrency" sounds like unnecessary editorializing (many systems can exist that can also encode all others, without necessarily giving one supremacy... e.g. actors can be used to encode shared-state concurrency).
     435
     436- p24: on line 19, the necessary feature is not "garbage collection" but precise pointer identification (which is distinct; not all GCs have it, and it has other applications besides GC)
     437
     438- p24: lines 32-39 are very dense and of unclear significance; an example, including code, would be much clearer.
     439
     440- p25: "current UNIX systems" seems to mean "Linux", so please say that or give the behaviour or some other modern Unix (I believe Solaris is somewhat different, and possibly the BSDs too). Also, in the explanation of signal dynamics, it would be useful to adopt the quotation's own terminology of "process-directed" signals. Presumably the "internal" thread-directed signals were generated using tgkill()? And presumably the timer expiry signal is left unblocked only on the thread (virtual processor) running the "simulation"? (Calling it a "simulation" is a bit odd, although I realise it is borrowing the concept of a discrete event queue.)
     441
Note: See TracChangeset for help on using the changeset viewer.