Changeset 6840e7c


Ignore:
Timestamp:
Oct 19, 2017, 12:01:04 PM (4 years ago)
Author:
Rob Schluntz <rschlunt@…>
Branches:
arm-eh, cleanup-dtors, jacob/cs343-translation, jenkins-sandbox, master, new-ast, new-ast-unique-expr
Children:
837ce06
Parents:
b96ec83 (diff), a15b72c (diff)
Note: this is a merge changeset, the changes displayed below correspond to the merge itself.
Use the (diff) links above to see all the changes relative to each parent.
Message:

Merge branch 'master' into cleanup-dtors

Files:
26 added
4 deleted
148 edited
4 moved

Legend:

Unmodified
Added
Removed
  • doc/LaTeXmacros/lstlang.sty

    rb96ec83 r6840e7c  
    22%%
    33%% Cforall Version 1.0.0 Copyright (C) 2016 University of Waterloo
    4 %% 
    5 %% lstlang.sty -- 
    6 %% 
     4%%
     5%% lstlang.sty --
     6%%
    77%% Author           : Peter A. Buhr
    88%% Created On       : Sat May 13 16:34:42 2017
     
    110110                __attribute__, auto, _Bool, catch, catchResume, choose, _Complex, __complex, __complex__,
    111111                __const, __const__, disable, dtype, enable, __extension__, fallthrough, fallthru,
    112                 finally, forall, ftype, _Generic, _Imaginary, inline, __label__, lvalue, _Noreturn, one_t, 
    113                 otype, restrict, _Static_assert, throw, throwResume, trait, try, ttype, typeof, __typeof, 
    114                 __typeof__, virtual, waitfor, when, with, zero_t},
     112                finally, forall, ftype, _Generic, _Imaginary, inline, __label__, lvalue, _Noreturn, one_t,
     113                otype, restrict, _Static_assert, throw, throwResume, trait, try, ttype, typeof, __typeof,
     114                __typeof__, virtual, with, zero_t},
    115115        morekeywords=[2]{
    116                 _Atomic, coroutine, is_coroutine, is_monitor, is_thread, monitor, mutex, nomutex,
    117                 resume, suspend, thread, _Thread_local, yield},
     116                _Atomic, coroutine, is_coroutine, is_monitor, is_thread, monitor, mutex, nomutex, or,
     117                resume, suspend, thread, _Thread_local, waitfor, when, yield},
    118118        moredirectives={defined,include_next}%
    119119}
  • doc/proposals/concurrency/Makefile

    rb96ec83 r6840e7c  
    1616text/basics \
    1717text/concurrency \
     18text/internals \
    1819text/parallelism \
     20text/together \
     21text/future \
    1922}
    2023
     
    2326        ext_monitor \
    2427        int_monitor \
     28        dependency \
    2529}}
    2630
  • doc/proposals/concurrency/annex/glossary.tex

    rb96ec83 r6840e7c  
    1313}
    1414
    15 \longnewglossaryentry{group-acquire}
    16 {name={bulk acquiring}}
     15\longnewglossaryentry{bulk-acq}
     16{name={bulk-acquiring}}
    1717{
    1818Implicitly acquiring several monitors when entering a monitor.
     19}
     20
     21\longnewglossaryentry{multi-acq}
     22{name={multiple-acquisition}}
     23{
     24Any locking technique which allows any single thread to acquire a lock multiple times.
    1925}
    2026
     
    101107\newacronym{api}{API}{Application Program Interface}
    102108\newacronym{raii}{RAII}{Resource Acquisition Is Initialization}
     109\newacronym{numa}{NUMA}{Non-Uniform Memory Access}
  • doc/proposals/concurrency/figures/ext_monitor.fig

    rb96ec83 r6840e7c  
    14144 1 -1 0 0 0 10 0.0000 2 105 90 6000 2160 d\001
    1515-6
    16 6 5850 1650 6150 1950
    17 1 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 6000 1800 105 105 6000 1800 6105 1905
    18 4 1 -1 0 0 0 10 0.0000 2 105 90 6000 1860 b\001
     166 5100 2100 5400 2400
     171 3 0 1 -1 -1 1 0 4 0.000 1 0.0000 5250 2250 105 105 5250 2250 5355 2250
     184 1 -1 0 0 0 10 0.0000 2 105 120 5250 2295 X\001
    1919-6
    20206 5100 1800 5400 2100
     
    22224 1 -1 0 0 0 10 0.0000 2 105 120 5250 2010 Y\001
    2323-6
    24 6 5100 2100 5400 2400
    25 1 3 0 1 -1 -1 1 0 4 0.000 1 0.0000 5250 2250 105 105 5250 2250 5355 2250
    26 4 1 -1 0 0 0 10 0.0000 2 105 120 5250 2295 X\001
     246 5850 1650 6150 1950
     251 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 6000 1800 105 105 6000 1800 6105 1905
     264 1 -1 0 0 0 10 0.0000 2 105 90 6000 1860 b\001
    2727-6
    28 6 3000 5400 7200 5700
     286 3070 5445 7275 5655
    29291 3 0 1 -1 -1 0 0 20 0.000 1 0.0000 3150 5550 80 80 3150 5550 3230 5630
    30301 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 4500 5550 105 105 4500 5550 4605 5655
     
    32324 0 -1 0 0 0 12 0.0000 2 135 1035 4725 5625 blocked task\001
    33334 0 -1 0 0 0 12 0.0000 2 135 870 3300 5625 active task\001
    34 4 0 -1 0 0 0 12 0.0000 2 180 930 6225 5625 routine ptrs\001
     344 0 -1 0 0 0 12 0.0000 2 135 1050 6225 5625 routine mask\001
    3535-6
    36361 3 0 1 -1 -1 0 0 -1 0.000 1 0.0000 3300 3600 105 105 3300 3600 3405 3705
     
    43432 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
    4444         4050 2925 5475 2925 5475 3225 4050 3225 4050 2925
    45 2 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 0 0 2
    46          5850 2850 6075 3000
    47452 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 0 0 4
    4846         3150 3750 3750 3750 3750 4050 3150 4050
     
    66642 2 1 1 -1 -1 0 0 -1 4.000 0 0 0 0 0 5
    6765         5850 4200 5850 3300 4350 3300 4350 4200 5850 4200
    68 2 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 0 0 3
    69          5250 2850 5850 2850 5850 1650
    70 2 1 0 1 -1 -1 0 0 -1 0.000 0 0 -1 0 0 4
    71          3150 3150 3750 3150 3750 2850 5325 2850
    72662 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
    7367        1 1 1.00 60.00 120.00
    7468        7 1 1.00 60.00 120.00
    7569         5250 3150 5250 2400
     702 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
     71         3150 3150 3750 3150 3750 2850 5850 2850 5850 1650
     722 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
     73         5850 2850 6150 3000
    76742 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
    7775         5100 1800 5400 1800 5400 2400 5100 2400 5100 1800
  • doc/proposals/concurrency/style/cfa-format.tex

    rb96ec83 r6840e7c  
    108108  belowskip=3pt,
    109109  keepspaces=true,
     110  tabsize=4,
    110111  % frame=lines,
    111112  literate=,
     
    133134  belowskip=3pt,
    134135  keepspaces=true,
     136  tabsize=4,
    135137  % frame=lines,
    136138  literate=,
     
    150152  keywordstyle=\bfseries\color{blue},
    151153  keywordstyle=[2]\bfseries\color{Plum},
    152   commentstyle=\itshape\color{OliveGreen},                  % green and italic comments
     154  commentstyle=\sf\itshape\color{OliveGreen},             % green and italic comments
    153155  identifierstyle=\color{identifierCol},
    154156  stringstyle=\sf\color{Mahogany},                                % use sanserif font
     
    158160  belowskip=3pt,
    159161  keepspaces=true,
     162  tabsize=4,
    160163  % frame=lines,
    161164  literate=,
  • doc/proposals/concurrency/text/basics.tex

    rb96ec83 r6840e7c  
    11% ======================================================================
    22% ======================================================================
    3 \chapter{Basics}\label{basics}
     3\chapter{Concurrency Basics}\label{basics}
    44% ======================================================================
    55% ======================================================================
    6 Before any detailed discussion of the concurrency and parallelism in \CFA, it is important to describe the basics of concurrency and how they are expressed in \CFA user code.
     6Before any detailed discussion of the concurrency and parallelism in \CFA, it is important to describe the basics of concurrency and how they are expressed in \CFA user-code.
    77
    88\section{Basics of concurrency}
    9 At its core, concurrency is based on having call-stacks and potentially multiple threads of execution for these stacks. Concurrency without parallelism only requires having multiple call stacks (or contexts) for a single thread of execution, and switching between these call stacks on a regular basis. A minimal concurrency product can be achieved by creating coroutines, which instead of context switching between each other, always ask an oracle where to context switch next. While coroutines do not technically require a stack, stackfull coroutines are the closest abstraction to a practical "naked"" call stack. When writing concurrency in terms of coroutines, the oracle effectively becomes a scheduler and the whole system now follows a cooperative threading-model \cit. The oracle/scheduler can either be a stackless or stackfull entity and correspondingly require one or two context switches to run a different coroutine. In any case, a subset of concurrency related challenges start to appear. For the complete set of concurrency challenges to occur, the only feature missing is preemption. Indeed, concurrency challenges appear with non-determinism. Guaranteeing mutual-exclusion or synchronisation are simply ways of limiting the lack of determinism in a system. A scheduler introduces order of execution uncertainty, while preemption introduces incertainty about where context-switches occur. Now it is important to understand that uncertainty is not necessarily undesireable; uncertainty can often be used by systems to significantly increase performance and is often the basis of giving a user the illusion that tasks are running in parallel. Optimal performance in concurrent applications is often obtained by having as much non-determinism as correctness allows\cit.
     9At its core, concurrency is based on having multiple call-stacks and scheduling among threads of execution executing on these stacks. Concurrency without parallelism only requires having multiple call stacks (or contexts) for a single thread of execution.
     10
     11Indeed, while execution with a single thread and multiple stacks where the thread is self-scheduling deterministically across the stacks is called coroutining, execution with a single thread and multiple stacks but where the thread is scheduled by an oracle (non-deterministic from the thread perspective) across the stacks is called concurrency.
     12
     13Therefore, a minimal concurrency system can be achieved by creating coroutines, which instead of context switching among each other, always ask an oracle where to context switch next. While coroutines can execute on the caller's stack-frame, stackful coroutines allow full generality and are sufficient as the basis for concurrency. The aforementioned oracle is a scheduler and the whole system now follows a cooperative threading-model \cit. The oracle/scheduler can either be a stackless or stackful entity and correspondingly require one or two context switches to run a different coroutine. In any case, a subset of concurrency related challenges start to appear. For the complete set of concurrency challenges to occur, the only feature missing is preemption. Indeed, concurrency challenges appear with non-determinism. Using mutual-exclusion or synchronisation are ways of limiting the lack of determinism in a system. A scheduler introduces order of execution uncertainty, while preemption introduces uncertainty about where context-switches occur. Now it is important to understand that uncertainty is not undesirable; uncertainty can often be used by systems to significantly increase performance and is often the basis of giving a user the illusion that tasks are running in parallel. Optimal performance in concurrent applications is often obtained by having as much non-determinism as correctness allows\cit.
    1014
    1115\section{\protect\CFA 's Thread Building Blocks}
    12 One of the important features that is missing in C is threading. On modern architectures, a lack of threading is becoming less and less forgivable\cite{Sutter05, Sutter05b}, and therefore modern programming languages must have the proper tools to allow users to write performant concurrent and/or parallel programs. As an extension of C, \CFA needs to express these concepts in a way that is as natural as possible to programmers used to imperative languages. And being a system-level language means programmers expect to choose precisely which features they need and which cost they are willing to pay.
     16One of the important features that is missing in C is threading. On modern architectures, a lack of threading is unacceptable\cite{Sutter05, Sutter05b}, and therefore modern programming languages must have the proper tools to allow users to write performant concurrent and/or parallel programs. As an extension of C, \CFA needs to express these concepts in a way that is as natural as possible to programmers familiar with imperative languages. And being a system-level language means programmers expect to choose precisely which features they need and which cost they are willing to pay.
    1317
    1418\section{Coroutines: A stepping stone}\label{coroutine}
    15 While the main focus of this proposal is concurrency and parallelism, as mentionned above it is important to adress coroutines, which are actually a significant underlying aspect of a concurrency system. Indeed, while having nothing to do with parallelism and arguably little to do with concurrency, coroutines need to deal with context-switchs and other context-management operations. Therefore, this proposal includes coroutines both as an intermediate step for the implementation of threads, and a first class feature of \CFA. Furthermore, many design challenges of threads are at least partially present in designing coroutines, which makes the design effort that much more relevant. The core API of coroutines revolve around two features: independent call stacks and \code{suspend}/\code{resume}.
    16 
    17 Here is an example of a solution to the fibonnaci problem using \CFA coroutines:
    18 \begin{cfacode}
    19         coroutine Fibonacci {
    20               int fn; // used for communication
    21         };
    22 
    23         void ?{}(Fibonacci & this) { // constructor
    24               this.fn = 0;
    25         }
    26 
    27         // main automacically called on first resume
    28         void main(Fibonacci & this) {
    29                 int fn1, fn2;           // retained between resumes
    30                 this.fn = 0;
    31                 fn1 = this.fn;
    32                 suspend(this);          // return to last resume
    33 
    34                 this.fn = 1;
     19While the main focus of this proposal is concurrency and parallelism, it is important to address coroutines, which are actually a significant building block of a concurrency system. Coroutines need to deal with context-switches and other context-management operations. Therefore, this proposal includes coroutines both as an intermediate step for the implementation of threads, and a first class feature of \CFA. Furthermore, many design challenges of threads are at least partially present in designing coroutines, which makes the design effort that much more relevant. The core \acrshort{api} of coroutines revolves around two features: independent call stacks and \code{suspend}/\code{resume}.
     20
     21A good example of a problem made easier with coroutines is generating the fibonacci sequence. This problem comes with the challenge of decoupling how a sequence is generated and how it is used. Figure \ref{fig:fibonacci-c} shows conventional approaches to writing generators in C. All three of these approaches suffer from strong coupling. The left and center approaches require that the generator have knowledge of how the sequence will be used, while the rightmost approach requires the user to hold internal state between calls on behalf of the sequence generator and makes it much harder to handle corner cases like the Fibonacci seed.
     22\begin{figure}
     23\label{fig:fibonacci-c}
     24\begin{center}
     25\begin{tabular}{c @{\hskip 0.025in}|@{\hskip 0.025in} c @{\hskip 0.025in}|@{\hskip 0.025in} c}
     26\begin{ccode}[tabsize=2]
     27//Using callbacks
     28void fibonacci_func(
     29        int n,
     30        void (*callback)(int)
     31) {
     32        int first = 0;
     33        int second = 1;
     34        int next, i;
     35        for(i = 0; i < n; i++)
     36        {
     37                if(i <= 1)
     38                        next = i;
     39                else {
     40                        next = f1 + f2;
     41                        f1 = f2;
     42                        f2 = next;
     43                }
     44                callback(next);
     45        }
     46}
     47\end{ccode}&\begin{ccode}[tabsize=2]
     48//Using output array
     49void fibonacci_array(
     50        int n,
     51        int * array
     52) {
     53        int f1 = 0; int f2 = 1;
     54        int next, i;
     55        for(i = 0; i < n; i++)
     56        {
     57                if(i <= 1)
     58                        next = i;
     59                else {
     60                        next = f1 + f2;
     61                        f1 = f2;
     62                        f2 = next;
     63                }
     64                *array = next;
     65                array++;
     66        }
     67}
     68\end{ccode}&\begin{ccode}[tabsize=2]
     69//Using external state
     70typedef struct {
     71        int f1, f2;
     72} iterator_t;
     73
     74int fibonacci_state(
     75        iterator_t * it
     76) {
     77        int f;
     78        f = it->f1 + it->f2;
     79        it->f2 = it->f1;
     80        it->f1 = f;
     81        return f;
     82}
     83
     84
     85
     86
     87
     88
     89\end{ccode}
     90\end{tabular}
     91\end{center}
     92\caption{Different implementations of a fibonacci sequence generator in C.}
     93\end{figure}
     94
     95
     96Figure \ref{fig:fibonacci-cfa} is an example of a solution to the fibonacci problem using \CFA coroutines, using the coroutine stack to hold sufficient state for the generation. This solution has the advantage of having very strong decoupling between how the sequence is generated and how it is used. Indeed, this version is as easy to use as the \code{fibonacci_state} solution, while the implementation is very similar to the \code{fibonacci_func} example.
     97
     98\begin{figure}
     99\label{fig:fibonacci-cfa}
     100\begin{cfacode}
     101coroutine Fibonacci {
     102        int fn; //used for communication
     103};
     104
     105void ?{}(Fibonacci & this) { //constructor
     106        this.fn = 0;
     107}
     108
     109//main automatically called on first resume
     110void main(Fibonacci & this) {
     111        int fn1, fn2;           //retained between resumes
     112        this.fn = 0;
     113        fn1 = this.fn;
     114        suspend(this);          //return to last resume
     115
     116        this.fn = 1;
     117        fn2 = fn1;
     118        fn1 = this.fn;
     119        suspend(this);          //return to last resume
     120
     121        for ( ;; ) {
     122                this.fn = fn1 + fn2;
    35123                fn2 = fn1;
    36124                fn1 = this.fn;
    37                 suspend(this);          // return to last resume
    38 
    39                 for ( ;; ) {
    40                         this.fn = fn1 + fn2;
    41                         fn2 = fn1;
    42                         fn1 = this.fn;
    43                         suspend(this);  // return to last resume
    44                 }
    45         }
    46 
    47         int next(Fibonacci & this) {
    48                 resume(this); // transfer to last suspend
    49                 return this.fn;
    50         }
    51 
    52         void main() { // regular program main
    53                 Fibonacci f1, f2;
    54                 for ( int i = 1; i <= 10; i += 1 ) {
    55                         sout | next( f1 ) | next( f2 ) | endl;
    56                 }
    57         }
    58 \end{cfacode}
     125                suspend(this);  //return to last resume
     126        }
     127}
     128
     129int next(Fibonacci & this) {
     130        resume(this); //transfer to last suspend
     131        return this.fn;
     132}
     133
     134void main() { //regular program main
     135        Fibonacci f1, f2;
     136        for ( int i = 1; i <= 10; i += 1 ) {
     137                sout | next( f1 ) | next( f2 ) | endl;
     138        }
     139}
     140\end{cfacode}
     141\caption{Implementation of fibonacci using coroutines}
     142\end{figure}
    59143
    60144\subsection{Construction}
    61 One important design challenge for coroutines and threads (shown in section \ref{threads}) is that the runtime system needs to run code after the user-constructor runs. In the case of coroutines, this challenge is simpler since there is no non-determinism from preemption or scheduling. However, the underlying challenge remains the same for coroutines and threads.
    62 
    63 The runtime system needs to create the coroutine's stack and more importantly prepare it for the first resumption. The timing of the creation is non-trivial since users both expect to have fully constructed objects once execution enters the coroutine main and to be able to resume the coroutine from the constructor. Like for regular objects, constructors can still leak coroutines before they are ready. There are several solutions to this problem but the chosen options effectively forces the design of the coroutine.
     145One important design challenge for coroutines and threads (shown in section \ref{threads}) is that the runtime system needs to run code after the user-constructor runs to connect the object into the system. In the case of coroutines, this challenge is simpler since there is no non-determinism from preemption or scheduling. However, the underlying challenge remains the same for coroutines and threads.
     146
     147The runtime system needs to create the coroutine's stack and more importantly prepare it for the first resumption. The timing of the creation is non-trivial since users both expect to have fully constructed objects once execution enters the coroutine main and to be able to resume the coroutine from the constructor. As with regular objects, constructors can leak coroutines before they are ready. There are several solutions to this problem but the chosen option effectively forces the design of the coroutine.
    64148
    65149Furthermore, \CFA faces an extra challenge as polymorphic routines create invisible thunks when casted to non-polymorphic routines and these thunks have function scope. For example, the following code, while looking benign, can run into undefined behaviour because of thunks:
     
    78162}
    79163\end{cfacode}
     164
    80165The generated C code\footnote{Code trimmed down for brevity} creates a local thunk to hold type information:
    81166
     
    95180}
    96181\end{ccode}
    97 The problem in this example is a race condition between the start of the execution of \code{noop} on the other thread and the stack frame of \code{bar} being destroyed. This extra challenge limits which solutions are viable because storing the function pointer for too long only increases the chances that the race will end in undefined behavior; i.e. the stack based thunk being destroyed before it was used. This challenge is an extension of challenges that come with second-class routines. Indeed, GCC nested routines also have the limitation that the routines cannot be passed outside of the scope of the functions these were declared in. The case of coroutines and threads is simply an extension of this problem to multiple call-stacks.
     182The problem in this example is a storage management issue, the function pointer \code{_thunk0} is only valid until the end of the block. This extra challenge limits which solutions are viable because storing the function pointer for too long causes undefined behavior; i.e. the stack based thunk being destroyed before it was used. This challenge is an extension of challenges that come with second-class routines. Indeed, GCC nested routines also have the limitation that the routines cannot be passed outside of the scope of the functions these were declared in. The case of coroutines and threads is simply an extension of this problem to multiple call-stacks.
    98183
    99184\subsection{Alternative: Composition}
    100 One solution to this challenge would be to use composition/containement,
    101 
    102 \begin{cfacode}
    103         struct Fibonacci {
    104               int fn; // used for communication
    105               coroutine c; //composition
    106         };
    107 
    108         void ?{}(Fibonacci & this) {
    109               this.fn = 0;
    110                 (this.c){};
    111         }
    112 \end{cfacode}
    113 There are two downsides to this approach. The first, which is relatively minor, is that the base class needs to be made aware of the main routine pointer, regardless of whether a parameter or a virtual pointer is used, this means the coroutine data must be made larger to store a value that is actually a compile time constant (address of the main routine). The second problem, which is both subtle and significant, is that now users can get the initialisation order of there coroutines wrong. Indeed, every field of a \CFA struct is constructed but in declaration order, unless users explicitly write otherwise. This semantics means that users who forget to initialize a the coroutine may resume the coroutine with an uninitilized object. For coroutines, this is unlikely to be a problem, for threads however, this is a significant problem.
     185One solution to this challenge is to use composition/containment, where users insert a coroutine field that contains the necessary information to manage the coroutine.
     186
     187\begin{cfacode}
     188struct Fibonacci {
     189        int fn; //used for communication
     190        coroutine c; //composition
     191};
     192
     193void ?{}(Fibonacci & this) {
     194        this.fn = 0;
     195        (this.c){}; //Call constructor to initialize coroutine
     196}
     197\end{cfacode}
     198There are two downsides to this approach. The first, which is relatively minor, is that the coroutine must be made aware of the main routine pointer. This information must either be stored in the coroutine runtime data or in its static type structure. When using composition, all coroutine handles have the same static type structure which means the pointer to the main needs to be part of the runtime data. This requirement means the coroutine data must be made larger to store a value that is actually a compile time constant (address of the main routine). The second problem, which is both subtle and significant, is that now users can get the initialisation order of coroutines wrong. Indeed, every field of a \CFA struct is constructed but in declaration order, unless users explicitly write otherwise. This semantics means that users who forget to initialize the coroutine handle may resume the coroutine with an uninitialized object. For coroutines, this is unlikely to be a problem, for threads however, this is a significant problem. Figure \ref{fig:fmt-line} shows the \code{Format} coroutine which rearranges text in order to group characters into blocks of fixed size. This is a good example where the control flow is made much simpler from being able to resume the coroutine from the constructor and highlights the idea that interesting control flow can occur in the constructor.
     199\begin{figure}
     200\label{fig:fmt-line}
     201\begin{cfacode}[tabsize=3]
     202//format characters into blocks of 4 and groups of 5 blocks per line
     203coroutine Format {
     204        char ch;                                                                        //used for communication
     205        int g, b;                                                               //global because used in destructor
     206};
     207
     208void  ?{}(Format & fmt) {
     209        resume( fmt );                                                  //prime (start) coroutine
     210}
     211
     212void ^?{}(Format & fmt) with fmt {
     213        if ( fmt.g != 0 || fmt.b != 0 )
     214        sout | endl;
     215}
     216
     217void main(Format & fmt) with fmt {
     218        for ( ;; ) {                                                    //for as many characters
     219                for(g = 0; g < 5; g++) {                //groups of 5 blocks
     220                        for(b = 0; b < 4; b++) {        //blocks of 4 characters
     221                                suspend();
     222                                sout | ch;                                      //print character
     223                        }
     224                        sout | "  ";                                    //print block separator
     225                }
     226                sout | endl;                                            //print group separator
     227        }
     228}
     229
     230void prt(Format & fmt, char ch) {
     231        fmt.ch = ch;
     232        resume(fmt);
     233}
     234
     235int main() {
     236        Format fmt;
     237        char ch;
     238        Eof: for ( ;; ) {                                               //read until end of file
     239                sin | ch;                                                       //read one character
     240                if(eof(sin)) break Eof;                 //eof ?
     241                prt(fmt, ch);                                           //push character for formatting
     242        }
     243}
     244\end{cfacode}
     245\caption{Formatting text into lines of 5 blocks of 4 characters.}
     246\end{figure}
     247
    114248
    115249\subsection{Alternative: Reserved keyword}
     
    117251
    118252\begin{cfacode}
    119         coroutine Fibonacci {
    120               int fn; // used for communication
    121         };
    122 \end{cfacode}
    123 This mean the compiler can solve problems by injecting code where needed. The downside of this approach is that it makes coroutine a special case in the language. Users who would want to extend coroutines or build their own for various reasons can only do so in ways offered by the language. Furthermore, implementing coroutines without language supports also displays the power of \CFA.
    124 While this is ultimately the option used for idiomatic \CFA code, coroutines and threads can both be constructed by users without using the language support. The reserved keywords are only present to improve ease of use for the common cases.
     253coroutine Fibonacci {
     254        int fn; //used for communication
     255};
     256\end{cfacode}
     257This means the compiler can solve problems by injecting code where needed. The downside of this approach is that it makes coroutine a special case in the language. Users who would want to extend coroutines or build their own for various reasons can only do so in ways offered by the language. Furthermore, implementing coroutines without language supports also displays the power of the programming language used. While this is ultimately the option used for idiomatic \CFA code, coroutines and threads can both be constructed by users without using the language support. The reserved keywords are only present to improve ease of use for the common cases.
    125258
    126259\subsection{Alternative: Lambda Objects}
     
    159292      coroutine_desc * get_coroutine(T & this);
    160293};
    161 \end{cfacode}
    162 This ensures an object is not a coroutine until \code{resume} (or \code{prime}) is called on the object. Correspondingly, any object that is passed to \code{resume} is a coroutine since it must satisfy the \code{is_coroutine} trait to compile. The advantage of this approach is that users can easily create different types of coroutines, for example, changing the memory foot print of a coroutine is trivial when implementing the \code{get_coroutine} routine. The \CFA keyword \code{coroutine} only has the effect of implementing the getter and forward declarations required for users to only have to implement the main routine.
     294
     295forall( dtype T | is_coroutine(T) ) void suspend(T &);
     296forall( dtype T | is_coroutine(T) ) void resume (T &);
     297\end{cfacode}
     298This ensures an object is not a coroutine until \code{resume} is called on the object. Correspondingly, any object that is passed to \code{resume} is a coroutine since it must satisfy the \code{is_coroutine} trait to compile. The advantage of this approach is that users can easily create different types of coroutines, for example, changing the memory layout of a coroutine is trivial when implementing the \code{get_coroutine} routine. The \CFA keyword \code{coroutine} only has the effect of implementing the getter and forward declarations required for users to only have to implement the main routine.
    163299
    164300\begin{center}
     
    186322\end{center}
    187323
    188 The combination of these two approaches allows users new to concurrency to have a easy and concise method while more advanced users can expose themselves to otherwise hidden pitfalls at the benefit of tighter control on memory layout and initialization.
     324The combination of these two approaches allows users new to coroutining and concurrency to have an easy and concise specification, while more advanced users have tighter control on memory layout and initialization.
    189325
    190326\section{Thread Interface}\label{threads}
     
    192328
    193329\begin{cfacode}
    194         thread foo {};
     330thread foo {};
    195331\end{cfacode}
    196332
     
    205341\end{cfacode}
    206342
    207 Obviously, for this thread implementation to be usefull it must run some user code. Several other threading interfaces use a function-pointer representation as the interface of threads (for example \Csharp~\cite{Csharp} and Scala~\cite{Scala}). However, this proposal considers that statically tying a \code{main} routine to a thread superseeds this approach. Since the \code{main} routine is already a special routine in \CFA (where the program begins), it is possible naturally extend the semantics using overloading to declare mains for different threads (the normal main being the main of the initial thread). As such the \code{main} routine of a thread can be defined as
    208 \begin{cfacode}
    209         thread foo {};
    210 
    211         void main(foo & this) {
    212                 sout | "Hello World!" | endl;
    213         }
    214 \end{cfacode}
    215 
    216 In this example, threads of type \code{foo} start execution in the \code{void main(foo*)} routine which prints \code{"Hello World!"}. While this proposoal encourages this approach to enforce strongly-typed programming, users may prefer to use the routine based thread semantics for the sake of simplicity. With these semantics it is trivial to write a thread type that takes a function pointer as parameter and executes it on its stack asynchronously
    217 \begin{cfacode}
    218         typedef void (*voidFunc)(void);
    219 
    220         thread FuncRunner {
    221                 voidFunc func;
    222         };
    223 
    224         //ctor
    225         void ?{}(FuncRunner & this, voidFunc inFunc) {
    226                 this.func = inFunc;
    227         }
    228 
    229         //main
    230         void main(FuncRunner & this) {
    231                 this.func();
    232         }
    233 \end{cfacode}
    234 
    235 An advantage of the overloading approach to main is to clearly highlight where and what memory is required to pass parameters and return values to/from a thread.
    236 
    237 Of course for threads to be useful, it must be possible to start and stop threads and wait for them to complete execution. While using an \acrshort{api} such as \code{fork} and \code{join} is relatively common in the literature, such an interface is unnecessary. Indeed, the simplest approach is to use \acrshort{raii} principles and have threads \code{fork} once the constructor has completed and \code{join} before the destructor runs.
     343Obviously, for this thread implementation to be useful it must run some user code. Several other threading interfaces use a function-pointer representation as the interface of threads (for example \Csharp~\cite{Csharp} and Scala~\cite{Scala}). However, this proposal considers that statically tying a \code{main} routine to a thread supersedes this approach. Since the \code{main} routine is already a special routine in \CFA (where the program begins), it is a natural extension of the semantics using overloading to declare mains for different threads (the normal main being the main of the initial thread). As such the \code{main} routine of a thread can be defined as
     344\begin{cfacode}
     345thread foo {};
     346
     347void main(foo & this) {
     348        sout | "Hello World!" | endl;
     349}
     350\end{cfacode}
     351
     352In this example, threads of type \code{foo} start execution in the \code{void main(foo &)} routine, which prints \code{"Hello World!"}. While this thesis encourages this approach to enforce strongly-typed programming, users may prefer to use the routine-based thread semantics for the sake of simplicity. With these semantics it is trivial to write a thread type that takes a function pointer as a parameter and executes it on its stack asynchronously
     353\begin{cfacode}
     354typedef void (*voidFunc)(int);
     355
     356thread FuncRunner {
     357        voidFunc func;
     358        int arg;
     359};
     360
     361void ?{}(FuncRunner & this, voidFunc inFunc, int arg) {
     362        this.func = inFunc;
     363}
     364
     365void main(FuncRunner & this) {
     366        this.func( this.arg );
     367}
     368\end{cfacode}
     369
     370A consequence of the strongly typed approach to main is that the memory layout of parameters and return values to/from a thread is now explicitly specified in the \acrshort{api}.
     371
     372Of course for threads to be useful, it must be possible to start and stop threads and wait for them to complete execution. While using an \acrshort{api} such as \code{fork} and \code{join} is relatively common in the literature, such an interface is unnecessary. Indeed, the simplest approach is to use \acrshort{raii} principles and have threads \code{fork} after the constructor has completed and \code{join} before the destructor runs.
    238373\begin{cfacode}
    239374thread World;
     
    254389\end{cfacode}
    255390
    256 This semantic has several advantages over explicit semantics typesafety is guaranteed, a thread is always started and stopped exaclty once and users cannot make any progamming errors. Another advantage of this semantic is that it naturally scale to multiple threads meaning basic synchronisation is very simple
     391This semantic has several advantages over explicit semantics: a thread is always started and stopped exactly once and users cannot make any programming errors, and it naturally scales to multiple threads, meaning basic synchronisation is very simple
    257392
    258393\begin{cfacode}
     
    276411\end{cfacode}
    277412
    278 However, one of the apparent drawbacks of this system is that threads now always form a lattice, that is they are always destroyed in opposite order of construction because of block structure. However, storage allocation is not limited to blocks; dynamic allocation can create threads that outlive the scope in which the thread is created much like dynamically allocating memory lets objects outlive the scope in which they are created
     413However, one of the drawbacks of this approach is that threads now always form a lattice, that is, they are always destroyed in opposite order of construction because of block structure. This restriction is relaxed by using dynamic allocation, so threads can outlive the scope in which they are created, much like dynamically allocating memory lets objects outlive the scope in which they are created
    279414
    280415\begin{cfacode}
     
    283418};
    284419
    285 //main
    286420void main(MyThread & this) {
    287421        //...
     
    291425        MyThread * long_lived;
    292426        {
     427                //Start a thread at the beginning of the scope
    293428                MyThread short_lived;
    294                 //Start a thread at the beginning of the scope
    295 
    296                 DoStuff();
    297429
    298430                //create another thread that will outlive the thread in this scope
    299431                long_lived = new MyThread;
    300432
     433                DoStuff();
     434
    301435                //Wait for the thread short_lived to finish
    302436        }
    303437        DoMoreStuff();
    304438
    305         //Now wait for the short_lived to finish
     439        //Now wait for the long_lived to finish
    306440        delete long_lived;
    307441}
  • doc/proposals/concurrency/text/cforall.tex

    rb96ec83 r6840e7c  
    55% ======================================================================
    66
    7 As mentionned in the introduction, the document presents the design for the concurrency features in \CFA. Since it is a new language here is a quick review of the language specifically tailored to the features needed to support concurrency.
     7This thesis presents the design for a set of concurrency features in \CFA. Since it is a new dialect of C, the following is a quick introduction to the language, specifically tailored to the features needed to support concurrency.
    88
    9 \CFA is a extension of ISO C and therefore supports much of the same paradigms as C. It is a non-object oriented system level language, meaning it has very most of the major abstractions have either no runtime cost or can be opt-out easily. Like C, the basics of \CFA revolve around structures and routines, which are thin abstractions over assembly. The vast majority of the code produced by a \CFA compiler respects memory-layouts and calling-conventions laid out by C. However, while \CFA is not an object-oriented language according to a strict definition. It does have some notion of objects, most importantly construction and destruction of objects. Most of the following pieces of code can be found as is on the \CFA website : \cite{www-cfa}
     9\CFA is an extension of ISO-C and therefore supports all of the same paradigms as C. It is a non-object oriented system language, meaning most of the major abstractions have either no runtime overhead or can be opted out of easily. Like C, the basics of \CFA revolve around structures and routines, which are thin abstractions over machine code. The vast majority of the code produced by the \CFA translator respects memory-layouts and calling-conventions laid out by C. Interestingly, while \CFA is not an object-oriented language, lacking the concept of a receiver (e.g.: this), it does have some notion of objects\footnote{C defines the term objects as : [Where do I get the C11 reference manual?]}, most importantly construction and destruction of objects. Most of the following pieces of code can be found on the \CFA website \cite{www-cfa}
    1010
    1111\section{References}
    1212
    13 Like \CC, \CFA introduces references as an alternative to pointers. In regards to concurrency, the semantics difference between pointers and references aren't particularly relevant but since this document uses mostly references here is a quick overview of the semantics :
     13Like \CC, \CFA introduces references as an alternative to pointers. In regards to concurrency, the semantic differences between pointers and references are not particularly relevant, but since this document mostly uses references, here is a quick overview of the semantics :
    1414\begin{cfacode}
    1515int x, *p1 = &x, **p2 = &p1, ***p3 = &p2,
    1616&r1 = x,    &&r2 = r1,   &&&r3 = r2;
    17 ***p3 = 3;                                      // change x
    18 r3 = 3;                                         // change x, ***r3
    19 **p3 = ...;                                     // change p1
    20 &r3 = ...;                                      // change r1, (&*)**r3
    21 *p3 = ...;                                      // change p2
    22 &&r3 = ...;                                     // change r2, (&(&*)*)*r3
    23 &&&r3 = p3;                                     // change r3 to p3, (&(&(&*)*)*)r3
    24 int y, z, & ar[3] = { x, y, z };                // initialize array of references
    25 &ar[1] = &z;                                    // change reference array element
    26 typeof( ar[1] ) p;                              // is int, i.e., the type of referenced object
    27 typeof( &ar[1] ) q;                             // is int &, i.e., the type of reference
    28 sizeof( ar[1] ) == sizeof( int );               // is true, i.e., the size of referenced object
    29 sizeof( &ar[1] ) == sizeof( int *);             // is true, i.e., the size of a reference
     17***p3 = 3;                                                      //change x
     18r3    = 3;                                                      //change x, ***r3
     19**p3  = ...;                                            //change p1
     20*p3   = ...;                                            //change p2
     21int y, z, & ar[3] = {x, y, z};          //initialize array of references
     22typeof( ar[1]) p;                                       //is int, i.e., the type of referenced object
     23typeof(&ar[1]) q;                                       //is int &, i.e., the type of reference
     24sizeof( ar[1]) == sizeof(int);          //is true, i.e., the size of referenced object
     25sizeof(&ar[1]) == sizeof(int *);        //is true, i.e., the size of a reference
    3026\end{cfacode}
    3127The important thing to take away from this code snippet is that references offer a handle to an object much like pointers but which is automatically dereferenced when convenient.
     
    3329\section{Overloading}
    3430
    35 Another important feature \CFA has in common with \CC is function overloading :
     31Another important feature of \CFA is function overloading as in Java and \CC, where routines with the same name are selected based on the number and types of the arguments. As well, \CFA uses the return type as part of the selection criteria, as in Ada\cite{Ada}. For routines with multiple parameters and returns, the selection is complex.
    3632\begin{cfacode}
    37 // selection based on type and number of parameters
    38 void f( void );                                 // (1)
    39 void f( char );                                 // (2)
    40 void f( int, double );                          // (3)
    41 f();                                            // select (1)
    42 f( 'a' );                                       // select (2)
    43 f( 3, 5.2 );                                    // select (3)
     33//selection based on type and number of parameters
     34void f(void);                   //(1)
     35void f(char);                   //(2)
     36void f(int, double);    //(3)
     37f();                                    //select (1)
     38f('a');                                 //select (2)
     39f(3, 5.2);                              //select (3)
    4440
    45 // selection based on  type and number of returns
    46 char f( int );                                  // (1)
    47 double f( int );                                // (2)
    48 [ int, double ] f( int );                       // (3)
    49 char c = f( 3 );                                // select (1)
    50 double d = f( 4 );                              // select (2)
    51 [ int, double ] t = f( 5 );                     // select (3)
     41//selection based on  type and number of returns
     42char   f(int);                  //(1)
     43double f(int);                  //(2)
     44char   c = f(3);                //select (1)
     45double d = f(4);                //select (2)
    5246\end{cfacode}
    53 This feature is particularly important for concurrency since the runtime system relies on creating different types do represent concurrency objects. Therefore, overloading is necessary to prevent the need for long prefixes and other naming conventions that prevent clashes. As seen in chapter \ref{basics}, the main is an example of routine that benefits from overloading when concurrency in introduced.
     47This feature is particularly important for concurrency since the runtime system relies on creating different types to represent concurrency objects. Therefore, overloading is necessary to prevent the need for long prefixes and other naming conventions that prevent name clashes. As seen in chapter \ref{basics}, routine main is an example that benefits from overloading.
    5448
    5549\section{Operators}
    5650Overloading also extends to operators. The syntax for denoting operator-overloading is to name a routine with the symbol of the operator and question marks where the arguments of the operation would be, like so :
    5751\begin{cfacode}
    58 int ++?( int op );                              // unary prefix increment
    59 int ?++( int op );                              // unary postfix increment
    60 int ?+?( int op1, int op2 );                    // binary plus
    61 int ?<=?( int op1, int op2 );                   // binary less than
    62 int ?=?( int & op1, int op2 );                  // binary assignment
    63 int ?+=?( int & op1, int op2 );                 // binary plus-assignment
     52int ++? (int op);                       //unary prefix increment
     53int ?++ (int op);                       //unary postfix increment
     54int ?+? (int op1, int op2);             //binary plus
     55int ?<=?(int op1, int op2);             //binary less than
     56int ?=? (int & op1, int op2);           //binary assignment
     57int ?+=?(int & op1, int op2);           //binary plus-assignment
    6458
    65 struct S { int i, j; };
    66 S ?+?( S op1, S op2 ) {                         // add two structures
    67         return (S){ op1.i + op2.i, op1.j + op2.j };
     59struct S {int i, j;};
     60S ?+?(S op1, S op2) {                           //add two structures
     61        return (S){op1.i + op2.i, op1.j + op2.j};
    6862}
    69 S s1 = { 1, 2 }, s2 = { 2, 3 }, s3;
    70 s3 = s1 + s2;                                   // compute sum: s3 == { 2, 5 }
     63S s1 = {1, 2}, s2 = {2, 3}, s3;
     64s3 = s1 + s2;                                           //compute sum: s3 == {2, 5}
    7165\end{cfacode}
    72 
    73 Since concurrency does not use operator overloading, this feature is more important as an introduction for the syntax of constructors.
     66While concurrency does not use operator overloading directly, this feature is more important as an introduction for the syntax of constructors.
    7467
    7568\section{Constructors/Destructors}
    76 \CFA uses the following syntax for constructors and destructors :
     69Object life-time is often a challenge in concurrency. \CFA uses the approach of giving concurrent meaning to object life-time as a means of synchronization and/or mutual exclusion. Since \CFA relies heavily on the lifetime of objects, constructors and destructors are a core feature required for concurrency and parallelism. \CFA uses the following syntax for constructors and destructors :
    7770\begin{cfacode}
    7871struct S {
     
    8073        int * ia;
    8174};
    82 void ?{}( S & s, int asize ) with s {           // constructor operator
    83         size = asize;                           // initialize fields
    84         ia = calloc( size, sizeof( S ) );
     75void ?{}(S & s, int asize) {    //constructor operator
     76        s.size = asize;                         //initialize fields
     77        s.ia = calloc(size, sizeof(S));
    8578}
    86 void ^?{}( S & s ) with s {                     // destructor operator
    87         free( ia );                             // de-initialization fields
     79void ^?{}(S & s) {                              //destructor operator
     80        free(ia);                                       //de-initialization fields
    8881}
    8982int main() {
    90         S x = { 10 }, y = { 100 };              // implict calls: ?{}( x, 10 ), ?{}( y, 100 )
    91         ...                                     // use x and y
    92         ^x{};  ^y{};                            // explicit calls to de-initialize
    93         x{ 20 };  y{ 200 };                     // explicit calls to reinitialize
    94         ...                                     // reuse x and y
    95 }                                               // implict calls: ^?{}( y ), ^?{}( x )
     83        S x = {10}, y = {100};          //implict calls: ?{}(x, 10), ?{}(y, 100)
     84        ...                                                     //use x and y
     85        ^x{};  ^y{};                            //explicit calls to de-initialize
     86        x{20};  y{200};                         //explicit calls to reinitialize
     87        ...                                                     //reuse x and y
     88}                                                               //implict calls: ^?{}(y), ^?{}(x)
    9689\end{cfacode}
    97 The language guarantees that every object and all their fields are constructed. Like \CC construction is automatically done on declaration and destruction done when the declared variables reach the end of its scope.
     90The language guarantees that every object and all their fields are constructed. Like \CC, construction of an object is automatically done on allocation and destruction of the object is done on deallocation. Allocation and deallocation can occur on the stack or on the heap.
     91\begin{cfacode}
     92{
     93        struct S s = {10};      //allocation, call constructor
     94        ...
     95}                                               //deallocation, call destructor
     96struct S * s = new();   //allocation, call constructor
     97...
     98delete(s);                              //deallocation, call destructor
     99\end{cfacode}
     100Note that like \CC, \CFA introduces \code{new} and \code{delete}, which behave like \code{malloc} and \code{free} in addition to constructing and destructing objects, after calling \code{malloc} and before calling \code{free} respectively.
    98101
    99 For more information see \cite{cforall-ug,rob-thesis,www-cfa}.
     102\section{Parametric Polymorphism}
     103Routines in \CFA can also be reused for multiple types. This is done using the \code{forall} clause which gives \CFA its name. \code{forall} clauses allow separately compiled routines to support generic usage over multiple types. For example, the following sum function will work for any type that supports construction from 0 and addition :
     104\begin{cfacode}
     105//constraint type, 0 and +
     106forall(otype T | { void ?{}(T *, zero_t); T ?+?(T, T); })
     107T sum(T a[ ], size_t size) {
     108        T total = 0;                            //construct T from 0
     109        for(size_t i = 0; i < size; i++)
     110                total = total + a[i];   //select appropriate +
     111        return total;
     112}
     113
     114S sa[5];
     115int i = sum(sa, 5);                             //use S's 0 construction and +
     116\end{cfacode}
     117
     118Since writing constraints on types can become cumbersome for more constrained functions, \CFA also has the concept of traits. Traits are named collections of constraints that can be used instead of or in addition to regular constraints:
     119\begin{cfacode}
     120trait sumable( otype T ) {
     121        void ?{}(T *, zero_t);          //constructor from 0 literal
     122        T ?+?(T, T);                            //assortment of additions
     123        T ?+=?(T *, T);
     124        T ++?(T *);
     125        T ?++(T *);
     126};
     127forall( otype T | sumable(T) )  //use trait
     128T sum(T a[], size_t size);
     129\end{cfacode}
     130
     131\section{with Clause/Statement}
     132Since \CFA lacks the concept of a receiver, certain functions end up needing to repeat variable names often; to solve this, \CFA offers the \code{with} statement, which opens an aggregate scope making its fields directly accessible (like Pascal).
     133\begin{cfacode}
     134struct S { int i, j; };
     135int mem(S & this) with this             //with clause
     136        i = 1;                                          //this->i
     137        j = 2;                                          //this->j
     138}
     139int foo() {
     140        struct S1 { ... } s1;
     141        struct S2 { ... } s2;
     142        with s1                                         //with statement
     143        {
     144                //access fields of s1
     145                //without qualification
     146                with s2                                 //nesting
     147                {
     148                        //access fields of s1 and s2
     149                        //without qualification
     150                }
     151        }
     152        with s1, s2                             //scopes open in parallel
     153        {
     154                //access fields of s1 and s2
     155                //without qualification
     156        }
     157}
     158\end{cfacode}
     159
     160For more information on \CFA see \cite{cforall-ug,rob-thesis,www-cfa}.
  • doc/proposals/concurrency/text/concurrency.tex

    rb96ec83 r6840e7c  
    44% ======================================================================
    55% ======================================================================
    6 Several tool can be used to solve concurrency challenges. Since many of these challenges appear with the use of mutable shared-state, some languages and libraries simply disallow mutable shared-state (Erlang~\cite{Erlang}, Haskell~\cite{Haskell}, Akka (Scala)~\cite{Akka}). In these paradigms, interaction among concurrent objects relies on message passing~\cite{Thoth,Harmony,V-Kernel} or other paradigms that closely relate to networking concepts (channels\cit for example). However, in languages that use routine calls as their core abstraction-mechanism, these approaches force a clear distinction between concurrent and non-concurrent paradigms (i.e., message passing versus routine call). This distinction in turn means that, in order to be effective, programmers need to learn two sets of designs patterns. While this distinction can be hidden away in library code, effective use of the librairy still has to take both paradigms into account.
     6Several tools can be used to solve concurrency challenges. Since many of these challenges appear with the use of mutable shared-state, some languages and libraries simply disallow mutable shared-state (Erlang~\cite{Erlang}, Haskell~\cite{Haskell}, Akka (Scala)~\cite{Akka}). In these paradigms, interaction among concurrent objects relies on message passing~\cite{Thoth,Harmony,V-Kernel} or other paradigms closely related to networking concepts (channels\cit for example). However, in languages that use routine calls as their core abstraction-mechanism, these approaches force a clear distinction between concurrent and non-concurrent paradigms (i.e., message passing versus routine call). This distinction in turn means that, in order to be effective, programmers need to learn two sets of design patterns. While this distinction can be hidden away in library code, effective use of the library still has to take both paradigms into account.
    77
    88Approaches based on shared memory are more closely related to non-concurrent paradigms since they often rely on basic constructs like routine calls and shared objects. At the lowest level, concurrent paradigms are implemented as atomic operations and locks. Many such mechanisms have been proposed, including semaphores~\cite{Dijkstra68b} and path expressions~\cite{Campbell74}. However, for productivity reasons it is desireable to have a higher-level construct be the core concurrency paradigm~\cite{HPP:Study}.
    99
    10 An approach that is worth mentionning because it is gaining in popularity is transactionnal memory~\cite{Dice10}[Check citation]. While this approach is even pursued by system languages like \CC\cit, the performance and feature set is currently too restrictive to be the main concurrency paradigm for general purpose language, which is why it was rejected as the core paradigm for concurrency in \CFA.
    11 
    12 One of the most natural, elegant, and efficient mechanisms for synchronization and communication, especially for shared memory systems, is the \emph{monitor}. Monitors were first proposed by Brinch Hansen~\cite{Hansen73} and later described and extended by C.A.R.~Hoare~\cite{Hoare74}. Many programming languages---e.g., Concurrent Pascal~\cite{ConcurrentPascal}, Mesa~\cite{Mesa}, Modula~\cite{Modula-2}, Turing~\cite{Turing:old}, Modula-3~\cite{Modula-3}, NeWS~\cite{NeWS}, Emerald~\cite{Emerald}, \uC~\cite{Buhr92a} and Java~\cite{Java}---provide monitors as explicit language constructs. In addition, operating-system kernels and device drivers have a monitor-like structure, although they often use lower-level primitives such as semaphores or locks to simulate monitors. For these reasons, this project proposes monitors as the core concurrency-construct.
     10An approach that is worth mentioning because it is gaining in popularity is transactional memory~\cite{Dice10}[Check citation]. While this approach is even pursued by system languages like \CC\cit, the performance and feature set is currently too restrictive to be the main concurrency paradigm for systems languages, which is why it was rejected as the core paradigm for concurrency in \CFA.
     11
     12One of the most natural, elegant, and efficient mechanisms for synchronization and communication, especially for shared-memory systems, is the \emph{monitor}. Monitors were first proposed by Brinch Hansen~\cite{Hansen73} and later described and extended by C.A.R.~Hoare~\cite{Hoare74}. Many programming languages---e.g., Concurrent Pascal~\cite{ConcurrentPascal}, Mesa~\cite{Mesa}, Modula~\cite{Modula-2}, Turing~\cite{Turing:old}, Modula-3~\cite{Modula-3}, NeWS~\cite{NeWS}, Emerald~\cite{Emerald}, \uC~\cite{Buhr92a} and Java~\cite{Java}---provide monitors as explicit language constructs. In addition, operating-system kernels and device drivers have a monitor-like structure, although they often use lower-level primitives such as semaphores or locks to simulate monitors. For these reasons, this project proposes monitors as the core concurrency-construct.
    1313
    1414\section{Basics}
    15 Non-determinism requires concurrent systems to offer support for mutual-exclusion and synchronisation. Mutual-exclusion is the concept that only a fixed number of threads can access a critical section at any given time, where a critical section is a group of instructions on an associated portion of data that requires the restricted access. On the other hand, synchronization enforces relative ordering of execution and synchronization tools numerous mechanisms to establish timing relationships among threads.
     15Non-determinism requires concurrent systems to offer support for mutual-exclusion and synchronisation. Mutual-exclusion is the concept that only a fixed number of threads can access a critical section at any given time, where a critical section is a group of instructions on an associated portion of data that requires the restricted access. On the other hand, synchronization enforces relative ordering of execution and synchronization tools provide numerous mechanisms to establish timing relationships among threads.
    1616
    1717\subsection{Mutual-Exclusion}
    18 As mentionned above, mutual-exclusion is the guarantee that only a fix number of threads can enter a critical section at once. However, many solution exists for mutual exclusion which vary in terms of performance, flexibility and ease of use. Methods range from low-level locks, which are fast and flexible but require significant attention to be correct, to  higher-level mutual-exclusion methods, which sacrifice some performance in order to improve ease of use. Ease of use comes by either guaranteeing some problems cannot occur (e.g., being deadlock free) or by offering a more explicit coupling between data and corresponding critical section. For example, the \CC \code{std::atomic<T>} which offer an easy way to express mutual-exclusion on a restricted set of operations (.e.g: reading/writing large types atomically). Another challenge with low-level locks is composability. Locks are not composable because it takes careful organising for multiple locks to be used while preventing deadlocks. Easing composability is another feature higher-level mutual-exclusion mechanisms often offer.
     18As mentioned above, mutual-exclusion is the guarantee that only a fixed number of threads can enter a critical section at once. However, many solutions exist for mutual exclusion, which vary in terms of performance, flexibility and ease of use. Methods range from low-level locks, which are fast and flexible but require significant attention to be correct, to  higher-level mutual-exclusion methods, which sacrifice some performance in order to improve ease of use. Ease of use comes by either guaranteeing some problems cannot occur (e.g., being deadlock free) or by offering a more explicit coupling between data and corresponding critical section. For example, the \CC \code{std::atomic<T>} offers an easy way to express mutual-exclusion on a restricted set of operations (e.g.: reading/writing large types atomically). Another challenge with low-level locks is composability. Locks have restricted composability because it takes careful organising for multiple locks to be used while preventing deadlocks. Easing composability is another feature higher-level mutual-exclusion mechanisms often offer.
    1919
    2020\subsection{Synchronization}
    21 As for mutual-exclusion, low level synchronisation primitive often offer good performance and good flexibility at the cost of ease of use. Again, higher-level mechanism often simplify usage by adding better coupling between synchronization and data, .eg., message passing, or offering simple solution to otherwise involved challenges. An example of this is barging. As mentionned above synchronization can be expressed as guaranteeing that event \textit{X} always happens before \textit{Y}. Most of the time synchronisation happens around a critical section, where threads most acquire said critical section in a certain order. However, it may also be desired to be able to guarantee that event \textit{Z} does not occur between \textit{X} and \textit{Y}. This is called barging, where event \textit{X} tries to effect event \textit{Y} but anoter thread races to grab the critical section and emits \textit{Z} before \textit{Y}. Preventing or detecting barging is an involved challenge with low-level locks, which can be made much easier by higher-level constructs.
     21As for mutual-exclusion, low-level synchronisation primitives often offer good performance and good flexibility at the cost of ease of use. Again, higher-level mechanisms often simplify usage by adding better coupling between synchronization and data, e.g.: message passing, or offering simple solutions to otherwise involved challenges. An example is barging. As mentioned above, synchronization can be expressed as guaranteeing that event \textit{X} always happens before \textit{Y}. Most of the time, synchronisation happens around a critical section, where threads must acquire critical sections in a certain order. However, it may also be desirable to guarantee that event \textit{Z} does not occur between \textit{X} and \textit{Y}. Not satisfying this property is called barging. For example, where event \textit{X} tries to effect event \textit{Y} but another thread acquires the critical section and emits \textit{Z} before \textit{Y}. Preventing or detecting barging is an involved challenge with low-level locks, which can be made much easier by higher-level constructs. This challenge is often split into two different methods, barging avoidance and barging prevention. Algorithms that use status flags and other flag variables to detect barging threads are said to be using barging avoidance while algorithms that baton-pass locks between threads instead of releasing the locks are said to be using barging prevention.
    2222
    2323% ======================================================================
     
    2828A monitor is a set of routines that ensure mutual exclusion when accessing shared state. This concept is generally associated with Object-Oriented Languages like Java~\cite{Java} or \uC~\cite{uC++book} but does not strictly require OO semantics. The only requirement is the ability to declare a handle to a shared object and a set of routines that act on it:
    2929\begin{cfacode}
    30         typedef /*some monitor type*/ monitor;
    31         int f(monitor & m);
    32 
    33         int main() {
    34                 monitor m;  //Handle m
    35                 f(m);       //Routine using handle
    36         }
     30typedef /*some monitor type*/ monitor;
     31int f(monitor & m);
     32
     33int main() {
     34        monitor m;  //Handle m
     35        f(m);       //Routine using handle
     36}
    3737\end{cfacode}
    3838
     
    4747
    4848\begin{cfacode}
    49         monitor counter_t { /*...see section $\ref{data}$...*/ };
    50 
    51         void ?{}(counter_t & nomutex this); //constructor
    52         size_t ++?(counter_t & mutex this); //increment
    53 
    54         //need for mutex is platform dependent
    55         void ?{}(size_t * this, counter_t & mutex cnt); //conversion
    56 \end{cfacode}
    57 
    58 Here, the constructor(\code{?\{\}}) uses the \code{nomutex} keyword to signify that it does not acquire the monitor mutual-exclusion when constructing. This semantics is because an object not yet constructed should never be shared and therefore does not require mutual exclusion. The prefix increment operator uses \code{mutex} to protect the incrementing process from race conditions. Finally, there is a conversion operator from \code{counter_t} to \code{size_t}. This conversion may or may not require the \code{mutex} keyword depending on whether or not reading an \code{size_t} is an atomic operation.
    59 
    60 Having both \code{mutex} and \code{nomutex} keywords is redundant based on the meaning of a routine having neither of these keywords. For example, given a routine without qualifiers \code{void foo(counter_t & this)}, then it is reasonable that it should default to the safest option \code{mutex}, whereas assuming \code{nomutex} is unsafe and may cause subtle errors. In fact, \code{nomutex} is the "normal" parameter behaviour, with the \code{nomutex} keyword effectively stating explicitly that "this routine is not special". Another alternative is to make having exactly one of these keywords mandatory, which would provide the same semantics but without the ambiguity of supporting routines neither keyword. Mandatory keywords would also have the added benefit of being self-documented but at the cost of extra typing. While there are several benefits to mandatory keywords, they do bring a few challenges. Mandatory keywords in \CFA would imply that the compiler must know without a doubt wheter or not a parameter is a monitor or not. Since \CFA relies heavily on traits as an abstraction mechanism, the distinction between a type that is a monitor and a type that looks like a monitor can become blurred. For this reason, \CFA only has the \code{mutex} keyword.
    61 
    62 
    63 The next semantic decision is to establish when \code{mutex} may be used as a type qualifier. Consider the following declarations:
    64 \begin{cfacode}
    65 int f1(monitor & mutex m);
    66 int f2(const monitor & mutex m);
    67 int f3(monitor ** mutex m);
    68 int f4(monitor * mutex m []);
    69 int f5(graph(monitor*) & mutex m);
    70 \end{cfacode}
    71 The problem is to indentify which object(s) should be acquired. Furthermore, each object needs to be acquired only once. In the case of simple routines like \code{f1} and \code{f2} it is easy to identify an exhaustive list of objects to acquire on entry. Adding indirections (\code{f3}) still allows the compiler and programmer to indentify which object is acquired. However, adding in arrays (\code{f4}) makes it much harder. Array lengths are not necessarily known in C, and even then making sure objects are only acquired once becomes none-trivial. This can be extended to absurd limits like \code{f5}, which uses a graph of monitors. To keep everyone as sane as possible~\cite{Chicken}, this projects imposes the requirement that a routine may only acquire one monitor per parameter and it must be the type of the parameter with one level of indirection (ignoring potential qualifiers). Also note that while routine \code{f3} can be supported, meaning that monitor \code{**m} is be acquired, passing an array to this routine would be type safe and yet result in undefined behavior because only the first element of the array is acquired. This is specially true for non-copyable objects like monitors, where an array of pointers is simplest way to express a group of monitors. However, this ambiguity is part of the C type-system with respects to arrays. For this reason, \code{mutex} is disallowed in the context where arrays may be passed:
    72 
    73 \begin{cfacode}
    74 int f1(monitor & mutex m);   //Okay : recommanded case
    75 int f2(monitor * mutex m);   //Okay : could be an array but probably not
    76 int f3(monitor mutex m []);  //Not Okay : Array of unkown length
    77 int f4(monitor ** mutex m);  //Not Okay : Could be an array
    78 int f5(monitor * mutex m []); //Not Okay : Array of unkown length
    79 \end{cfacode}
    80 
    81 Unlike object-oriented monitors, where calling a mutex member \emph{implicitly} acquires mutual-exclusion, \CFA uses an explicit mechanism to acquire mutual-exclusion. A consequence of this approach is that it extends naturally to multi-monitor calls.
    82 \begin{cfacode}
    83 int f(MonitorA & mutex a, MonitorB & mutex b);
    84 
    85 MonitorA a;
    86 MonitorB b;
    87 f(a,b);
    88 \end{cfacode}
    89 The capacity to acquire multiple locks before entering a critical section is called \emph{\gls{group-acquire}}. In practice, writing multi-locking routines that do not lead to deadlocks is tricky. Having language support for such a feature is therefore a significant asset for \CFA. In the case presented above, \CFA guarantees that the order of aquisition is consistent across calls to routines using the same monitors as arguments. However, since \CFA monitors use multi-acquisition locks, users can effectively force the acquiring order. For example, notice which routines use \code{mutex}/\code{nomutex} and how this affects aquiring order:
    90 \begin{cfacode}
    91         void foo(A & mutex a, B & mutex b) { //acquire a & b
    92                 ...
    93         }
    94 
    95         void bar(A & mutex a, B & /*nomutex*/ b) { //acquire a
    96                 ... foo(a, b); ... //acquire b
    97         }
    98 
    99         void baz(A & /*nomutex*/ a, B & mutex b) { //acquire b
    100                 ... foo(a, b); ... //acquire a
    101         }
    102 \end{cfacode}
    103 The multi-acquisition monitor lock allows a monitor lock to be acquired by both \code{bar} or \code{baz} and acquired again in \code{foo}. In the calls to \code{bar} and \code{baz} the monitors are acquired in opposite order.
    104 
    105 However, such use leads the lock acquiring order problem. In the example above, the user uses implicit ordering in the case of function \code{foo} but explicit ordering in the case of \code{bar} and \code{baz}. This subtle mistake means that calling these routines concurrently may lead to deadlock and is therefore undefined behavior. As shown on several occasion\cit, solving this problem requires:
    106 \begin{enumerate}
    107         \item Dynamically tracking of the monitor-call order.
    108         \item Implement rollback semantics.
    109 \end{enumerate}
    110 While the first requirement is already a significant constraint on the system, implementing a general rollback semantics in a C-like language is prohibitively complex \cit. In \CFA, users simply need to be carefull when acquiring multiple monitors at the same time.
    111 
    112 Finally, for convenience, monitors support multiple acquiring, that is acquiring a monitor while already holding it does not cause a deadlock. It simply increments an internal counter which is then used to release the monitor after the number of acquires and releases match up. This is particularly usefull when monitor routines use other monitor routines as helpers or for recursions. For example:
    113 \begin{cfacode}
    114 monitor bank {
    115         int money;
    116         log_t usr_log;
    117 };
    118 
    119 void deposit( bank & mutex b, int deposit ) {
    120         b.money += deposit;
    121         b.usr_log | "Adding" | deposit | endl;
    122 }
    123 
    124 void transfer( bank & mutex mybank, bank & mutex yourbank, int me2you) {
    125         deposit( mybank, -me2you );
    126         deposit( yourbank, me2you );
    127 }
    128 \end{cfacode}
    129 
    130 % ======================================================================
    131 % ======================================================================
    132 \subsection{Data semantics} \label{data}
    133 % ======================================================================
    134 % ======================================================================
    135 Once the call semantics are established, the next step is to establish data semantics. Indeed, until now a monitor is used simply as a generic handle but in most cases monitors contain shared data. This data should be intrinsic to the monitor declaration to prevent any accidental use of data without its appropriate protection. For example, here is a complete version of the counter showed in section \ref{call}:
    136 \begin{cfacode}
    137 monitor counter_t {
    138         int value;
    139 };
    140 
    141 void ?{}(counter_t & this) {
    142         this.cnt = 0;
    143 }
    144 
    145 int ?++(counter_t & mutex this) {
    146         return ++this.value;
    147 }
    148 
    149 //need for mutex is platform dependent here
    150 void ?{}(int * this, counter_t & mutex cnt) {
    151         *this = (int)cnt;
    152 }
    153 \end{cfacode}
    154 
     49monitor counter_t { /*...see section $\ref{data}$...*/ };
     50
     51void ?{}(counter_t & nomutex this); //constructor
     52size_t ++?(counter_t & mutex this); //increment
     53
     54//need for mutex is platform dependent
     55void ?{}(size_t * this, counter_t & mutex cnt); //conversion
     56\end{cfacode}
    15557This counter is used as follows:
    15658\begin{center}
     
    17173Notice how the counter is used without any explicit synchronisation and yet supports thread-safe semantics for both reading and writing.
    17274
    173 % ======================================================================
    174 % ======================================================================
    175 \subsection{Implementation Details: Interaction with polymorphism}
    176 % ======================================================================
    177 % ======================================================================
    178 Depending on the choice of semantics for when monitor locks are acquired, interaction between monitors and \CFA's concept of polymorphism can be complex to support. However, it is shown that entry-point locking solves most of the issues.
    179 
    180 First of all, interaction between \code{otype} polymorphism and monitors is impossible since monitors do not support copying. Therefore, the main question is how to support \code{dtype} polymorphism. Since a monitor's main purpose is to ensure mutual exclusion when accessing shared data, this implies that mutual exclusion is only required for routines that do in fact access shared data. However, since \code{dtype} polymorphism always handles incomplete types (by definition), no \code{dtype} polymorphic routine can access shared data since the data requires knowledge about the type. Therefore, the only concern when combining \code{dtype} polymorphism and monitors is to protect access to routines.
    181 
    182 Before looking into complex control-flow, it is important to present the difference between the two acquiring options : callsite and entry-point locking, i.e. acquiring the monitors before making a mutex routine call or as the first operation of the mutex routine-call. For example:
     75Here, the constructor (\code{?\{\}}) uses the \code{nomutex} keyword to signify that it does not acquire the monitor mutual-exclusion when constructing. This semantics is because an object not yet constructed should never be shared and therefore does not require mutual exclusion. The prefix increment operator uses \code{mutex} to protect the incrementing process from race conditions. Finally, there is a conversion operator from \code{counter_t} to \code{size_t}. This conversion may or may not require the \code{mutex} keyword depending on whether or not reading a \code{size_t} is an atomic operation.
     76
     77For maximum usability, monitors use \gls{multi-acq} semantics, which means a single thread can acquire the same monitor multiple times without deadlock. For example, figure \ref{fig:search} uses recursion and \gls{multi-acq} to print values inside a binary tree.
     78\begin{figure}
     79\label{fig:search}
     80\begin{cfacode}
     81monitor printer { ... };
     82struct tree {
     83        tree * left, right;
     84        char * value;
     85};
     86void print(printer & mutex p, char * v);
     87
     88void print(printer & mutex p, tree * t) {
     89        print(p, t->value);
     90        print(p, t->left );
     91        print(p, t->right);
     92}
     93\end{cfacode}
     94\caption{Recursive printing algorithm using \gls{multi-acq}.}
     95\end{figure}
     96
     97Having both \code{mutex} and \code{nomutex} keywords is redundant based on the meaning of a routine having neither of these keywords. For example, given a routine without qualifiers \code{void foo(counter_t & this)}, then it is reasonable that it should default to the safest option \code{mutex}, whereas assuming \code{nomutex} is unsafe and may cause subtle errors. In fact, \code{nomutex} is the "normal" parameter behaviour, with the \code{nomutex} keyword effectively stating explicitly that "this routine is not special". Another alternative is making exactly one of these keywords mandatory, which would provide the same semantics but without the ambiguity of supporting routines with neither keyword. Mandatory keywords would also have the added benefit of being self-documented but at the cost of extra typing. While there are several benefits to mandatory keywords, they do bring a few challenges. Mandatory keywords in \CFA would imply that the compiler must know without doubt whether or not a parameter is a monitor. Since \CFA relies heavily on traits as an abstraction mechanism, the distinction between a type that is a monitor and a type that looks like a monitor can become blurred. For this reason, \CFA only has the \code{mutex} keyword and uses no keyword to mean \code{nomutex}.
     98
     99The next semantic decision is to establish when \code{mutex} may be used as a type qualifier. Consider the following declarations:
     100\begin{cfacode}
     101int f1(monitor & mutex m);
     102int f2(const monitor & mutex m);
     103int f3(monitor ** mutex m);
     104int f4(monitor * mutex m []);
     105int f5(graph(monitor*) & mutex m);
     106\end{cfacode}
     107The problem is to identify which object(s) should be acquired. Furthermore, each object needs to be acquired only once. In the case of simple routines like \code{f1} and \code{f2} it is easy to identify an exhaustive list of objects to acquire on entry. Adding indirections (\code{f3}) still allows the compiler and programmer to identify which object is acquired. However, adding in arrays (\code{f4}) makes it much harder. Array lengths are not necessarily known in C, and even then making sure objects are only acquired once becomes non-trivial. This problem can be extended to absurd limits like \code{f5}, which uses a graph of monitors. To make the issue tractable, this project imposes the requirement that a routine may only acquire one monitor per parameter and it must be the type of the parameter with at most one level of indirection (ignoring potential qualifiers). Also note that while routine \code{f3} can be supported, meaning that monitor \code{**m} is acquired, passing an array to this routine would be type safe and yet result in undefined behavior because only the first element of the array is acquired. However, this ambiguity is part of the C type-system with respect to arrays. For this reason, \code{mutex} is disallowed in the context where arrays may be passed:
     108\begin{cfacode}
     109int f1(monitor & mutex m);   //Okay : recommended case
     110int f2(monitor * mutex m);   //Okay : could be an array but probably not
     111int f3(monitor mutex m []);  //Not Okay : Array of unknown length
     112int f4(monitor ** mutex m);  //Not Okay : Could be an array
     113int f5(monitor * mutex m []); //Not Okay : Array of unknown length
     114\end{cfacode}
     115Note that not all array functions are actually distinct in the type-system sense. However, even if the code generation could tell the difference, the extra information is still not sufficient to meaningfully extend the monitor call semantics.
     116
     117Unlike object-oriented monitors, where calling a mutex member \emph{implicitly} acquires mutual-exclusion, \CFA uses an explicit mechanism to acquire mutual-exclusion. A consequence of this approach is that it extends naturally to multi-monitor calls.
     118\begin{cfacode}
     119int f(MonitorA & mutex a, MonitorB & mutex b);
     120
     121MonitorA a;
     122MonitorB b;
     123f(a,b);
     124\end{cfacode}
     125The capacity to acquire multiple locks before entering a critical section is called \emph{\gls{bulk-acq}}. In practice, writing multi-locking routines that do not lead to deadlocks is tricky. Having language support for such a feature is therefore a significant asset for \CFA. In the case presented above, \CFA guarantees that the order of acquisition is consistent across calls to routines using the same monitors as arguments. However, since \CFA monitors use \gls{multi-acq} locks, users can effectively force the acquiring order. For example, notice which routines use \code{mutex}/\code{nomutex} and how this affects acquiring order:
     126\begin{cfacode}
     127void foo(A & mutex a, B & mutex b) { //acquire a & b
     128        ...
     129}
     130
     131void bar(A & mutex a, B & /*nomutex*/ b) { //acquire a
     132        ... foo(a, b); ... //acquire b
     133}
     134
     135void baz(A & /*nomutex*/ a, B & mutex b) { //acquire b
     136        ... foo(a, b); ... //acquire a
     137}
     138\end{cfacode}
     139The \gls{multi-acq} monitor lock allows a monitor lock to be acquired by both \code{bar} or \code{baz} and acquired again in \code{foo}. In the calls to \code{bar} and \code{baz} the monitors are acquired in opposite order.
     140
     141However, such use leads to the lock acquiring order problem. In the example above, the user uses implicit ordering in the case of function \code{foo} but explicit ordering in the case of \code{bar} and \code{baz}. This subtle mistake means that calling these routines concurrently may lead to deadlock and is therefore undefined behavior. As shown on several occasions\cit, solving this problem requires:
     142\begin{enumerate}
     143        \item Dynamically tracking the monitor-call order.
     144        \item Implementing rollback semantics.
     145\end{enumerate}
     146While the first requirement is already a significant constraint on the system, implementing a general rollback semantics in a C-like language is prohibitively complex \cit. In \CFA, users simply need to be careful when acquiring multiple monitors at the same time or only use \gls{bulk-acq} of all the monitors.
     147
     148\Gls{multi-acq} and \gls{bulk-acq} can be used together in interesting ways, for example:
     149\begin{cfacode}
     150monitor bank { ... };
     151
     152void deposit( bank & mutex b, int deposit );
     153
     154void transfer( bank & mutex mybank, bank & mutex yourbank, int me2you) {
     155        deposit( mybank, -me2you );
     156        deposit( yourbank, me2you );
     157}
     158\end{cfacode}
     159This example shows a trivial solution to the bank account transfer problem\cit. Without \gls{multi-acq} and \gls{bulk-acq}, the solution to this problem is much more involved and requires careful engineering.
     160
     161\subsubsection{\code{mutex} statement} \label{mutex-stmt}
     162
     163The call semantics discussed above have one software engineering issue: only a named routine can acquire the mutual-exclusion of a set of monitors. \CFA offers the \code{mutex} statement to work around the need for unnecessary names, avoiding a major software engineering problem\cit. Listing \ref{lst:mutex-stmt} shows an example of the \code{mutex} statement, which introduces a new scope in which the mutual-exclusion of a set of monitors is acquired. Beyond naming, the \code{mutex} statement has no semantic difference from a routine call with \code{mutex} parameters.
     164
     165\begin{figure}
    183166\begin{center}
    184 \setlength\tabcolsep{1.5pt}
    185 \begin{tabular}{|c|c|c|}
    186 Code & \gls{callsite-locking} & \gls{entry-point-locking} \\
    187 \CFA & pseudo-code & pseudo-code \\
     167\begin{tabular}{|c|c|}
     168function call & \code{mutex} statement \\
    188169\hline
    189170\begin{cfacode}[tabsize=3]
    190 void foo(monitor& mutex a){
    191 
    192 
    193 
    194         //Do Work
    195         //...
    196 
    197 }
    198 
    199 void main() {
    200         monitor a;
    201 
    202 
    203 
    204         foo(a);
    205 
    206 }
    207 \end{cfacode} & \begin{pseudo}[tabsize=3]
    208 foo(& a) {
    209 
    210 
    211 
    212         //Do Work
    213         //...
    214 
    215 }
    216 
    217 main() {
    218         monitor a;
    219         //calling routine
    220         //handles concurrency
    221         acquire(a);
    222         foo(a);
    223         release(a);
    224 }
    225 \end{pseudo} & \begin{pseudo}[tabsize=3]
    226 foo(& a) {
    227         //called routine
    228         //handles concurrency
    229         acquire(a);
    230         //Do Work
    231         //...
    232         release(a);
    233 }
    234 
    235 main() {
    236         monitor a;
    237 
    238 
    239 
    240         foo(a);
    241 
    242 }
    243 \end{pseudo}
     171monitor M {};
     172void foo( M & mutex m ) {
     173        //critical section
     174}
     175
     176void bar( M & m ) {
     177        foo( m );
     178}
     179\end{cfacode}&\begin{cfacode}[tabsize=3]
     180monitor M {};
     181void bar( M & m ) {
     182        mutex(m) {
     183                //critical section
     184        }
     185}
     186
     187
     188\end{cfacode}
    244189\end{tabular}
    245190\end{center}
    246 
    247 \Gls{callsite-locking} is inefficient, since any \code{dtype} routine may have to obtain some lock before calling a routine, depending on whether or not the type passed is a monitor. However, with \gls{entry-point-locking} calling a monitor routine becomes exactly the same as calling it from anywhere else.
    248 
    249 Note the \code{mutex} keyword relies on the resolver, which means that in cases where a generic monitor routine is actually desired, writing a mutex routine is possible with the proper trait. This is possible because monitors are designed in terms a trait. For example:
    250 \begin{cfacode}
    251 //Incorrect
    252 //T is not a monitor
    253 forall(dtype T)
    254 void foo(T * mutex t);
    255 
    256 //Correct
    257 //this function only works on monitors
    258 //(any monitor)
    259 forall(dtype T | is_monitor(T))
    260 void bar(T * mutex t));
     191\caption{Regular call semantics vs. \code{mutex} statement}
     192\label{lst:mutex-stmt}
     193\end{figure}
     194
     195% ======================================================================
     196% ======================================================================
     197\subsection{Data semantics} \label{data}
     198% ======================================================================
     199% ======================================================================
     200Once the call semantics are established, the next step is to establish data semantics. Indeed, until now a monitor is used simply as a generic handle but in most cases monitors contain shared data. This data should be intrinsic to the monitor declaration to prevent any accidental use of data without its appropriate protection. For example, here is a complete version of the counter shown in section \ref{call}:
     201\begin{cfacode}
     202monitor counter_t {
     203        int value;
     204};
     205
     206void ?{}(counter_t & this) {
     207        this.cnt = 0;
     208}
     209
     210int ?++(counter_t & mutex this) {
     211        return ++this.value;
     212}
     213
     214//need for mutex is platform dependent here
     215void ?{}(int * this, counter_t & mutex cnt) {
     216        *this = (int)cnt;
     217}
    261218\end{cfacode}
    262219
     
    267224% ======================================================================
    268225% ======================================================================
    269 In addition to mutual exclusion, the monitors at the core of \CFA's concurrency can also be used to achieve synchronisation. With monitors, this is generally achieved with internal or external scheduling as in\cit. Since internal scheduling of single monitors is mostly a solved problem, this proposal concentraits on extending internal scheduling to multiple monitors at once. Indeed, like the \gls{group-acquire} semantics, internal scheduling extends to multiple monitors at once in a way that is natural to the user but requires additional complexity on the implementation side.
     226In addition to mutual exclusion, the monitors at the core of \CFA's concurrency can also be used to achieve synchronisation. With monitors, this capability is generally achieved with internal or external scheduling as in\cit. Since internal scheduling within a single monitor is mostly a solved problem, this thesis concentrates on extending internal scheduling to multiple monitors. Indeed, like the \gls{bulk-acq} semantics, internal scheduling extends to multiple monitors in a way that is natural to the user but requires additional complexity on the implementation side.
    270227
    271228First, here is a simple example of such a technique:
    272229
    273230\begin{cfacode}
    274         monitor A {
    275                 condition e;
    276         }
    277 
    278         void foo(A & mutex a) {
    279                 ...
    280                 // Wait for cooperation from bar()
    281                 wait(a.e);
    282                 ...
    283         }
    284 
    285         void bar(A & mutex a) {
    286                 // Provide cooperation for foo()
    287                 ...
    288                 // Unblock foo at scope exit
    289                 signal(a.e);
    290         }
    291 \end{cfacode}
    292 
    293 There are two details to note here. First, there \code{signal} is a delayed operation, it only unblocks the waiting thread when it reaches the end of the critical section. This is needed to respect mutual-exclusion. Second, in \CFA, \code{condition} have no particular need to be stored inside a monitor, beyond any software engineering reasons. Here routine \code{foo} waits for the \code{signal} from \code{bar} before making further progress, effectively ensuring a basic ordering.
    294 
    295 An important aspect to take into account here is that \CFA does not allow barging, which means that once function \code{bar} releases the monitor, foo is guaranteed to resume immediately after (unless some other thread waited on the same condition). This guarantees offers the benefit of not having to loop arount waits in order to guarantee that a condition is still met. The main reason \CFA offers this guarantee is that users can easily introduce barging if it becomes a necessity but adding barging prevention or barging avoidance is more involved without language support. Supporting barging prevention as well as extending internal scheduling to multiple monitors is the main source of complexity in the design of \CFA concurrency.
     231monitor A {
     232        condition e;
     233}
     234
     235void foo(A & mutex a) {
     236        ...
     237        //Wait for cooperation from bar()
     238        wait(a.e);
     239        ...
     240}
     241
     242void bar(A & mutex a) {
     243        //Provide cooperation for foo()
     244        ...
     245        //Unblock foo
     246        signal(a.e);
     247}
     248\end{cfacode}
     249
     250There are two details to note here. First, the \code{signal} is a delayed operation, it only unblocks the waiting thread when it reaches the end of the critical section. This semantic is needed to respect mutual-exclusion. Second, in \CFA, a \code{condition} variable can be stored/created independently of a monitor. Here routine \code{foo} waits for the \code{signal} from \code{bar} before making further progress, effectively ensuring a basic ordering.
     251
     252An important aspect of the implementation is that \CFA does not allow barging, which means that once function \code{bar} releases the monitor, foo is guaranteed to resume immediately after (unless some other thread waited on the same condition). This guarantee offers the benefit of not having to loop around waits in order to recheck that a condition is still met. The main reason \CFA offers this guarantee is that users can easily introduce barging if it becomes a necessity, but adding barging prevention or barging avoidance is more involved without language support. Supporting barging prevention as well as extending internal scheduling to multiple monitors is the main source of complexity in the design of \CFA concurrency.
    296253
    297254% ======================================================================
     
    300257% ======================================================================
    301258% ======================================================================
    302 It is easier to understand the problem of multi-monitor scheduling using a series of pseudo-code. Note that for simplicity in the following snippets of pseudo-code, waiting and signalling is done using an implicit condition variable, like Java built-in monitors.
     259It is easier to understand the problem of multi-monitor scheduling using a series of pseudo-code examples. Note that, for simplicity, in the following snippets of pseudo-code, waiting and signalling are done using an implicit condition variable, like Java built-in monitors. Indeed, \code{wait} statements always use a single condition as parameter and wait on the monitors associated with the condition.
    303260
    304261\begin{multicols}{2}
     
    319276\end{pseudo}
    320277\end{multicols}
    321 The example shows the simple case of having two threads (one for each column) and a single monitor A. One thread acquires before waiting (atomically blocking and releasing A) and the other acquires before signalling. There is an important thing to note here, both \code{wait} and \code{signal} must be called with the proper monitor(s) already acquired. This restriction is hidden on the user side in \uC, as it is a logical requirement for barging prevention.
    322 
    323 A direct extension of the previous example is the \gls{group-acquire} version:
     278The example shows the simple case of having two threads (one for each column) and a single monitor A. One thread acquires before waiting (atomically blocking and releasing A) and the other acquires before signalling. It is important to note here that both \code{wait} and \code{signal} must be called with the proper monitor(s) already acquired. This semantic is a logical requirement for barging prevention.
     279
     280A direct extension of the previous example is a \gls{bulk-acq} version:
    324281
    325282\begin{multicols}{2}
     
    338295\end{pseudo}
    339296\end{multicols}
    340 This version uses \gls{group-acquire} (denoted using the \& symbol), but the presence of multiple monitors does not add a particularly new meaning. Synchronization happens between the two threads in exactly the same way and order. The only difference is that mutual exclusion covers more monitors. On the implementation side, handling multiple monitors does add a degree of complexity as the next few examples demonstrate.
    341 
    342 While deadlock issues can occur when nesting monitors, these issues are only a symptom of the fact that locks, and by extension monitors, are not perfectly composable. However, for monitors as for locks, it is possible to write a program using nesting without encountering any problems if nested is done correctly. For example, the next pseudo-code snippet acquires monitors A then B before waiting while only acquiring B when signalling, effectively avoiding the nested monitor problem.
    343 
     297This version uses \gls{bulk-acq} (denoted using the \& symbol), but the presence of multiple monitors does not add a particularly new meaning. Synchronization happens between the two threads in exactly the same way and order. The only difference is that mutual exclusion covers more monitors. On the implementation side, handling multiple monitors does add a degree of complexity as the next few examples demonstrate.
     298
     299While deadlock issues can occur when nesting monitors, these issues are only a symptom of the fact that locks, and by extension monitors, are not perfectly composable. For monitors, a well-known deadlock problem is the Nested Monitor Problem\cit, which occurs when a \code{wait} is made by a thread that holds more than one monitor. For example, the following pseudo-code runs into the nested monitor problem:
    344300\begin{multicols}{2}
    345301\begin{pseudo}
     
    354310
    355311\begin{pseudo}
     312acquire A
     313        acquire B
     314                signal B
     315        release B
     316release A
     317\end{pseudo}
     318\end{multicols}
     319However, for monitors as for locks, it is possible to write a program using nesting without encountering any problems if nesting is done correctly. For example, the next pseudo-code snippet acquires monitors {\sf A} then {\sf B} before waiting, while only acquiring {\sf B} when signalling, effectively avoiding the nested monitor problem.
     320
     321\begin{multicols}{2}
     322\begin{pseudo}
     323acquire A
     324        acquire B
     325                wait B
     326        release B
     327release A
     328\end{pseudo}
     329
     330\columnbreak
     331
     332\begin{pseudo}
    356333
    357334acquire B
     
    362339\end{multicols}
    363340
    364 The next example is where \gls{group-acquire} adds a significant layer of complexity to the internal signalling semantics.
    365 
     341Listing \ref{lst:int-bulk-pseudo} shows an example where \gls{bulk-acq} adds a significant layer of complexity to the internal signalling semantics. Listing \ref{lst:int-bulk-cfa} shows the corresponding \CFA code which implements the pseudo-code in listing \ref{lst:int-bulk-pseudo}. Note that listing \ref{lst:int-bulk-cfa} uses a non-\code{mutex} parameter to introduce monitor \code{b} into context. However, for the purpose of translating the given pseudo-code into \CFA code, any method of introducing new monitors into context, other than a \code{mutex} parameter, is acceptable, e.g. global variables, pointer parameters or using locals with the \code{mutex}-statement.
     342
     343\begin{figure}[!b]
    366344\begin{multicols}{2}
    367345Waiting thread
    368346\begin{pseudo}[numbers=left]
    369347acquire A
    370         // Code Section 1
     348        //Code Section 1
    371349        acquire A & B
    372                 // Code Section 2
     350                //Code Section 2
    373351                wait A & B
    374                 // Code Section 3
     352                //Code Section 3
    375353        release A & B
    376         // Code Section 4
     354        //Code Section 4
    377355release A
    378356\end{pseudo}
     
    383361\begin{pseudo}[numbers=left, firstnumber=10]
    384362acquire A
    385         // Code Section 5
     363        //Code Section 5
    386364        acquire A & B
    387                 // Code Section 6
     365                //Code Section 6
    388366                signal A & B
    389                 // Code Section 7
     367                //Code Section 7
    390368        release A & B
    391         // Code Section 8
    392 release A
    393 \end{pseudo}
    394 \end{multicols}
    395 \begin{center}
    396 Listing 1
    397 \end{center}
    398 
    399 It is particularly important to pay attention to code sections 8 and 4, which are where the existing semantics of internal scheduling need to be extended for multiple monitors. The root of the problem is that \gls{group-acquire} is used in a context where one of the monitors is already acquired and is why it is important to define the behaviour of the previous pseudo-code. When the signaller thread reaches the location where it should "release A \& B" (line 16), it must actually transfer ownership of monitor B to the waiting thread. This ownership trasnfer is required in order to prevent barging. Since the signalling thread still needs the monitor A, simply waking up the waiting thread is not an option because it would violate mutual exclusion. There are three options:
     369        //Code Section 8
     370release A
     371\end{pseudo}
     372\end{multicols}
     373\caption{Internal scheduling with \gls{bulk-acq}}
     374\label{lst:int-bulk-pseudo}
     375\end{figure}
     376
     377\begin{figure}[!b]
     378\begin{multicols}{2}
     379Waiting thread
     380\begin{cfacode}
     381monitor A;
     382monitor B;
     383extern condition c;
     384void foo(A & mutex a, B & b) {
     385        //Code Section 1
     386        mutex(a, b) {
     387                //Code Section 2
     388                wait(c);
     389                //Code Section 3
     390        }
     391        //Code Section 4
     392}
     393\end{cfacode}
     394
     395\columnbreak
     396
     397Signalling thread
     398\begin{cfacode}
     399monitor A;
     400monitor B;
     401extern condition c;
     402void foo(A & mutex a, B & b) {
     403        //Code Section 5
     404        mutex(a, b) {
     405                //Code Section 6
     406                signal(c);
     407                //Code Section 7
     408        }
     409        //Code Section 8
     410}
     411\end{cfacode}
     412\end{multicols}
     413\caption{Equivalent \CFA code for listing \ref{lst:int-bulk-pseudo}}
     414\label{lst:int-bulk-cfa}
     415\end{figure}
     416
     417It is particularly important to pay attention to code sections 4 and 8, which are where the existing semantics of internal scheduling need to be extended for multiple monitors. The root of the problem is that \gls{bulk-acq} is used in a context where one of the monitors is already acquired, which is why it is important to define the behaviour of the previous pseudo-code. When the signaller thread reaches the location where it should "release A \& B" (line 16), it must actually transfer ownership of monitor B to the waiting thread. This ownership transfer is required in order to prevent barging. Since the signalling thread still needs monitor A, simply waking up the waiting thread is not an option because it would violate mutual exclusion. There are three options.
    400418
    401419\subsubsection{Delaying signals}
    402 The first more obvious solution to solve the problem of multi-monitor scheduling is to keep ownership of all locks until the last lock is ready to be transferred. It can be argued that that moment is the correct time to transfer ownership when the last lock is no longer needed because this semantics fits most closely to the behaviour of single monitor scheduling. This solution has the main benefit of transferring ownership of groups of monitors, which simplifies the semantics from mutiple objects to a single group of object, effectively making the existing single monitor semantic viable by simply changing monitors to monitor collections.
     420The first and most obvious solution to the problem of multi-monitor scheduling is to keep ownership of all locks until the last lock is ready to be transferred. It can be argued that that moment, when the last lock is no longer needed, is the correct time to transfer ownership because this semantics fits most closely to the behaviour of single-monitor scheduling. This solution has the main benefit of transferring ownership of groups of monitors, which simplifies the semantics from multiple objects to a single group of objects, effectively making the existing single-monitor semantic viable by simply changing monitors to monitor groups.
    403421\begin{multicols}{2}
    404422Waiter
     
    424442\end{pseudo}
    425443\end{multicols}
    426 However, this solution can become much more complicated depending on what is executed while secretly holding B (at line 10). Indeed, nothing prevents a user from signalling monitor A on a different condition variable:
    427 \newpage
     444However, this solution can become much more complicated depending on what is executed while secretly holding B (at line 10). Indeed, nothing prevents signalling monitor A on a different condition variable:
    428445\begin{multicols}{2}
    429446Thread 1
     
    446463
    447464Thread 3
    448 \begin{pseudo}[numbers=left, firstnumber=10]
     465\begin{pseudo}[numbers=left, firstnumber=9]
    449466acquire A
    450467        acquire A & B
     
    467484Note that ordering is not determined by a race condition but by whether signalled threads are enqueued in FIFO or FILO order. However, regardless of the answer, users can move line 15 before line 11 and get the reverse effect.
    468485
    469 In both cases, the threads need to be able to distinguish on a per monitor basis which ones need to be released and which ones need to be transferred. Which means monitors cannot be handled as a single homogenous group.
     486In both cases, the threads need to be able to distinguish, on a per-monitor basis, which ones need to be released and which ones need to be transferred, which means monitors cannot be handled as a single homogeneous group and therefore invalidates the main benefit of this approach.
    470487
    471488\subsubsection{Dependency graphs}
    472 In the Listing 1 pseudo-code, there is a solution which statisfies both barging prevention and mutual exclusion. If ownership of both monitors is transferred to the waiter when the signaller releases A and then the waiter transfers back ownership of A when it releases it then the problem is solved. Dynamically finding the correct order is therefore the second possible solution. The problem it encounters is that it effectively boils down to resolving a dependency graph of ownership requirements. Here even the simplest of code snippets requires two transfers and it seems to increase in a manner closer to polynomial. For example, the following code, which is just a direct extension to three monitors, requires at least three ownership transfer and has multiple solutions:
     489In the pseudo-code of listing \ref{lst:int-bulk-pseudo}, there is a solution which satisfies both barging prevention and mutual exclusion. If ownership of both monitors is transferred to the waiter when the signaller releases A, and the waiter then transfers back ownership of A when it releases it, then the problem is solved. Dynamically finding the correct order is therefore the second possible solution. The problem it encounters is that it effectively boils down to resolving a dependency graph of ownership requirements. Here even the simplest of code snippets requires two transfers and it seems to increase in a manner closer to polynomial. For example, the following code, which is just a direct extension to three monitors, requires at least three ownership transfers and has multiple solutions:
    473490
    474491\begin{multicols}{2}
     
    495512\end{pseudo}
    496513\end{multicols}
    497 Resolving dependency graph being a complex and expensive endeavour, this solution is not the preffered one.
    498 
    499 \subsubsection{Partial signalling} \label{partial-sig}
    500 Finally, the solution that is chosen for \CFA is to use partial signalling. Consider the following case:
    501 
    502 \begin{multicols}{2}
    503 \begin{pseudo}[numbers=left]
     514
     515\begin{figure}
     516\begin{multicols}{3}
     517Thread $\alpha$
     518\begin{pseudo}[numbers=left, firstnumber=1]
    504519acquire A
    505520        acquire A & B
     
    511526\columnbreak
    512527
    513 \begin{pseudo}[numbers=left, firstnumber=6]
     528Thread $\gamma$
     529\begin{pseudo}[numbers=left, firstnumber=1]
    514530acquire A
    515531        acquire A & B
    516532                signal A & B
    517533        release A & B
    518         // ... More code
    519 release A
    520 \end{pseudo}
    521 \end{multicols}
    522 The partial signalling solution transfers ownership of monitor B at lines 10 but does not wake the waiting thread since it is still using monitor A. Only when it reaches line 11 does it actually wakeup the waiting thread. This solution has the benefit that complexity is encapsulated into only two actions, passing monitors to the next owner when they should be release and conditionnaly waking threads if all conditions are met. Contrary to the other solutions, this solution quickly hits an upper bound on complexity of implementation.
     534        signal A
     535release A
     536\end{pseudo}
     537
     538\columnbreak
     539
     540Thread $\beta$
     541\begin{pseudo}[numbers=left, firstnumber=1]
     542acquire A
     543        wait A
     544release A
     545\end{pseudo}
     546
     547\end{multicols}
     548\caption{Dependency graph}
     549\label{lst:dependency}
     550\end{figure}
     551
     552\begin{figure}
     553\begin{center}
     554\input{dependency}
     555\end{center}
     556\label{fig:dependency}
     557\caption{Dependency graph of the statements in listing \ref{lst:dependency}}
     558\end{figure}
     559
     560Listing \ref{lst:dependency} is the three-thread example rewritten for dependency graphs. Figure \ref{fig:dependency} shows the corresponding dependency graph that results, where every node is a statement of one of the three threads, and the arrows show the dependencies of that statement. The extra challenge is that this dependency graph is effectively post-mortem, but the run-time system needs to be able to build and solve these graphs as the dependencies unfold. Resolving dependency graphs being a complex and expensive endeavour, this solution is not the preferred one.
     561
     562\subsubsection{Partial signalling} \label{partial-sig}
     563Finally, the solution that is chosen for \CFA is to use partial signalling. Consider the following case:
     564
     565\begin{multicols}{2}
     566\begin{pseudo}[numbers=left]
     567acquire A
     568        acquire A & B
     569                wait A & B
     570        release A & B
     571release A
     572\end{pseudo}
     573
     574\columnbreak
     575
     576\begin{pseudo}[numbers=left, firstnumber=6]
     577acquire A
     578        acquire A & B
     579                signal A & B
     580        release A & B
     581        //... More code
     582release A
     583\end{pseudo}
     584\end{multicols}
     585The partial signalling solution transfers ownership of monitor B at line 10 but does not wake the waiting thread since it is still using monitor A. Only when it reaches line 11 does it actually wake up the waiting thread. This solution has the benefit that complexity is encapsulated into only two actions: passing monitors to the next owner when they should be released, and conditionally waking threads if all conditions are met. This solution has a much simpler implementation than a dependency-graph solving algorithm, which is why it was chosen.
    523586
    524587% ======================================================================
     
    529592An important note is that, until now, signalling a monitor was a delayed operation. The ownership of the monitor is transferred only when the monitor would have otherwise been released, not at the point of the \code{signal} statement. However, in some cases, it may be more convenient for users to immediately transfer ownership to the thread that is waiting for cooperation, which is achieved using the \code{signal_block} routine\footnote{name to be discussed}.
    530593
    531 For example here is an example highlighting the difference in behaviour:
    532 \begin{center}
     594The example in listing \ref{lst:datingservice} highlights the difference in behaviour. As mentioned, \code{signal} only transfers ownership once the current critical section exits; this behaviour causes the need for additional synchronisation when a two-way handshake is needed. To avoid this extraneous synchronisation, the \code{condition} type offers the \code{signal_block} routine, which handles two-way handshakes as shown in the example. This removes the need for a second condition variable and simplifies programming. Like every other monitor semantic, \code{signal_block} uses barging prevention, which means mutual exclusion is baton-passed both on the front end and the back end of the call to \code{signal_block}, meaning no other thread can acquire the monitor either before or after the call.
     595\begin{figure}
    533596\begin{tabular}{|c|c|}
    534597\code{signal} & \code{signal_block} \\
    535598\hline
    536 \begin{cfacode}
    537 monitor M { int val; };
    538 
    539 void foo(M & mutex m ) {
    540         m.val++;
    541         sout| "Foo:" | m.val |endl;
    542 
    543         wait( c );
    544 
    545         m.val++;
    546         sout| "Foo:" | m.val |endl;
    547 }
    548 
    549 void bar(M & mutex m ) {
    550         m.val++;
    551         sout| "Bar:" | m.val |endl;
    552 
    553         signal( c );
    554 
    555         m.val++;
    556         sout| "Bar:" | m.val |endl;
    557 }
    558 \end{cfacode}&\begin{cfacode}
    559 monitor M { int val; };
    560 
    561 void foo(M & mutex m ) {
    562         m.val++;
    563         sout| "Foo:" | m.val |endl;
    564 
    565         wait( c );
    566 
    567         m.val++;
    568         sout| "Foo:" | m.val |endl;
    569 }
    570 
    571 void bar(M & mutex m ) {
    572         m.val++;
    573         sout| "Bar:" | m.val |endl;
    574 
    575         signal_block( c );
    576 
    577         m.val++;
    578         sout| "Bar:" | m.val |endl;
     599\begin{cfacode}[tabsize=3]
     600monitor DatingService
     601{
     602        //compatibility codes
     603        enum{ CCodes = 20 };
     604
     605        int girlPhoneNo
     606        int boyPhoneNo;
     607};
     608
     609condition girls[CCodes];
     610condition boys [CCodes];
     611condition exchange;
     612
     613int girl(int phoneNo, int ccode)
     614{
     615        //no compatible boy ?
     616        if(empty(boys[ccode]))
     617        {
     618                //wait for boy
     619                wait(girls[ccode]);
     620
     621                //make phone number available
     622                girlPhoneNo = phoneNo;
     623
     624                //wake boy from chair
     625                signal(exchange);
     626        }
     627        else
     628        {
     629                //make phone number available
     630                girlPhoneNo = phoneNo;
     631
     632                //wake boy
     633                signal(boys[ccode]);
     634
     635                //sit in chair
     636                wait(exchange);
     637        }
     638        return boyPhoneNo;
     639}
     640
     641int boy(int phoneNo, int ccode)
     642{
     643        //same as above
     644        //with boy/girl interchanged
     645}
     646\end{cfacode}&\begin{cfacode}[tabsize=3]
     647monitor DatingService
     648{
     649        //compatibility codes
     650        enum{ CCodes = 20 };
     651
     652        int girlPhoneNo;
     653        int boyPhoneNo;
     654};
     655
     656condition girls[CCodes];
     657condition boys [CCodes];
     658//exchange is not needed
     659
     660int girl(int phoneNo, int ccode)
     661{
     662        //no compatible boy ?
     663        if(empty(boys[ccode]))
     664        {
     665                //wait for boy
     666                wait(girls[ccode]);
     667
     668                //make phone number available
     669                girlPhoneNo = phoneNo;
     670
     671                //wake boy from chair
     672                signal(exchange);
     673        }
     674        else
     675        {
     676                //make phone number available
     677                girlPhoneNo = phoneNo;
     678
     679                //wake boy
     680                signal_block(boys[ccode]);
     681
     682                //second handshake unnecessary
     683
     684        }
     685        return boyPhoneNo;
     686}
     687
     688int boy(int phoneNo, int ccode)
     689{
     690        //same as above
     691        //with boy/girl interchanged
    579692}
    580693\end{cfacode}
    581694\end{tabular}
    582 \end{center}
    583 Assuming that \code{val} is initialized at 0, that each routine are called from seperate thread and that \code{foo} is always called first. The previous code would yield the following output:
    584 
    585 \begin{center}
    586 \begin{tabular}{|c|c|}
    587 \code{signal} & \code{signal_block} \\
    588 \hline
    589 \begin{pseudo}
    590 Foo: 0
    591 Bar: 1
    592 Bar: 2
    593 Foo: 3
    594 \end{pseudo}&\begin{pseudo}
    595 Foo: 0
    596 Bar: 1
    597 Foo: 2
    598 Bar: 3
    599 \end{pseudo}
    600 \end{tabular}
    601 \end{center}
    602 
    603 As mentionned, \code{signal} only transfers ownership once the current critical section exits, resulting in the second "Bar" line to be printed before the second "Foo" line. On the other hand, \code{signal_block} immediately transfers ownership to \code{bar}, causing an inversion of output. Obviously this means that \code{signal_block} is a blocking call, which will only be resumed once the signalled function exits the critical section.
    604 
    605 % ======================================================================
    606 % ======================================================================
    607 \subsection{Internal scheduling: Implementation} \label{inschedimpl}
    608 % ======================================================================
    609 % ======================================================================
    610 There are several challenges specific to \CFA when implementing internal scheduling. These challenges are direct results of \gls{group-acquire} and loose object definitions. These two constraints are to root cause of most design decisions in the implementation of internal scheduling. Furthermore, to avoid the head-aches of dynamically allocating memory in a concurrent environment, the internal-scheduling design is entirely free of mallocs and other dynamic memory allocation scheme. This is to avoid the chicken and egg problem of having a memory allocator that relies on the threading system and a threading system that relies on the runtime. This extra goal, means that memory management is a constant concern in the design of the system.
    611 
    612 The main memory concern for concurrency is queues. All blocking operations are made by parking threads onto queues. These queues need to be intrinsic\cit to avoid the need memory allocation. This entails that all the fields needed to keep track of all needed information. Since internal scheduling can use an unbound amount of memory (depending on \gls{group-acquire}) statically defining information information in the intrusive fields of threads is insufficient. The only variable sized container that does not require memory allocation is the callstack, which is heavily used in the implementation of internal scheduling. Particularly the GCC extension variable length arrays which is used extensively.
    613 
    614 Since stack allocation is based around scope, the first step of the implementation is to identify the scopes that are available to store the information, and which of these can have a variable length. In the case of external scheduling, the threads and the condition both allow a fixed amount of memory to be stored, while mutex-routines and the actual blocking call allow for an unbound amount (though adding too much to the mutex routine stack size can become expansive faster).
    615 
    616 The following figure is the traditionnal illustration of a monitor :
    617 
    618 \begin{center}
    619 {\resizebox{0.4\textwidth}{!}{\input{monitor}}}
    620 \end{center}
    621 
    622 For \CFA, the previous picture does not have support for blocking multiple monitors on a single condition. To support \gls{group-acquire} two changes to this picture are required. First, it doesn't make sense to tie the condition to a single monitor since blocking two monitors as one would require arbitrarily picking a monitor to hold the condition. Secondly, the object waiting on the conditions and AS-stack cannot simply contain the waiting thread since a single thread can potentially wait on multiple monitors. As mentionned in section \ref{inschedimpl}, the handling in multiple monitors is done by partially passing, which entails that each concerned monitor needs to have a node object. However, for waiting on the condition, since all threads need to wait together, a single object needs to be queued in the condition. Moving out the condition and updating the node types yields :
    623 
    624 \begin{center}
    625 {\resizebox{0.8\textwidth}{!}{\input{int_monitor}}}
    626 \end{center}
    627 
    628 \newpage
    629 
    630 This picture and the proper entry and leave algorithms are the fundamental implementation of internal scheduling.
    631 
    632 \begin{multicols}{2}
    633 Entry
    634 \begin{pseudo}[numbers=left]
    635 if monitor is free
    636         enter
    637 elif I already own the monitor
    638         continue
    639 else
    640         block
    641 increment recursion
    642 
    643 \end{pseudo}
    644 \columnbreak
    645 Exit
    646 \begin{pseudo}[numbers=left, firstnumber=8]
    647 decrement recursion
    648 if recursion == 0
    649         if signal_stack not empty
    650                 set_owner to thread
    651                 if all monitors ready
    652                         wake-up thread
    653 
    654         if entry queue not empty
    655                 wake-up thread
    656 \end{pseudo}
    657 \end{multicols}
    658 
    659 There are some important things to notice about the exit routine. The solution discussed in \ref{inschedimpl} can be seen on line 11 of the previous pseudo code. Basically, the solution boils down to having a separate data structure for the condition queue and the AS-stack, and unconditionally transferring ownership of the monitors but only unblocking the thread when the last monitor has transferred ownership. This solution is safe as well as preventing any potential barging.
     695\caption{Dating service example using \code{signal} and \code{signal_block}. }
     696\label{lst:datingservice}
     697\end{figure}
    660698
    661699% ======================================================================
     
    700738\end{tabular}
    701739\end{center}
    702 This method is more constrained and explicit, which may help users tone down the undeterministic nature of concurrency. Indeed, as the following examples demonstrates, external scheduling allows users to wait for events from other threads without the concern of unrelated events occuring. External scheduling can generally be done either in terms of control flow (e.g., \uC) or in terms of data (e.g. Go). Of course, both of these paradigms have their own strenghts and weaknesses but for this project control-flow semantics were chosen to stay consistent with the rest of the languages semantics. Two challenges specific to \CFA arise when trying to add external scheduling with loose object definitions and multi-monitor routines. The previous example shows a simple use \code{_Accept} versus \code{wait}/\code{signal} and its advantages. Note that while other languages often use \code{accept} as the core external scheduling keyword, \CFA uses \code{waitfor} to prevent name collisions with existing socket APIs.
    703 
    704 In the case of internal scheduling, the call to \code{wait} only guarantees that \code{V} is the last routine to access the monitor. This entails that the routine \code{V} may have acquired mutual exclusion several times while routine \code{P} was waiting. On the other hand, external scheduling guarantees that while routine \code{P} was waiting, no routine other than \code{V} could acquire the monitor.
     740This method is more constrained and explicit, which helps users tone down the non-deterministic nature of concurrency. Indeed, as the following examples demonstrate, external scheduling allows users to wait for events from other threads without the concern of unrelated events occurring. External scheduling can generally be done either in terms of control flow (e.g., \uC with \code{_Accept}) or in terms of data (e.g. Go with channels). Of course, both of these paradigms have their own strengths and weaknesses but for this project control-flow semantics were chosen to stay consistent with the rest of the language's semantics. Two challenges specific to \CFA arise when trying to add external scheduling with loose object definitions and multi-monitor routines. The previous example shows a simple use of \code{_Accept} versus \code{wait}/\code{signal} and its advantages. Note that while other languages often use \code{accept}/\code{select} as the core external scheduling keyword, \CFA uses \code{waitfor} to prevent name collisions with existing socket \acrshort{api}s.
     741
     742In the case of internal scheduling, the call to \code{wait} only guarantees that \code{V} is the last routine to access the monitor. This entails that a third routine, say \code{isInUse()}, may have acquired mutual exclusion several times while routine \code{P} was waiting. On the other hand, external scheduling guarantees that while routine \code{P} was waiting, no routine other than \code{V} could acquire the monitor.
    705743
    706744% ======================================================================
     
    712750
    713751\begin{cfacode}
    714         monitor A {};
    715 
    716         void f(A & mutex a);
    717         void f(int a, float b);
    718         void g(A & mutex a) {
    719                 waitfor(f); // Less obvious which f() to wait for
    720         }
     752monitor A {};
     753
     754void f(A & mutex a);
     755void g(A & mutex a) {
     756        waitfor(f); //Obvious which f() to wait for
     757}
     758
     759void f(A & mutex a, int); //New different F added in scope
     760void h(A & mutex a) {
     761        waitfor(f); //Less obvious which f() to wait for
     762}
    721763\end{cfacode}
    722764
     
    728770        if monitor is free
    729771                enter
    730         elif I already own the monitor
     772        elif already own the monitor
    731773                continue
    732774        elif monitor accepts me
     
    738780\end{center}
    739781
    740 For the fist two conditions, it is easy to implement a check that can evaluate the condition in a few instruction. However, a fast check for \pscode{monitor accepts me} is much harder to implement depending on the constraints put on the monitors. Indeed, monitors are often expressed as an entry queue and some acceptor queue as in the following figure:
     782For the first two conditions, it is easy to implement a check that can evaluate the condition in a few instructions. However, a fast check for \pscode{monitor accepts me} is much harder to implement depending on the constraints put on the monitors. Indeed, monitors are often expressed as an entry queue and some acceptor queue as in the following figure:
    741783
    742784\begin{center}
     
    744786\end{center}
    745787
    746 There are other alternatives to these pictures but in the case of this picture implementing a fast accept check is relatively easy. Indeed simply updating a bitmask when the acceptor queue changes is enough to have a check that executes in a single instruction, even with a fairly large number (e.g. 128) of mutex members. This technique cannot be used in \CFA because it relies on the fact that the monitor type declares all the acceptable routines. For OO languages this does not compromise much since monitors already have an exhaustive list of member routines. However, for \CFA this is not the case; routines can be added to a type anywhere after its declaration. Its important to note that the bitmask approach does not actually require an exhaustive list of routines, but it requires a dense unique ordering of routines with an upper-bound and that ordering must be consistent across translation units.
    747 The alternative would be to have a picture more like this one:
     788There are other alternatives to these pictures, but in the case of this picture, implementing a fast accept check is relatively easy. Indeed simply updating a bitmask when the acceptor queue changes is enough to have a check that executes in a single instruction, even with a fairly large number (e.g. 128) of mutex members. This technique cannot be used in \CFA because it relies on the fact that the monitor type declares all the acceptable routines. For OO languages this does not compromise much since monitors already have an exhaustive list of member routines. However, for \CFA this is not the case; routines can be added to a type anywhere after its declaration. It's important to note that the bitmask approach does not actually require an exhaustive list of routines, but it requires a dense unique ordering of routines with an upper-bound and that ordering must be consistent across translation units.
     789The alternative is to have a picture like this one:
    748790
    749791\begin{center}
     
    751793\end{center}
    752794
    753 Not storing the queues inside the monitor means that the storage can vary between routines, allowing for more flexibility and extensions. Storing an array of function-pointers would solve the issue of uniquely identifying acceptable routines. However, the single instruction bitmask compare has been replaced by dereferencing a pointer followed by a linear search. Furthermore, supporting nested external scheduling may now require additionnal searches on calls to waitfor to check if a routine is already queued in.
    754 
    755 At this point we must make a decision between flexibility and performance. Many design decisions in \CFA achieve both flexibility and performance, for example polymorphic routines add significant flexibility but inlining them means the optimizer can easily remove any runtime cost. Here however, the cost of flexibility cannot be trivially removed. In the end, the most flexible approach has been chosen since it allows users to write programs that would otherwise be prohibitively hard to write. This is based on the assumption that writing fast but inflexible locks is closer to a solved problems than writing locks that are as flexible as external scheduling in \CFA.
    756 
    757 Another aspect to consider is what happens if multiple overloads of the same routine are used. For the time being it is assumed that multiple overloads of the same routine are considered as distinct routines. However, this could easily be extended in the future.
     795Not storing the mask inside the monitor means that the storage for the mask information can vary between calls to \code{waitfor}, allowing for more flexibility and extensions. Storing an array of function-pointers would solve the issue of uniquely identifying acceptable routines. However, the single instruction bitmask compare has been replaced by dereferencing a pointer followed by a linear search. Furthermore, supporting nested external scheduling may now require additional searches on calls to \code{waitfor} to check if a routine is already queued in.
     796
     797Note that in the second picture, tasks always need to keep track of which routine they are using to attempt to acquire the monitor, and the routine mask needs to have both a function pointer and a set of monitors, as will be discussed in the next section. These details were omitted from the picture for the sake of simplifying the representation.
     798
     799At this point we must make a decision between flexibility and performance. Many design decisions in \CFA achieve both flexibility and performance, for example polymorphic routines add significant flexibility but inlining them means the optimizer can easily remove any runtime cost. Here however, the cost of flexibility cannot be trivially removed. In the end, the most flexible approach has been chosen since it allows users to write programs that would otherwise be prohibitively hard to write. This decision is based on the assumption that writing fast but inflexible locks is closer to a solved problem than writing locks that are as flexible as external scheduling in \CFA.
    758800
    759801% ======================================================================
     
    763805% ======================================================================
    764806
    765 External scheduling, like internal scheduling, becomes orders of magnitude more complex when we start introducing multi-monitor syntax. Even in the simplest possible case some new semantics need to be established:
    766 \begin{cfacode}
    767         mutex struct A {};
    768 
    769         mutex struct B {};
    770 
    771         void g(A & mutex a, B & mutex b) {
    772                 waitfor(f); //ambiguous, which monitor
    773         }
     807External scheduling, like internal scheduling, becomes significantly more complex when introducing multi-monitor syntax. Even in the simplest possible case, some new semantics need to be established:
     808\begin{cfacode}
     809monitor M {};
     810
     811void f(M & mutex a);
     812
     813void g(M & mutex a, M & mutex b) {
     814        waitfor(f); //ambiguous, keep a pass b or other way around?
     815}
    774816\end{cfacode}
    775817
     
    777819
    778820\begin{cfacode}
    779         mutex struct A {};
    780 
    781         mutex struct B {};
    782 
    783         void g(A & mutex a, B & mutex b) {
    784                 waitfor( f, b );
    785         }
    786 \end{cfacode}
    787 
    788 This is unambiguous. Both locks will be acquired and kept, when routine \code{f} is called the lock for monitor \code{b} will be temporarily transferred from \code{g} to \code{f} (while \code{g} still holds lock \code{a}). This behavior can be extended to multi-monitor waitfor statment as follows.
    789 
    790 \begin{cfacode}
    791         mutex struct A {};
    792 
    793         mutex struct B {};
    794 
    795         void g(A & mutex a, B & mutex b) {
    796                 waitfor( f, a, b);
    797         }
    798 \end{cfacode}
    799 
    800 Note that the set of monitors passed to the \code{waitfor} statement must be entirely contained in the set of monitor already acquired in the routine. \code{waitfor} used in any other context is Undefined Behaviour.
    801 
    802 An important behavior to note is that what happens when set of monitors only match partially :
    803 
    804 \begin{cfacode}
    805         mutex struct A {};
    806 
    807         mutex struct B {};
    808 
    809         void g(A & mutex a, B & mutex b) {
    810                 waitfor(f, a, b);
    811         }
    812 
    813         A a1, a2;
    814         B b;
    815 
    816         void foo() {
    817                 g(a1, b);
    818         }
    819 
    820         void bar() {
    821                 f(a2, b);
    822         }
    823 \end{cfacode}
    824 
    825 While the equivalent can happen when using internal scheduling, the fact that conditions are branded on first use means that users have to use two different condition variables. In both cases, partially matching monitor sets will not wake-up the waiting thread. It is also important to note that in the case of external scheduling, as for routine calls, the order of parameters is important; \code{waitfor(f,a,b)} and \code{waitfor(f,b,a)} are to distinct waiting condition.
    826 
    827 % ======================================================================
    828 % ======================================================================
    829 \subsection{Implementation Details: External scheduling queues}
    830 % ======================================================================
    831 % ======================================================================
    832 To support multi-monitor external scheduling means that some kind of entry-queues must be used that is aware of both monitors. However, acceptable routines must be aware of the entry queues which means they must be stored inside at least one of the monitors that will be acquired. This in turn adds the requirement a systematic algorithm of disambiguating which queue is relavant regardless of user ordering. The proposed algorithm is to fall back on monitors lock ordering and specify that the monitor that is acquired first is the lock with the relevant entry queue. This assumes that the lock acquiring order is static for the lifetime of all concerned objects but that is a reasonable constraint. This algorithm choice has two consequences, the entry queue of the highest priority monitor is no longer a true FIFO queue and the queue of the lowest priority monitor is both required and probably unused. The queue can no longer be a FIFO queue because instead of simply containing the waiting threads in order arrival, they also contain the second mutex. Therefore, another thread with the same highest priority monitor but a different lowest priority monitor may arrive first but enter the critical section after a thread with the correct pairing. Secondly, since it may not be known at compile time which monitor will be the lowest priority monitor, every monitor needs to have the correct queues even though it is probable that half the multi-monitor queues will go unused for the entire duration of the program.
    833 
    834 % ======================================================================
    835 % ======================================================================
    836 \section{Other concurrency tools}
    837 % ======================================================================
    838 % ======================================================================
    839 % \TODO
     821monitor M {};
     822
     823void f(M & mutex a);
     824
     825void g(M & mutex a, M & mutex b) {
     826        waitfor( f, b );
     827}
     828\end{cfacode}
     829
     830This syntax is unambiguous. Both locks are acquired and kept. When routine \code{f} is called, the lock for monitor \code{b} is temporarily transferred from \code{g} to \code{f} (while \code{g} still holds lock \code{a}). This behavior can be extended to multi-monitor waitfor statement as follows.
     831
     832\begin{cfacode}
     833monitor M {};
     834
     835void f(M & mutex a, M & mutex b);
     836
     837void g(M & mutex a, M & mutex b) {
     838        waitfor( f, a, b);
     839}
     840\end{cfacode}
     841
     842Note that the set of monitors passed to the \code{waitfor} statement must be entirely contained in the set of monitors already acquired in the routine. \code{waitfor} used in any other context is Undefined Behaviour.
     843
     844An important behavior to note is what happens when a set of monitors only matches partially:
     845
     846\begin{cfacode}
     847mutex struct A {};
     848
     849mutex struct B {};
     850
     851void g(A & mutex a, B & mutex b) {
     852        waitfor(f, a, b);
     853}
     854
     855A a1, a2;
     856B b;
     857
     858void foo() {
     859        g(a1, b); //block on accept
     860}
     861
     862void bar() {
     863        f(a2, b); //fulfill cooperation
     864}
     865\end{cfacode}
     866
     867While the equivalent can happen when using internal scheduling, the fact that conditions are specific to a set of monitors means that users have to use two different condition variables. In both cases, partially matching monitor sets does not wake up the waiting thread. It is also important to note that in the case of external scheduling, as for routine calls, the order of parameters is important; \code{waitfor(f,a,b)} and \code{waitfor(f,b,a)} are two distinct waiting conditions.
     868
     869% ======================================================================
     870% ======================================================================
     871\subsection{\code{waitfor} semantics}
     872% ======================================================================
     873% ======================================================================
     874
     875Syntactically, the \code{waitfor} statement takes a function identifier and a set of monitors. While the set of monitors can be any list of expressions, the function name is more restricted. This is because the compiler validates at compile time the validity of the waitfor statement. It checks that the set of monitors passed in matches the requirements for a function call. Listing \ref{lst:waitfor} shows various usages of the waitfor statement and which of them are acceptable. The choice of the function type is made ignoring any non-\code{mutex} parameter. One limitation of the current implementation is that it does not handle overloading.
     876\begin{figure}
     877\begin{cfacode}
     878monitor A{};
     879monitor B{};
     880
     881void f1( A & mutex );
     882void f2( A & mutex, B & mutex );
     883void f3( A & mutex, int );
     884void f4( A & mutex, int );
     885void f4( A & mutex, double );
     886
     887void foo( A & mutex a1, A & mutex a2, B & mutex b1, B & b2 ) {
     888        A * ap = & a1;
     889        void (*fp)( A & mutex ) = f1;
     890
     891        waitfor(f1, a1);     //Correct : 1 monitor case
     892        waitfor(f2, a1, b1); //Correct : 2 monitor case
     893        waitfor(f3, a1);     //Correct : non-mutex arguments are ignored
     894        waitfor(f1, *ap);    //Correct : expression as argument
     895
     896        waitfor(f1, a1, b1); //Incorrect : Too many mutex arguments
     897        waitfor(f2, a1);     //Incorrect : Too few mutex arguments
     898        waitfor(f2, a1, a2); //Incorrect : Mutex arguments don't match
     899        waitfor(f1, 1);      //Incorrect : 1 not a mutex argument
     900        waitfor(f9, a1);     //Incorrect : f9 not a function
     901        waitfor(*fp, a1 );   //Incorrect : fp not a identifier
     902        waitfor(f4, a1);     //Incorrect : f4 ambiguous
     903
     904        waitfor(f2, a1, b2); //Undefined Behaviour : b2 may not be acquired
     905}
     906\end{cfacode}
     907\caption{Various correct and incorrect uses of the waitfor statement}
     908\label{lst:waitfor}
     909\end{figure}
     910
     911Finally, for added flexibility, \CFA supports constructing complex waitfor masks using the \code{or}, \code{timeout} and \code{else}. Indeed, multiple \code{waitfor} can be chained together using \code{or}; this chain will form a single statement which will baton-pass to any one function that fits one of the function+monitor sets which were passed in. To enable users to tell which was the accepted function, \code{waitfor}s are followed by a statement (including the null statement \code{;}) or a compound statement. When multiple \code{waitfor} are chained together, only the statement corresponding to the accepted function is executed. A \code{waitfor} chain can also be followed by a \code{timeout}, to signify an upper bound on the wait, or an \code{else}, to signify that the call should be non-blocking, that is, only check if a matching function call has already arrived and return immediately otherwise. Any and all of these clauses can be preceded by a \code{when} condition to dynamically construct the mask based on some current state. Listing \ref{lst:waitfor2} demonstrates several complex masks and some incorrect ones.
     912
     913\begin{figure}
     914\begin{cfacode}
     915monitor A{};
     916
     917void f1( A & mutex );
     918void f2( A & mutex );
     919
     920void foo( A & mutex a, bool b, int t ) {
     921        //Correct : blocking case
     922        waitfor(f1, a);
     923
     924        //Correct : block with statement
     925        waitfor(f1, a) {
     926                sout | "f1" | endl;
     927        }
     928
     929        //Correct : block waiting for f1 or f2
     930        waitfor(f1, a) {
     931                sout | "f1" | endl;
     932        } or waitfor(f2, a) {
     933                sout | "f2" | endl;
     934        }
     935
     936        //Correct : non-blocking case
     937        waitfor(f1, a); or else;
     938
     939        //Correct : non-blocking case
     940        waitfor(f1, a) {
     941                sout | "blocked" | endl;
     942        } or else {
     943                sout | "didn't block" | endl;
     944        }
     945
     946        //Correct : block at most 10 seconds
     947        waitfor(f1, a) {
     948                sout | "blocked" | endl;
     949        } or timeout( 10`s) {
     950                sout | "didn't block" | endl;
     951        }
     952
     953        //Correct : block only if b == true
     954        //if b == false, don't even make the call
     955        when(b) waitfor(f1, a);
     956
     957        //Correct : block only if b == true
     958        //if b == false, make non-blocking call
     959        waitfor(f1, a); or when(!b) else;
     960
     961        //Correct : block only if t > 1
     962        waitfor(f1, a); or when(t > 1) timeout(t); or else;
     963
     964        //Incorrect : timeout clause is dead code
     965        waitfor(f1, a); or timeout(t); or else;
     966
     967        //Incorrect : order must be
     968        //waitfor [or waitfor... [or timeout] [or else]]
     969        timeout(t); or waitfor(f1, a); or else;
     970}
     971\end{cfacode}
     972\caption{Various correct and incorrect uses of the or, else, and timeout clause around a waitfor statement}
     973\label{lst:waitfor2}
     974\end{figure}
  • doc/proposals/concurrency/text/intro.tex

    rb96ec83 r6840e7c  
    33% ======================================================================
    44
    5 This proposal provides a minimal concurrency API that is simple, efficient and can be reused to build higher-level features. The simplest possible concurrency system is a thread and a lock but this low-level approach is hard to master. An easier approach for users is to support higher-level constructs as the basis of the concurrency, in \CFA. Indeed, for highly productive parallel programming, high-level approaches are much more popular~\cite{HPP:Study}. Examples are task based, message passing and implicit threading. Therefore a high-level approach is adapted in \CFA
     5This thesis provides a minimal concurrency \acrshort{api} that is simple, efficient and can be reused to build higher-level features. The simplest possible concurrency system is a thread and a lock but this low-level approach is hard to master. An easier approach for users is to support higher-level constructs as the basis of concurrency. Indeed, for highly productive concurrent programming, high-level approaches are much more popular~\cite{HPP:Study}. Examples are task based, message passing and implicit threading. The high-level approach and its minimal \acrshort{api} are tested in a dialect of C, called \CFA. [Is there value to say that this thesis is also an early definition of the \CFA language and library in regards to concurrency?]
    66
    7 There are actually two problems that need to be solved in the design of concurrency for a programming language: which concurrency and which parallelism tools are available to the users. While these two concepts are often combined, they are in fact distinct concepts that require different tools~\cite{Buhr05a}. Concurrency tools need to handle mutual exclusion and synchronization, while parallelism tools are about performance, cost and resource utilization.
     7There are actually two problems that need to be solved in the design of concurrency for a programming language: which concurrency and which parallelism tools are available to the programmer. While these two concepts are often combined, they are in fact distinct, requiring different tools~\cite{Buhr05a}. Concurrency tools need to handle mutual exclusion and synchronization, while parallelism tools are about performance, cost and resource utilization.
  • doc/proposals/concurrency/text/parallelism.tex

    rb96ec83 r6840e7c  
    1111\section{Paradigm}
    1212\subsection{User-level threads}
    13 A direct improvement on the \gls{kthread} approach is to use \glspl{uthread}. These threads offer most of the same features that the operating system already provide but can be used on a much larger scale. This approach is the most powerfull solution as it allows all the features of multi-threading, while removing several of the more expensives costs of using kernel threads. The down side is that almost none of the low-level threading problems are hidden, users still have to think about data races, deadlocks and synchronization issues. These issues can be somewhat alleviated by a concurrency toolkit with strong garantees but the parallelism toolkit offers very little to reduce complexity in itself.
     13A direct improvement on the \gls{kthread} approach is to use \glspl{uthread}. These threads offer most of the same features that the operating system already provides but can be used on a much larger scale. This approach is the most powerful solution as it allows all the features of multi-threading, while removing several of the more expensive costs of kernel threads. The down side is that almost none of the low-level threading problems are hidden; users still have to think about data races, deadlocks and synchronization issues. These issues can be somewhat alleviated by a concurrency toolkit with strong guarantees but the parallelism toolkit offers very little to reduce complexity in itself.
    1414
    1515Examples of languages that support \glspl{uthread} are Erlang~\cite{Erlang} and \uC~\cite{uC++book}.
    1616
    1717\subsection{Fibers : user-level threads without preemption}
    18 A popular varient of \glspl{uthread} is what is often reffered to as \glspl{fiber}. However, \glspl{fiber} do not present meaningful semantical differences with \glspl{uthread}. Advocates of \glspl{fiber} list their high performance and ease of implementation as majors strenghts of \glspl{fiber} but the performance difference between \glspl{uthread} and \glspl{fiber} is controversial and the ease of implementation, while true, is a weak argument in the context of language design. Therefore this proposal largely ignore fibers.
     18A popular variant of \glspl{uthread} is what is often referred to as \glspl{fiber}. However, \glspl{fiber} do not present meaningful semantical differences with \glspl{uthread}. Advocates of \glspl{fiber} list their high performance and ease of implementation as major strengths of \glspl{fiber} but the performance difference between \glspl{uthread} and \glspl{fiber} is controversial, and the ease of implementation, while true, is a weak argument in the context of language design. Therefore this proposal largely ignores fibers.
    1919
    2020An example of a language that uses fibers is Go~\cite{Go}
    2121
    2222\subsection{Jobs and thread pools}
    23 The approach on the opposite end of the spectrum is to base parallelism on \glspl{pool}. Indeed, \glspl{pool} offer limited flexibility but at the benefit of a simpler user interface. In \gls{pool} based systems, users express parallelism as units of work and a dependency graph (either explicit or implicit) that tie them together. This approach means users need not worry about concurrency but significantly limits the interaction that can occur among jobs. Indeed, any \gls{job} that blocks also blocks the underlying worker, which effectively means the CPU utilization, and therefore throughput, suffers noticeably. It can be argued that a solution to this problem is to use more workers than available cores. However, unless the number of jobs and the number of workers are comparable, having a significant amount of blocked jobs always results in idles cores.
     23An approach on the opposite end of the spectrum is to base parallelism on \glspl{pool}. Indeed, \glspl{pool} offer limited flexibility but at the benefit of a simpler user interface. In \gls{pool} based systems, users express parallelism as units of work, called jobs, and a dependency graph (either explicit or implicit) that ties them together. This approach means users need not worry about concurrency but significantly limits the interaction that can occur among jobs. Indeed, any \gls{job} that blocks also blocks the underlying worker, which effectively means the CPU utilization, and therefore throughput, suffers noticeably. It can be argued that a solution to this problem is to use more workers than available cores. However, unless the number of jobs and the number of workers are comparable, having a significant amount of blocked jobs always results in idle cores.
    2424
    2525The gold standard of this implementation is Intel's TBB library~\cite{TBB}.
    2626
    2727\subsection{Paradigm performance}
    28 While the choice between the three paradigms listed above may have significant performance implication, it is difficult to pindown the performance implications of chosing a model at the language level. Indeed, in many situations one of these paradigms may show better performance but it all strongly depends on the workload. Having a large amount of mostly independent units of work to execute almost guarantess that the \gls{pool} based system has the best performance thanks to the lower memory overhead. However, interactions between jobs can easily exacerbate contention. User-level threads allow fine-grain context switching, which results in better resource utilisation, but context switches will be more expansive and the extra control means users need to tweak more variables to get the desired performance. Furthermore, if the units of uninterrupted work are large enough the paradigm choice is largely amorticised by the actual work done.
      28While the choice between the three paradigms listed above may have significant performance implication, it is difficult to pin down the performance implications of choosing a model at the language level. Indeed, in many situations one of these paradigms may show better performance but it all strongly depends on the workload. Having a large amount of mostly independent units of work to execute almost guarantees that the \gls{pool} based system has the best performance thanks to the lower memory overhead (i.e., no thread stack per job). However, interactions among jobs can easily exacerbate contention. User-level threads allow fine-grain context switching, which results in better resource utilisation, but a context switch is more expensive and the extra control means users need to tweak more variables to get the desired performance. Finally, if the units of uninterrupted work are large enough the paradigm choice is largely amortised by the actual work done.
    2929
    30 \newpage
    3130\TODO
    32 \subsection{The \protect\CFA\ Kernel : Processors, Clusters and Threads}\label{kernel}
     31
     32\section{The \protect\CFA\ Kernel : Processors, Clusters and Threads}\label{kernel}
    3333
    3434
     35\subsection{Future Work: Machine setup}\label{machine}
      36While this was not done in the context of this thesis, another important aspect of clusters is affinity. While many common desktop and laptop PCs have homogeneous CPUs, other devices often have more heterogeneous setups. For example, systems using \acrshort{numa} configurations may benefit from users being able to tie clusters and/or kernel threads to certain CPU cores. OS support for CPU affinity is now common \cit, which means it is both possible and desirable for \CFA to offer an abstraction mechanism for portable CPU affinity.
     37
    3538\subsection{Paradigms}\label{cfaparadigms}
    36 Given these building blocks we can then reproduce the all three of the popular paradigms. Indeed, we get \glspl{uthread} as the default paradigm in \CFA. However, disabling \glspl{preemption} on the \gls{cfacluster} means \glspl{cfathread} effectively become \glspl{fiber}. Since several \glspl{cfacluster} with different scheduling policy can coexist in the same application, this allows \glspl{fiber} and \glspl{uthread} to coexist in the runtime of an application.
    37 
    38 % \subsection{High-level options}\label{tasks}
    39 %
    40 % \subsubsection{Thread interface}
    41 % constructors destructors
    42 %       initializer lists
    43 % monitors
    44 %
    45 % \subsubsection{Futures}
    46 %
    47 % \subsubsection{Implicit threading}
    48 % Finally, simpler applications can benefit greatly from having implicit parallelism. That is, parallelism that does not rely on the user to write concurrency. This type of parallelism can be achieved both at the language level and at the system level.
    49 %
    50 % \begin{center}
    51 % \begin{tabular}[t]{|c|c|c|}
    52 % Sequential & System Parallel & Language Parallel \\
    53 % \begin{lstlisting}
    54 % void big_sum(int* a, int* b,
    55 %                int* out,
    56 %                size_t length)
    57 % {
    58 %       for(int i = 0; i < length; ++i ) {
    59 %               out[i] = a[i] + b[i];
    60 %       }
    61 % }
    62 %
    63 %
    64 %
    65 %
    66 %
    67 % int* a[10000];
    68 % int* b[10000];
    69 % int* c[10000];
    70 % //... fill in a and b ...
    71 % big_sum(a, b, c, 10000);
    72 % \end{lstlisting} &\begin{lstlisting}
    73 % void big_sum(int* a, int* b,
    74 %                int* out,
    75 %                size_t length)
    76 % {
    77 %       range ar(a, a + length);
    78 %       range br(b, b + length);
    79 %       range or(out, out + length);
    80 %       parfor( ai, bi, oi,
    81 %       [](int* ai, int* bi, int* oi) {
    82 %               oi = ai + bi;
    83 %       });
    84 % }
    85 %
    86 % int* a[10000];
    87 % int* b[10000];
    88 % int* c[10000];
    89 % //... fill in a and b ...
    90 % big_sum(a, b, c, 10000);
    91 % \end{lstlisting}&\begin{lstlisting}
    92 % void big_sum(int* a, int* b,
    93 %                int* out,
    94 %                size_t length)
    95 % {
    96 %       for (ai, bi, oi) in (a, b, out) {
    97 %               oi = ai + bi;
    98 %       }
    99 % }
    100 %
    101 %
    102 %
    103 %
    104 %
    105 % int* a[10000];
    106 % int* b[10000];
    107 % int* c[10000];
    108 % //... fill in a and b ...
    109 % big_sum(a, b, c, 10000);
    110 % \end{lstlisting}
    111 % \end{tabular}
    112 % \end{center}
    113 %
    114 % \subsection{Machine setup}\label{machine}
    115 % Threads are all good and well but wee still some OS support to fully utilize available hardware.
    116 %
    117 % \textbf{\large{Work in progress...}} Do wee need something beyond specifying the number of kernel threads?
     39Given these building blocks, it is possible to reproduce all three of the popular paradigms. Indeed, \glspl{uthread} is the default paradigm in \CFA. However, disabling \glspl{preemption} on the \gls{cfacluster} means \glspl{cfathread} effectively become \glspl{fiber}. Since several \glspl{cfacluster} with different scheduling policy can coexist in the same application, this allows \glspl{fiber} and \glspl{uthread} to coexist in the runtime of an application. Finally, it is possible to build executors for thread pools from \glspl{uthread} or \glspl{fiber}.
  • doc/proposals/concurrency/thesis.tex

    rb96ec83 r6840e7c  
    11% requires tex packages: texlive-base texlive-latex-base tex-common texlive-humanities texlive-latex-extra texlive-fonts-recommended
    22
    3 % inline code ©...© (copyright symbol) emacs: C-q M-)
    4 % red highlighting ®...® (registered trademark symbol) emacs: C-q M-.
    5 % blue highlighting ß...ß (sharp s symbol) emacs: C-q M-_
    6 % green highlighting ¢...¢ (cent symbol) emacs: C-q M-"
    7 % LaTex escape §...§ (section symbol) emacs: C-q M-'
    8 % keyword escape ¶...¶ (pilcrow symbol) emacs: C-q M-^
      3% inline code ©...© (copyright symbol) emacs: C-q M-)
      4% red highlighting ®...® (registered trademark symbol) emacs: C-q M-.
      5% blue highlighting ß...ß (sharp s symbol) emacs: C-q M-_
      6% green highlighting ¢...¢ (cent symbol) emacs: C-q M-"
      7% LaTeX escape §...§ (section symbol) emacs: C-q M-'
      8% keyword escape ¶...¶ (pilcrow symbol) emacs: C-q M-^
    99% math escape $...$ (dollar symbol)
    1010
     
    2727\usepackage{multicol}
    2828\usepackage[acronym]{glossaries}
    29 \usepackage{varioref}   
     29\usepackage{varioref}
    3030\usepackage{listings}                                           % format program code
    3131\usepackage[flushmargin]{footmisc}                              % support label/reference in footnote
     
    7070%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    7171
    72 \setcounter{secnumdepth}{3}                           % number subsubsections
    73 \setcounter{tocdepth}{3}                              % subsubsections in table of contents
     72\setcounter{secnumdepth}{2}                           % number subsubsections
     73\setcounter{tocdepth}{2}                              % subsubsections in table of contents
    7474% \linenumbers                                          % comment out to turn off line numbering
    7575\makeindex
     
    103103\input{parallelism}
    104104
    105 \chapter{Putting it all together}
     105\input{internals}
     106
     107\input{together}
     108
     109\input{future}
    106110
    107111\chapter{Conclusion}
    108 
    109 \chapter{Future work}
    110 Concurrency and parallelism is still a very active field that strongly benefits from hardware advances. As such certain features that aren't necessarily mature enough in their current state could become relevant in the lifetime of \CFA.
    111 \subsection{Transactions}
    112112
    113113\section*{Acknowledgements}
  • doc/proposals/concurrency/version

    rb96ec83 r6840e7c  
    1 0.9.180
     10.10.212
  • src/CodeGen/CodeGenerator.cc

    rb96ec83 r6840e7c  
    287287        void CodeGenerator::postvisit( TypeDecl * typeDecl ) {
    288288                assertf( ! genC, "TypeDecls should not reach code generation." );
    289                 output << typeDecl->genTypeString() << " " << typeDecl->get_name();
    290                 if ( typeDecl->get_kind() != TypeDecl::Any && typeDecl->get_sized() ) {
    291                         output << " | sized(" << typeDecl->get_name() << ")";
    292                 }
    293                 if ( ! typeDecl->get_assertions().empty() ) {
     289                output << typeDecl->genTypeString() << " " << typeDecl->name;
     290                if ( typeDecl->get_kind() != TypeDecl::Any && typeDecl->sized ) {
     291                        output << " | sized(" << typeDecl->name << ")";
     292                }
     293                if ( ! typeDecl->assertions.empty() ) {
    294294                        output << " | { ";
    295                         genCommaList( typeDecl->get_assertions().begin(), typeDecl->get_assertions().end() );
     295                        for ( DeclarationWithType * assert :  typeDecl->assertions ) {
     296                                assert->accept( *visitor );
     297                                output << "; ";
     298                        }
    296299                        output << " }";
    297300                }
     
    946949                output << ";";
    947950        }
     951        void CodeGenerator::postvisit( CatchStmt * stmt ) {
     952                assertf( ! genC, "Catch statements should not reach code generation." );
     953
     954                output << ((stmt->get_kind() == CatchStmt::Terminate) ?
     955                "catch" : "catchResume");
     956                output << "( ";
     957                stmt->decl->accept( *visitor );
     958                output << " ) ";
     959
     960                if( stmt->cond ) {
     961                        output << "if/when(?) (";
     962                        stmt->cond->accept( *visitor );
     963                        output << ") ";
     964                }
     965                stmt->body->accept( *visitor );
     966        }
     967
     968        void CodeGenerator::postvisit( WaitForStmt * stmt ) {
     969                assertf( ! genC, "Waitfor statements should not reach code generation." );
     970
     971                bool first = true;
     972                for( auto & clause : stmt->clauses ) {
     973                        if(first) { output << "or "; first = false; }
     974                        if( clause.condition ) {
     975                                output << "when(";
     976                                stmt->timeout.condition->accept( *visitor );
     977                                output << ") ";
     978                        }
     979                        output << "waitfor(";
     980                        clause.target.function->accept( *visitor );
     981                        for( Expression * expr : clause.target.arguments ) {
     982                                output << ",";
     983                                expr->accept( *visitor );
     984                        }
     985                        output << ") ";
     986                        clause.statement->accept( *visitor );
     987                }
     988
     989                if( stmt->timeout.statement ) {
     990                        output << "or ";
     991                        if( stmt->timeout.condition ) {
     992                                output << "when(";
     993                                stmt->timeout.condition->accept( *visitor );
     994                                output << ") ";
     995                        }
     996                        output << "timeout(";
     997                        stmt->timeout.time->accept( *visitor );
     998                        output << ") ";
     999                        stmt->timeout.statement->accept( *visitor );
     1000                }
     1001
     1002                if( stmt->orelse.statement ) {
     1003                        output << "or ";
     1004                        if( stmt->orelse.condition ) {
     1005                                output << "when(";
     1006                                stmt->orelse.condition->accept( *visitor );
     1007                                output << ")";
     1008                        }
     1009                        output << "else ";
     1010                        stmt->orelse.statement->accept( *visitor );
     1011                }
     1012        }
     1013
    9481014
    9491015        void CodeGenerator::postvisit( WhileStmt * whileStmt ) {
     
    10241090        }
    10251091} // namespace CodeGen
     1092
     1093
     1094unsigned Indenter::tabsize = 2;
    10261095
    10271096std::ostream & operator<<( std::ostream & out, const BaseSyntaxNode * node ) {
  • src/CodeGen/CodeGenerator.h

    rb96ec83 r6840e7c  
    100100                void postvisit( ReturnStmt * );
    101101                void postvisit( ThrowStmt * );
     102                void postvisit( CatchStmt * );
     103                void postvisit( WaitForStmt * );
    102104                void postvisit( WhileStmt * );
    103105                void postvisit( ForStmt * );
  • src/CodeGen/FixNames.cc

    rb96ec83 r6840e7c  
    6666                );
    6767
    68                 mainDecl->get_functionType()->get_parameters().push_back(
     68                main_type->get_parameters().push_back(
    6969                        new ObjectDecl( "", Type::StorageClasses(), LinkageSpec::Cforall, 0, new BasicType( Type::Qualifiers(), BasicType::SignedInt ), nullptr )
    7070                );
    7171
    72                 mainDecl->get_functionType()->get_parameters().push_back(
     72                main_type->get_parameters().push_back(
    7373                        new ObjectDecl( "", Type::StorageClasses(), LinkageSpec::Cforall, 0,
    7474                        new PointerType( Type::Qualifiers(), new PointerType( Type::Qualifiers(), new BasicType( Type::Qualifiers(), BasicType::Char ) ) ),
  • src/CodeGen/GenType.cc

    rb96ec83 r6840e7c  
    210210
    211211        std::string GenType::handleGeneric( ReferenceToType * refType ) {
    212                 if ( ! refType->get_parameters().empty() ) {
     212                if ( ! refType->parameters.empty() ) {
    213213                        std::ostringstream os;
    214214                        PassVisitor<CodeGenerator> cg( os, pretty, genC, lineMarks );
    215215                        os << "(";
    216                         cg.pass.genCommaList( refType->get_parameters().begin(), refType->get_parameters().end() );
     216                        cg.pass.genCommaList( refType->parameters.begin(), refType->parameters.end() );
    217217                        os << ") ";
    218218                        return os.str();
  • src/Common/Indenter.h

    rb96ec83 r6840e7c  
    1818
    1919struct Indenter {
    20         Indenter( unsigned int amt = 2 ) : amt( amt ) {}
    21         unsigned int amt = 2;  // amount 1 level increases indent by (i.e. how much to increase by in operator++)
    22         unsigned int indent = 0;
     20        static unsigned tabsize;
     21
     22        Indenter( unsigned int amt = tabsize, unsigned int indent = 0 ) : amt( amt ), indent( indent ) {}
     23        unsigned int amt;  // amount 1 level increases indent by (i.e. how much to increase by in operator++)
     24        unsigned int indent;
    2325
    2426        Indenter & operator+=(int nlevels) { indent += amt*nlevels; return *this; }
     
    3032};
    3133
    32 inline std::ostream & operator<<( std::ostream & out, Indenter & indent ) {
     34inline std::ostream & operator<<( std::ostream & out, const Indenter & indent ) {
    3335        return out << std::string(indent.indent, ' ');
    3436}
  • src/Common/PassVisitor.h

    rb96ec83 r6840e7c  
    44
    55#include <stack>
     6
     7#include "Common/utility.h"
    68
    79#include "SynTree/Mutator.h"
     
    236238        virtual Attribute * mutate( Attribute * attribute ) override final;
    237239
     240        virtual TypeSubstitution * mutate( TypeSubstitution * sub ) final;
     241
    238242private:
    239243        template<typename pass_t> friend void acceptAll( std::list< Declaration* > &decls, PassVisitor< pass_t >& visitor );
    240244        template<typename pass_t> friend void mutateAll( std::list< Declaration* > &decls, PassVisitor< pass_t >& visitor );
     245        template< typename TreeType, typename pass_t > friend void maybeAccept_impl( TreeType * tree, PassVisitor< pass_t > & visitor );
     246        template< typename TreeType, typename pass_t > friend void maybeMutate_impl( TreeType *& tree, PassVisitor< pass_t > & mutator );
     247        template< typename Container, typename pass_t > friend void maybeAccept_impl( Container & container, PassVisitor< pass_t > & visitor );
     248        template< typename Container, typename pass_t > friend void maybeMutate_impl( Container & container, PassVisitor< pass_t > & mutator );
    241249
    242250        template<typename node_type> void call_previsit ( node_type * node ) { previsit_impl ( pass, node, 0 ); }
     
    273281        std::list< Declaration* > *     get_afterDecls () { return declsToAddAfter_impl ( pass, 0); }
    274282
    275         void set_visit_children( bool& ref ) { bool_ref * ptr = visit_children_impl(pass, 0); if(ptr) ptr->set( ref ); }
     283        bool       get_visit_children    () { bool_ref * ptr = visit_children_impl(pass, 0); return ptr ? *ptr : true; }
     284        bool_ref * get_visit_children_ptr() { return visit_children_impl(pass, 0); }
    276285
    277286        void indexerScopeEnter  ()                             { indexer_impl_enterScope  ( pass, 0       ); }
  • src/Common/PassVisitor.impl.h

    rb96ec83 r6840e7c  
    22// IWYU pragma: private, include "PassVisitor.h"
    33
    4 #define VISIT_START( node )                     \
    5         __attribute__((unused))                   \
     4#define VISIT_START( node )                                     \
     5        __attribute__((unused))                                   \
     6        ChildrenGuard children_guard( get_visit_children_ptr() ); \
     7        __attribute__((unused))                                   \
    68        guard_value_impl guard( at_cleanup_impl(pass, 0) );       \
    7         bool visit_children = true;               \
    8         set_visit_children( visit_children );   \
    9         call_previsit( node );                    \
    10         if( visit_children ) {                    \
     9        call_previsit( node );                                    \
    1110
    1211#define VISIT_END( node )                       \
    13         }                                         \
    1412        call_postvisit( node );                   \
    1513
    16 #define MUTATE_START( node )                    \
    17         __attribute__((unused))                   \
     14#define MUTATE_START( node )                                    \
     15        __attribute__((unused))                                   \
     16        ChildrenGuard children_guard( get_visit_children_ptr() ); \
     17        __attribute__((unused))                                   \
    1818        guard_value_impl guard( at_cleanup_impl(pass, 0) );       \
    19         bool visit_children = true;               \
    20         set_visit_children( visit_children );   \
    21         call_premutate( node );                   \
    22         if( visit_children ) {                    \
     19        call_premutate( node );                                   \
    2320
    2421#define MUTATE_END( type, node )                \
    25         }                                         \
    2622        return call_postmutate< type * >( node ); \
    2723
    2824
    29 #define VISIT_BODY( node )        \
    30         VISIT_START( node );        \
    31         Visitor::visit( node );     \
    32         VISIT_END( node );          \
    33 
    34 
    35 #define MUTATE_BODY( type, node ) \
    36         MUTATE_START( node );       \
    37         Mutator::mutate( node );    \
    38         MUTATE_END( type, node );   \
     25#define VISIT_BODY( node )          \
     26        VISIT_START( node );          \
     27        if( children_guard ) {        \
     28                Visitor::visit( node ); \
     29        }                             \
     30        VISIT_END( node );            \
     31
     32
     33#define MUTATE_BODY( type, node )    \
     34        MUTATE_START( node );          \
     35        if( children_guard ) {         \
     36                Mutator::mutate( node ); \
     37        }                              \
     38        MUTATE_END( type, node );      \
    3939
    4040
     
    6363template< typename pass_type >
    6464static inline void acceptAll( std::list< Declaration* > &decls, PassVisitor< pass_type >& visitor ) {
    65 
    6665        DeclList_t* beforeDecls = visitor.get_beforeDecls();
    6766        DeclList_t* afterDecls  = visitor.get_afterDecls();
     
    7675                try {
    7776                        // run visitor on declaration
    78                         maybeAccept( *i, visitor );
     77                        maybeAccept_impl( *i, visitor );
    7978                } catch( SemanticError &e ) {
    8079                        e.set_location( (*i)->location );
     
    9291template< typename pass_type >
    9392static inline void mutateAll( std::list< Declaration* > &decls, PassVisitor< pass_type >& mutator ) {
    94 
    9593        DeclList_t* beforeDecls = mutator.get_beforeDecls();
    9694        DeclList_t* afterDecls  = mutator.get_afterDecls();
     
    104102                try {
    105103                        // run mutator on declaration
    106                         *i = maybeMutate( *i, mutator );
     104                        maybeMutate_impl( *i, mutator );
    107105                } catch( SemanticError &e ) {
    108106                        e.set_location( (*i)->location );
     
    118116}
    119117
    120 template< typename Container, typename VisitorType >
    121 inline void maybeAccept( Container &container, VisitorType &visitor ) {
     118template< typename TreeType, typename pass_type >
     119inline void maybeAccept_impl( TreeType * tree, PassVisitor< pass_type > & visitor ) {
     120        if ( ! visitor.get_visit_children() ) return;
     121        if ( tree ) {
     122                tree->accept( visitor );
     123        }
     124}
     125
     126template< typename Container, typename pass_type >
     127inline void maybeAccept_impl( Container & container, PassVisitor< pass_type > & visitor ) {
     128        if ( ! visitor.get_visit_children() ) return;
    122129        SemanticError errors;
    123130        for ( typename Container::iterator i = container.begin(); i != container.end(); ++i ) {
     
    136143}
    137144
    138 template< typename Container, typename MutatorType >
    139 inline void maybeMutateRef( Container &container, MutatorType &mutator ) {
     145template< typename TreeType, typename pass_type >
     146inline void maybeMutate_impl( TreeType *& tree, PassVisitor< pass_type > & mutator ) {
     147        if ( ! mutator.get_visit_children() ) return;
     148
     149        if ( tree ) {
     150                tree = strict_dynamic_cast< TreeType * >( tree->acceptMutator( mutator ) );
     151        }
     152}
     153
     154template< typename Container, typename pass_type >
     155inline void maybeMutate_impl( Container & container, PassVisitor< pass_type > & mutator ) {
     156        if ( ! mutator.get_visit_children() ) return;
    140157        SemanticError errors;
    141158        for ( typename Container::iterator i = container.begin(); i != container.end(); ++i ) {
    142159                try {
    143160                        if ( *i ) {
    144 ///                 *i = (*i)->acceptMutator( mutator );
    145161                                *i = dynamic_cast< typename Container::value_type >( (*i)->acceptMutator( mutator ) );
    146162                                assert( *i );
     
    159175template< typename func_t >
    160176void PassVisitor< pass_type >::handleStatementList( std::list< Statement * > & statements, func_t func ) {
     177        if ( ! get_visit_children() ) return;
    161178        SemanticError errors;
    162179
     
    199216void PassVisitor< pass_type >::visitStatementList( std::list< Statement * > & statements ) {
    200217        handleStatementList( statements, [this]( Statement * stmt) {
    201                 stmt->accept( *this );
     218                maybeAccept_impl( stmt, *this );
    202219        });
    203220}
     
    206223void PassVisitor< pass_type >::mutateStatementList( std::list< Statement * > & statements ) {
    207224        handleStatementList( statements, [this]( Statement *& stmt) {
    208                 stmt = stmt->acceptMutator( *this );
     225                maybeMutate_impl( stmt, *this );
    209226        });
    210227}
     
    214231template< typename func_t >
    215232Statement * PassVisitor< pass_type >::handleStatement( Statement * stmt, func_t func ) {
     233        if ( ! get_visit_children() ) return stmt;
     234
    216235        // don't want statements from outer CompoundStmts to be added to this CompoundStmt
    217236        ValueGuardPtr< TypeSubstitution * >  oldEnv        ( get_env_ptr    () );
     
    244263Statement * PassVisitor< pass_type >::visitStatement( Statement * stmt ) {
    245264        return handleStatement( stmt, [this]( Statement * stmt ) {
    246                 maybeAccept( stmt, *this );
     265                maybeAccept_impl( stmt, *this );
    247266                return stmt;
    248267        });
     
    252271Statement * PassVisitor< pass_type >::mutateStatement( Statement * stmt ) {
    253272        return handleStatement( stmt, [this]( Statement * stmt ) {
    254                 return maybeMutate( stmt, *this );
     273                maybeMutate_impl( stmt, *this );
     274                return stmt;
    255275        });
    256276}
     
    259279template< typename func_t >
    260280Expression * PassVisitor< pass_type >::handleExpression( Expression * expr, func_t func ) {
     281        if ( ! get_visit_children() ) return expr;
    261282        if( !expr ) return nullptr;
    262283
     
    266287        }
    267288
    268         // should env be cloned (or moved) onto the result of the mutate?
     289        // should env be moved onto the result of the mutate?
    269290        return func( expr );
    270291}
     
    273294Expression * PassVisitor< pass_type >::visitExpression( Expression * expr ) {
    274295        return handleExpression(expr, [this]( Expression * expr ) {
    275                 expr->accept( *this );
     296                maybeAccept_impl( expr, *this );
    276297                return expr;
    277298        });
     
    281302Expression * PassVisitor< pass_type >::mutateExpression( Expression * expr ) {
    282303        return handleExpression(expr, [this]( Expression * expr ) {
    283                 return expr->acceptMutator( *this );
     304                maybeMutate_impl( expr, *this );
     305                return expr;
    284306        });
     307}
     308
     309template< typename TreeType, typename VisitorType >
     310inline void indexerScopedAccept( TreeType * tree, VisitorType & visitor ) {
     311        if ( ! visitor.get_visit_children() ) return;
     312        auto guard = makeFuncGuard(
     313                [&visitor]() { visitor.indexerScopeEnter(); },
     314                [&visitor]() { visitor.indexerScopeLeave(); }
     315        );
     316        maybeAccept_impl( tree, visitor );
     317}
     318
     319template< typename TreeType, typename MutatorType >
     320inline void indexerScopedMutate( TreeType *& tree, MutatorType & mutator ) {
     321        if ( ! mutator.get_visit_children() ) return;
     322        auto guard = makeFuncGuard(
     323                [&mutator]() { mutator.indexerScopeEnter(); },
     324                [&mutator]() { mutator.indexerScopeLeave(); }
     325        );
     326        maybeMutate_impl( tree, mutator );
    285327}
    286328
     
    319361
    320362        indexerScopedAccept( node->type         , *this );
    321         maybeAccept        ( node->init         , *this );
    322         maybeAccept        ( node->bitfieldWidth, *this );
    323         maybeAccept        ( node->attributes   , *this );
     363        maybeAccept_impl   ( node->init         , *this );
     364        maybeAccept_impl   ( node->bitfieldWidth, *this );
     365        maybeAccept_impl   ( node->attributes   , *this );
    324366
    325367        if ( node->name != "" ) {
     
    335377
    336378        indexerScopedMutate( node->type         , *this );
    337         maybeMutateRef     ( node->init         , *this );
    338         maybeMutateRef     ( node->bitfieldWidth, *this );
    339         maybeMutateRef     ( node->attributes   , *this );
     379        maybeMutate_impl   ( node->init         , *this );
     380        maybeMutate_impl   ( node->bitfieldWidth, *this );
     381        maybeMutate_impl   ( node->attributes   , *this );
    340382
    341383        if ( node->name != "" ) {
     
    358400        {
    359401                auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
    360                 maybeAccept( node->type, *this );
    361                 maybeAccept( node->statements, *this );
    362                 maybeAccept( node->attributes, *this );
     402                maybeAccept_impl( node->type, *this );
     403                maybeAccept_impl( node->statements, *this );
     404                maybeAccept_impl( node->attributes, *this );
    363405        }
    364406
     
    376418        {
    377419                auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
    378                 maybeMutateRef( node->type, *this );
    379                 maybeMutateRef( node->statements, *this );
    380                 maybeMutateRef( node->attributes, *this );
     420                maybeMutate_impl( node->type, *this );
     421                maybeMutate_impl( node->statements, *this );
     422                maybeMutate_impl( node->attributes, *this );
    381423        }
    382424
     
    396438        {
    397439                auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
    398                 maybeAccept( node->parameters, *this );
    399                 maybeAccept( node->members   , *this );
     440                maybeAccept_impl( node->parameters, *this );
     441                maybeAccept_impl( node->members   , *this );
    400442        }
    401443
     
    416458        {
    417459                auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
    418                 maybeMutateRef( node->parameters, *this );
    419                 maybeMutateRef( node->members   , *this );
     460                maybeMutate_impl( node->parameters, *this );
     461                maybeMutate_impl( node->members   , *this );
    420462        }
    421463
     
    437479        {
    438480                auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
    439                 maybeAccept( node->parameters, *this );
    440                 maybeAccept( node->members   , *this );
     481                maybeAccept_impl( node->parameters, *this );
     482                maybeAccept_impl( node->members   , *this );
    441483        }
    442484
     
    455497        {
    456498                auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
    457                 maybeMutateRef( node->parameters, *this );
    458                 maybeMutateRef( node->members   , *this );
     499                maybeMutate_impl( node->parameters, *this );
     500                maybeMutate_impl( node->members   , *this );
    459501        }
    460502
     
    473515
    474516        // unlike structs, traits, and unions, enums inject their members into the global scope
    475         maybeAccept( node->parameters, *this );
    476         maybeAccept( node->members   , *this );
     517        maybeAccept_impl( node->parameters, *this );
     518        maybeAccept_impl( node->members   , *this );
    477519
    478520        VISIT_END( node );
     
    486528
    487529        // unlike structs, traits, and unions, enums inject their members into the global scope
    488         maybeMutateRef( node->parameters, *this );
    489         maybeMutateRef( node->members   , *this );
     530        maybeMutate_impl( node->parameters, *this );
     531        maybeMutate_impl( node->members   , *this );
    490532
    491533        MUTATE_END( Declaration, node );
     
    500542        {
    501543                auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
    502                 maybeAccept( node->parameters, *this );
    503                 maybeAccept( node->members   , *this );
     544                maybeAccept_impl( node->parameters, *this );
     545                maybeAccept_impl( node->members   , *this );
    504546        }
    505547
     
    515557        {
    516558                auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
    517                 maybeMutateRef( node->parameters, *this );
    518                 maybeMutateRef( node->members   , *this );
     559                maybeMutate_impl( node->parameters, *this );
     560                maybeMutate_impl( node->members   , *this );
    519561        }
    520562
     
    532574        {
    533575                auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
    534                 maybeAccept( node->parameters, *this );
    535                 maybeAccept( node->base      , *this );
     576                maybeAccept_impl( node->parameters, *this );
     577                maybeAccept_impl( node->base      , *this );
    536578        }
    537579
     
    541583        indexerAddType( node );
    542584
    543         maybeAccept( node->assertions, *this );
     585        maybeAccept_impl( node->assertions, *this );
    544586
    545587        indexerScopedAccept( node->init, *this );
     
    554596        {
    555597                auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
    556                 maybeMutateRef( node->parameters, *this );
    557                 maybeMutateRef( node->base      , *this );
     598                maybeMutate_impl( node->parameters, *this );
     599                maybeMutate_impl( node->base      , *this );
    558600        }
    559601
     
    563605        indexerAddType( node );
    564606
    565         maybeMutateRef( node->assertions, *this );
     607        maybeMutate_impl( node->assertions, *this );
    566608
    567609        indexerScopedMutate( node->init, *this );
     
    578620        {
    579621                auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
    580                 maybeAccept( node->parameters, *this );
    581                 maybeAccept( node->base      , *this );
     622                maybeAccept_impl( node->parameters, *this );
     623                maybeAccept_impl( node->base      , *this );
    582624        }
    583625
    584626        indexerAddType( node );
    585627
    586         maybeAccept( node->assertions, *this );
     628        maybeAccept_impl( node->assertions, *this );
    587629
    588630        VISIT_END( node );
     
    595637        {
    596638                auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
    597                 maybeMutateRef     ( node->parameters, *this );
    598                 maybeMutateRef( node->base      , *this );
     639                maybeMutate_impl( node->parameters, *this );
     640                maybeMutate_impl( node->base      , *this );
    599641        }
    600642
    601643        indexerAddType( node );
    602644
    603         maybeMutateRef( node->assertions, *this );
     645        maybeMutate_impl( node->assertions, *this );
    604646
    605647        MUTATE_END( Declaration, node );
     
    612654        VISIT_START( node );
    613655
    614         maybeAccept( node->stmt, *this );
     656        maybeAccept_impl( node->stmt, *this );
    615657
    616658        VISIT_END( node );
     
    621663        MUTATE_START( node );
    622664
    623         maybeMutateRef( node->stmt, *this );
     665        maybeMutate_impl( node->stmt, *this );
    624666
    625667        MUTATE_END( AsmDecl, node );
     
    690732                // if statements introduce a level of scope (for the initialization)
    691733                auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
    692                 acceptAll( node->get_initialization(), *this );
    693                 visitExpression( node->condition );
     734                maybeAccept_impl( node->get_initialization(), *this );
     735                visitExpression ( node->condition );
    694736                node->thenPart = visitStatement( node->thenPart );
    695737                node->elsePart = visitStatement( node->elsePart );
     
    704746                // if statements introduce a level of scope (for the initialization)
    705747                auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
    706                 maybeMutateRef( node->get_initialization(), *this );
     748                maybeMutate_impl( node->get_initialization(), *this );
    707749                node->condition = mutateExpression( node->condition );
    708750                node->thenPart  = mutateStatement ( node->thenPart  );
     
    742784                // for statements introduce a level of scope (for the initialization)
    743785                auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
    744                 maybeAccept( node->initialization, *this );
     786                maybeAccept_impl( node->initialization, *this );
    745787                visitExpression( node->condition );
    746788                visitExpression( node->increment );
     
    756798                // for statements introduce a level of scope (for the initialization)
    757799                auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
    758                 maybeMutateRef( node->initialization, *this );
     800                maybeMutate_impl( node->initialization, *this );
    759801                node->condition = mutateExpression( node->condition );
    760802                node->increment = mutateExpression( node->increment );
     
    859901        VISIT_START( node );
    860902
    861         maybeAccept( node->block       , *this );
    862         maybeAccept( node->handlers    , *this );
    863         maybeAccept( node->finallyBlock, *this );
     903        maybeAccept_impl( node->block       , *this );
     904        maybeAccept_impl( node->handlers    , *this );
     905        maybeAccept_impl( node->finallyBlock, *this );
    864906
    865907        VISIT_END( node );
     
    870912        MUTATE_START( node );
    871913
    872         maybeMutateRef( node->block       , *this );
    873         maybeMutateRef( node->handlers    , *this );
    874         maybeMutateRef( node->finallyBlock, *this );
     914        maybeMutate_impl( node->block       , *this );
     915        maybeMutate_impl( node->handlers    , *this );
     916        maybeMutate_impl( node->finallyBlock, *this );
    875917
    876918        MUTATE_END( Statement, node );
     
    885927                // catch statements introduce a level of scope (for the caught exception)
    886928                auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
    887                 maybeAccept( node->decl, *this );
     929                maybeAccept_impl( node->decl, *this );
    888930                node->cond = visitExpression( node->cond );
    889931                node->body = visitStatement ( node->body );
     
    898940                // catch statements introduce a level of scope (for the caught exception)
    899941                auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
    900                 maybeMutateRef( node->decl, *this );
     942                maybeMutate_impl( node->decl, *this );
    901943                node->cond = mutateExpression( node->cond );
    902944                node->body = mutateStatement ( node->body );
     
    9721014
    9731015        indexerScopedAccept( node->result  , *this );
    974         maybeAccept        ( node->function, *this );
    975         maybeAccept        ( node->args    , *this );
     1016        maybeAccept_impl        ( node->function, *this );
     1017        maybeAccept_impl        ( node->args    , *this );
    9761018
    9771019        VISIT_END( node );
     
    9841026        indexerScopedMutate( node->env     , *this );
    9851027        indexerScopedMutate( node->result  , *this );
    986         maybeMutateRef     ( node->function, *this );
    987         maybeMutateRef     ( node->args    , *this );
     1028        maybeMutate_impl   ( node->function, *this );
     1029        maybeMutate_impl   ( node->args    , *this );
    9881030
    9891031        MUTATE_END( Expression, node );
     
    9961038        VISIT_START( node );
    9971039
    998         // maybeAccept( node->get_env(), *this );
     1040        // maybeAccept_impl( node->get_env(), *this );
    9991041        indexerScopedAccept( node->result, *this );
    10001042
     
    10481090
    10491091        indexerScopedAccept( node->result, *this );
    1050         maybeAccept        ( node->arg   , *this );
     1092        maybeAccept_impl        ( node->arg   , *this );
    10511093
    10521094        VISIT_END( node );
     
    10591101        indexerScopedMutate( node->env   , *this );
    10601102        indexerScopedMutate( node->result, *this );
    1061         maybeMutateRef     ( node->arg   , *this );
     1103        maybeMutate_impl   ( node->arg   , *this );
    10621104
    10631105        MUTATE_END( Expression, node );
     
    10711113
    10721114        indexerScopedAccept( node->result, *this );
    1073         maybeAccept( node->arg, *this );
     1115        maybeAccept_impl( node->arg, *this );
    10741116
    10751117        VISIT_END( node );
     
    10821124        indexerScopedMutate( node->env   , *this );
    10831125        indexerScopedMutate( node->result, *this );
    1084         maybeMutateRef     ( node->arg   , *this );
     1126        maybeMutate_impl   ( node->arg   , *this );
    10851127
    10861128        MUTATE_END( Expression, node );
     
    10941136
    10951137        indexerScopedAccept( node->result, *this );
    1096         maybeAccept        ( node->arg   , *this );
     1138        maybeAccept_impl   ( node->arg   , *this );
    10971139
    10981140        VISIT_END( node );
     
    11051147        indexerScopedMutate( node->env   , *this );
    11061148        indexerScopedMutate( node->result, *this );
    1107         maybeMutateRef     ( node->arg   , *this );
     1149        maybeMutate_impl   ( node->arg   , *this );
    11081150
    11091151        MUTATE_END( Expression, node );
     
    11381180
    11391181        indexerScopedAccept( node->result   , *this );
    1140         maybeAccept        ( node->aggregate, *this );
    1141         maybeAccept        ( node->member   , *this );
     1182        maybeAccept_impl   ( node->aggregate, *this );
     1183        maybeAccept_impl   ( node->member   , *this );
    11421184
    11431185        VISIT_END( node );
     
    11501192        indexerScopedMutate( node->env      , *this );
    11511193        indexerScopedMutate( node->result   , *this );
    1152         maybeMutateRef     ( node->aggregate, *this );
    1153         maybeMutateRef     ( node->member   , *this );
     1194        maybeMutate_impl   ( node->aggregate, *this );
     1195        maybeMutate_impl   ( node->member   , *this );
    11541196
    11551197        MUTATE_END( Expression, node );
     
    11631205
    11641206        indexerScopedAccept( node->result   , *this );
    1165         maybeAccept        ( node->aggregate, *this );
     1207        maybeAccept_impl   ( node->aggregate, *this );
    11661208
    11671209        VISIT_END( node );
     
    11741216        indexerScopedMutate( node->env      , *this );
    11751217        indexerScopedMutate( node->result   , *this );
    1176         maybeMutateRef     ( node->aggregate, *this );
     1218        maybeMutate_impl   ( node->aggregate, *this );
    11771219
    11781220        MUTATE_END( Expression, node );
     
    12071249
    12081250        indexerScopedAccept( node->result   , *this );
    1209         maybeAccept        ( &node->constant, *this );
     1251        maybeAccept_impl   ( &node->constant, *this );
    12101252
    12111253        VISIT_END( node );
     
    12181260        indexerScopedMutate( node->env   , *this );
    12191261        indexerScopedMutate( node->result, *this );
    1220         node->constant = *maybeMutate( &node->constant, *this );
     1262        Constant * ptr = &node->constant;
     1263        maybeMutate_impl( ptr, *this );
     1264        node->constant = *ptr;
    12211265
    12221266        MUTATE_END( Expression, node );
     
    12311275        indexerScopedAccept( node->result, *this );
    12321276        if ( node->get_isType() ) {
    1233                 maybeAccept( node->type, *this );
     1277                maybeAccept_impl( node->type, *this );
    12341278        } else {
    1235                 maybeAccept( node->expr, *this );
     1279                maybeAccept_impl( node->expr, *this );
    12361280        }
    12371281
     
    12461290        indexerScopedMutate( node->result, *this );
    12471291        if ( node->get_isType() ) {
    1248                 maybeMutateRef( node->type, *this );
     1292                maybeMutate_impl( node->type, *this );
    12491293        } else {
    1250                 maybeMutateRef( node->expr, *this );
     1294                maybeMutate_impl( node->expr, *this );
    12511295        }
    12521296
     
    12621306        indexerScopedAccept( node->result, *this );
    12631307        if ( node->get_isType() ) {
    1264                 maybeAccept( node->type, *this );
     1308                maybeAccept_impl( node->type, *this );
    12651309        } else {
    1266                 maybeAccept( node->expr, *this );
     1310                maybeAccept_impl( node->expr, *this );
    12671311        }
    12681312
     
    12771321        indexerScopedMutate( node->result, *this );
    12781322        if ( node->get_isType() ) {
    1279                 maybeMutateRef( node->type, *this );
     1323                maybeMutate_impl( node->type, *this );
    12801324        } else {
    1281                 maybeMutateRef( node->expr, *this );
     1325                maybeMutate_impl( node->expr, *this );
    12821326        }
    12831327
     
    12921336
    12931337        indexerScopedAccept( node->result, *this );
    1294         maybeAccept        ( node->type  , *this );
     1338        maybeAccept_impl   ( node->type  , *this );
    12951339
    12961340        VISIT_END( node );
     
    13031347        indexerScopedMutate( node->env   , *this );
    13041348        indexerScopedMutate( node->result, *this );
    1305         maybeMutateRef     ( node->type  , *this );
     1349        maybeMutate_impl   ( node->type  , *this );
    13061350
    13071351        MUTATE_END( Expression, node );
     
    13151359
    13161360        indexerScopedAccept( node->result, *this );
    1317         maybeAccept        ( node->type  , *this );
    1318         maybeAccept        ( node->member, *this );
     1361        maybeAccept_impl   ( node->type  , *this );
     1362        maybeAccept_impl   ( node->member, *this );
    13191363
    13201364        VISIT_END( node );
     
    13271371        indexerScopedMutate( node->env   , *this );
    13281372        indexerScopedMutate( node->result, *this );
    1329         maybeMutateRef     ( node->type  , *this );
    1330         maybeMutateRef     ( node->member, *this );
     1373        maybeMutate_impl   ( node->type  , *this );
     1374        maybeMutate_impl   ( node->member, *this );
    13311375
    13321376        MUTATE_END( Expression, node );
     
    13401384
    13411385        indexerScopedAccept( node->result, *this );
    1342         maybeAccept        ( node->type  , *this );
     1386        maybeAccept_impl   ( node->type  , *this );
    13431387
    13441388        VISIT_END( node );
     
    13511395        indexerScopedMutate( node->env   , *this );
    13521396        indexerScopedMutate( node->result, *this );
    1353         maybeMutateRef     ( node->type  , *this );
     1397        maybeMutate_impl   ( node->type  , *this );
    13541398
    13551399        MUTATE_END( Expression, node );
     
    13641408        indexerScopedAccept( node->result, *this );
    13651409        if ( node->get_isType() ) {
    1366                 maybeAccept( node->type, *this );
     1410                maybeAccept_impl( node->type, *this );
    13671411        } else {
    1368                 maybeAccept( node->expr, *this );
     1412                maybeAccept_impl( node->expr, *this );
    13691413        }
    13701414
     
    13791423        indexerScopedMutate( node->result, *this );
    13801424        if ( node->get_isType() ) {
    1381                 maybeMutateRef( node->type, *this );
     1425                maybeMutate_impl( node->type, *this );
    13821426        } else {
    1383                 maybeMutateRef( node->expr, *this );
     1427                maybeMutate_impl( node->expr, *this );
    13841428        }
    13851429
     
    13941438
    13951439        indexerScopedAccept( node->result, *this );
    1396         maybeAccept        ( node->arg1  , *this );
    1397         maybeAccept        ( node->arg2  , *this );
     1440        maybeAccept_impl   ( node->arg1  , *this );
     1441        maybeAccept_impl   ( node->arg2  , *this );
    13981442
    13991443        VISIT_END( node );
     
    14061450        indexerScopedMutate( node->env   , *this );
    14071451        indexerScopedMutate( node->result, *this );
    1408         maybeMutateRef     ( node->arg1  , *this );
    1409         maybeMutateRef     ( node->arg2  , *this );
     1452        maybeMutate_impl   ( node->arg1  , *this );
     1453        maybeMutate_impl   ( node->arg2  , *this );
    14101454
    14111455        MUTATE_END( Expression, node );
     
    14191463
    14201464        indexerScopedAccept( node->result, *this );
    1421         maybeAccept        ( node->arg1  , *this );
    1422         maybeAccept        ( node->arg2  , *this );
    1423         maybeAccept        ( node->arg3  , *this );
     1465        maybeAccept_impl        ( node->arg1  , *this );
     1466        maybeAccept_impl        ( node->arg2  , *this );
     1467        maybeAccept_impl        ( node->arg3  , *this );
    14241468
    14251469        VISIT_END( node );
     
    14321476        indexerScopedMutate( node->env   , *this );
    14331477        indexerScopedMutate( node->result, *this );
    1434         maybeMutateRef     ( node->arg1  , *this );
    1435         maybeMutateRef     ( node->arg2  , *this );
    1436         maybeMutateRef     ( node->arg3  , *this );
     1478        maybeMutate_impl   ( node->arg1  , *this );
     1479        maybeMutate_impl   ( node->arg2  , *this );
     1480        maybeMutate_impl   ( node->arg3  , *this );
    14371481
    14381482        MUTATE_END( Expression, node );
     
    14461490
    14471491        indexerScopedAccept( node->result, *this );
    1448         maybeAccept        ( node->arg1  , *this );
    1449         maybeAccept        ( node->arg2  , *this );
     1492        maybeAccept_impl   ( node->arg1  , *this );
     1493        maybeAccept_impl   ( node->arg2  , *this );
    14501494
    14511495        VISIT_END( node );
     
    14581502        indexerScopedMutate( node->env   , *this );
    14591503        indexerScopedMutate( node->result, *this );
    1460         maybeMutateRef     ( node->arg1  , *this );
    1461         maybeMutateRef     ( node->arg2  , *this );
     1504        maybeMutate_impl   ( node->arg1  , *this );
     1505        maybeMutate_impl   ( node->arg2  , *this );
    14621506
    14631507        MUTATE_END( Expression, node );
     
    14711515
    14721516        indexerScopedAccept( node->result, *this );
    1473         maybeAccept        ( node->type, *this );
     1517        maybeAccept_impl   ( node->type, *this );
    14741518
    14751519        VISIT_END( node );
     
    14821526        indexerScopedMutate( node->env   , *this );
    14831527        indexerScopedMutate( node->result, *this );
    1484         maybeMutateRef     ( node->type  , *this );
     1528        maybeMutate_impl   ( node->type  , *this );
    14851529
    14861530        MUTATE_END( Expression, node );
     
    14941538
    14951539        indexerScopedAccept( node->result    , *this );
    1496         maybeAccept        ( node->inout     , *this );
    1497         maybeAccept        ( node->constraint, *this );
    1498         maybeAccept        ( node->operand   , *this );
     1540        maybeAccept_impl   ( node->inout     , *this );
     1541        maybeAccept_impl   ( node->constraint, *this );
     1542        maybeAccept_impl   ( node->operand   , *this );
    14991543
    15001544        VISIT_END( node );
     
    15071551        indexerScopedMutate( node->env       , *this );
    15081552        indexerScopedMutate( node->result    , *this );
    1509         maybeMutateRef     ( node->inout     , *this );
    1510         maybeMutateRef     ( node->constraint, *this );
    1511         maybeMutateRef     ( node->operand   , *this );
     1553        maybeMutate_impl   ( node->inout     , *this );
     1554        maybeMutate_impl   ( node->constraint, *this );
     1555        maybeMutate_impl   ( node->operand   , *this );
    15121556
    15131557        MUTATE_END( Expression, node );
     
    15211565
    15221566        indexerScopedAccept( node->result     , *this );
    1523         maybeAccept        ( node->callExpr   , *this );
    1524         maybeAccept        ( node->tempDecls  , *this );
    1525         maybeAccept        ( node->returnDecls, *this );
    1526         maybeAccept        ( node->dtors      , *this );
     1567        maybeAccept_impl   ( node->callExpr   , *this );
     1568        maybeAccept_impl   ( node->tempDecls  , *this );
     1569        maybeAccept_impl   ( node->returnDecls, *this );
     1570        maybeAccept_impl   ( node->dtors      , *this );
    15271571
    15281572        VISIT_END( node );
     
    15351579        indexerScopedMutate( node->env        , *this );
    15361580        indexerScopedMutate( node->result     , *this );
    1537         maybeMutateRef     ( node->callExpr   , *this );
    1538         maybeMutateRef     ( node->tempDecls  , *this );
    1539         maybeMutateRef     ( node->returnDecls, *this );
    1540         maybeMutateRef     ( node->dtors      , *this );
     1581        maybeMutate_impl   ( node->callExpr   , *this );
     1582        maybeMutate_impl   ( node->tempDecls  , *this );
     1583        maybeMutate_impl   ( node->returnDecls, *this );
     1584        maybeMutate_impl   ( node->dtors      , *this );
    15411585
    15421586        MUTATE_END( Expression, node );
     
    15501594
    15511595        indexerScopedAccept( node->result  , *this );
    1552         maybeAccept        ( node->callExpr, *this );
     1596        maybeAccept_impl   ( node->callExpr, *this );
    15531597
    15541598        VISIT_END( node );
     
    15611605        indexerScopedMutate( node->env     , *this );
    15621606        indexerScopedMutate( node->result  , *this );
    1563         maybeMutateRef     ( node->callExpr, *this );
     1607        maybeMutate_impl   ( node->callExpr, *this );
    15641608
    15651609        MUTATE_END( Expression, node );
     
    15731617
    15741618        indexerScopedAccept( node->result     , *this );
    1575         maybeAccept        ( node->initializer, *this );
     1619        maybeAccept_impl   ( node->initializer, *this );
    15761620
    15771621        VISIT_END( node );
     
    15841628        indexerScopedMutate( node->env        , *this );
    15851629        indexerScopedMutate( node->result     , *this );
    1586         maybeMutateRef     ( node->initializer, *this );
     1630        maybeMutate_impl     ( node->initializer, *this );
    15871631
    15881632        MUTATE_END( Expression, node );
     
    15961640
    15971641        indexerScopedAccept( node->result, *this );
    1598         maybeAccept        ( node->low   , *this );
    1599         maybeAccept        ( node->high  , *this );
     1642        maybeAccept_impl   ( node->low   , *this );
     1643        maybeAccept_impl   ( node->high  , *this );
    16001644
    16011645        VISIT_END( node );
     
    16081652        indexerScopedMutate( node->env   , *this );
    16091653        indexerScopedMutate( node->result, *this );
    1610         maybeMutateRef     ( node->low   , *this );
    1611         maybeMutateRef     ( node->high  , *this );
     1654        maybeMutate_impl   ( node->low   , *this );
     1655        maybeMutate_impl   ( node->high  , *this );
    16121656
    16131657        MUTATE_END( Expression, node );
     
    16211665
    16221666        indexerScopedAccept( node->result, *this );
    1623         maybeAccept        ( node->exprs , *this );
     1667        maybeAccept_impl   ( node->exprs , *this );
    16241668
    16251669        VISIT_END( node );
     
    16321676        indexerScopedMutate( node->env   , *this );
    16331677        indexerScopedMutate( node->result, *this );
    1634         maybeMutateRef     ( node->exprs , *this );
     1678        maybeMutate_impl   ( node->exprs , *this );
    16351679
    16361680        MUTATE_END( Expression, node );
     
    16441688
    16451689        indexerScopedAccept( node->result, *this );
    1646         maybeAccept          ( node->exprs , *this );
     1690        maybeAccept_impl   ( node->exprs , *this );
    16471691
    16481692        VISIT_END( node );
     
    16551699        indexerScopedMutate( node->env   , *this );
    16561700        indexerScopedMutate( node->result, *this );
    1657         maybeMutateRef     ( node->exprs , *this );
     1701        maybeMutate_impl   ( node->exprs , *this );
    16581702
    16591703        MUTATE_END( Expression, node );
     
    16671711
    16681712        indexerScopedAccept( node->result, *this );
    1669         maybeAccept        ( node->tuple , *this );
     1713        maybeAccept_impl   ( node->tuple , *this );
    16701714
    16711715        VISIT_END( node );
     
    16781722        indexerScopedMutate( node->env   , *this );
    16791723        indexerScopedMutate( node->result, *this );
    1680         maybeMutateRef     ( node->tuple , *this );
     1724        maybeMutate_impl   ( node->tuple , *this );
    16811725
    16821726        MUTATE_END( Expression, node );
     
    16901734
    16911735        indexerScopedAccept( node->result  , *this );
    1692         maybeAccept        ( node->stmtExpr, *this );
     1736        maybeAccept_impl   ( node->stmtExpr, *this );
    16931737
    16941738        VISIT_END( node );
     
    17011745        indexerScopedMutate( node->env     , *this );
    17021746        indexerScopedMutate( node->result  , *this );
    1703         maybeMutateRef     ( node->stmtExpr, *this );
     1747        maybeMutate_impl   ( node->stmtExpr, *this );
    17041748
    17051749        MUTATE_END( Expression, node );
     
    17181762
    17191763        indexerScopedAccept( node->result     , *this );
    1720         maybeAccept        ( node->statements , *this );
    1721         maybeAccept        ( node->returnDecls, *this );
    1722         maybeAccept        ( node->dtors      , *this );
     1764        maybeAccept_impl   ( node->statements , *this );
     1765        maybeAccept_impl   ( node->returnDecls, *this );
     1766        maybeAccept_impl   ( node->dtors      , *this );
    17231767
    17241768        VISIT_END( node );
     
    17351779
    17361780        indexerScopedMutate( node->result     , *this );
    1737         maybeMutateRef     ( node->statements , *this );
    1738         maybeMutateRef     ( node->returnDecls, *this );
    1739         maybeMutateRef     ( node->dtors      , *this );
     1781        maybeMutate_impl   ( node->statements , *this );
     1782        maybeMutate_impl   ( node->returnDecls, *this );
     1783        maybeMutate_impl   ( node->dtors      , *this );
    17401784
    17411785        MUTATE_END( Expression, node );
     
    17491793
    17501794        indexerScopedAccept( node->result, *this );
    1751         maybeAccept        ( node->expr  , *this );
     1795        maybeAccept_impl   ( node->expr  , *this );
    17521796
    17531797        VISIT_END( node );
     
    17601804        indexerScopedMutate( node->env   , *this );
    17611805        indexerScopedMutate( node->result, *this );
    1762         maybeMutateRef     ( node->expr  , *this );
     1806        maybeMutate_impl   ( node->expr  , *this );
    17631807
    17641808        MUTATE_END( Expression, node );
     
    18051849        {
    18061850                auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
    1807                 maybeAccept( node->forall    , *this );
    1808                 maybeAccept( node->parameters, *this );
     1851                maybeAccept_impl( node->forall    , *this );
     1852                maybeAccept_impl( node->parameters, *this );
    18091853        }
    18101854
     
    18201864        {
    18211865                auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
    1822                 maybeMutateRef( node->forall    , *this );
    1823                 maybeMutateRef( node->parameters, *this );
     1866                maybeMutate_impl( node->forall    , *this );
     1867                maybeMutate_impl( node->parameters, *this );
    18241868        }
    18251869
     
    18371881        {
    18381882                auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
    1839                 maybeAccept( node->forall    , *this );
    1840                 maybeAccept( node->parameters, *this );
     1883                maybeAccept_impl( node->forall    , *this );
     1884                maybeAccept_impl( node->parameters, *this );
    18411885        }
    18421886
     
    18521896        {
    18531897                auto guard = makeFuncGuard( [this]() { indexerScopeEnter(); }, [this]() { indexerScopeLeave(); } );
    1854                 maybeMutateRef( node->forall    , *this );
    1855                 maybeMutateRef( node->parameters, *this );
     1898                maybeMutate_impl( node->forall    , *this );
     1899                maybeMutate_impl( node->parameters, *this );
    18561900        }
    18571901
     
    18771921        VISIT_START( node );
    18781922
    1879         maybeAccept( node->forall    , *this );
    1880         maybeAccept( node->parameters, *this );
     1923        maybeAccept_impl( node->forall    , *this );
     1924        maybeAccept_impl( node->parameters, *this );
    18811925
    18821926        VISIT_END( node );
     
    18871931        MUTATE_START( node );
    18881932
    1889         maybeMutateRef( node->forall    , *this );
    1890         maybeMutateRef( node->parameters, *this );
     1933        maybeMutate_impl( node->forall    , *this );
     1934        maybeMutate_impl( node->parameters, *this );
    18911935
    18921936        MUTATE_END( Type, node );
     
    19341978        VISIT_START( node );
    19351979
    1936         maybeAccept( node->get_designators(), *this );
     1980        maybeAccept_impl( node->get_designators(), *this );
    19371981
    19381982        VISIT_END( node );
     
    19431987        MUTATE_START( node );
    19441988
    1945         maybeMutateRef( node->get_designators(), *this );
     1989        maybeMutate_impl( node->get_designators(), *this );
    19461990
    19471991        MUTATE_END( Designation, node );
     
    20832127        MUTATE_BODY( Attribute, node );
    20842128}
     2129
     2130template< typename pass_type >
     2131TypeSubstitution * PassVisitor< pass_type >::mutate( TypeSubstitution * node ) {
     2132        MUTATE_START( node );
     2133
     2134        for ( auto & p : node->typeEnv ) {
     2135                indexerScopedMutate( p.second, *this );
     2136        }
     2137        for ( auto & p : node->varEnv ) {
     2138                indexerScopedMutate( p.second, *this );
     2139        }
     2140
     2141        MUTATE_END( TypeSubstitution, node );
     2142}
  • src/Common/PassVisitor.proto.h

    rb96ec83 r6840e7c  
    4646        ~bool_ref() = default;
    4747
    48         operator bool() { return *m_ref; }
     48        operator bool() { return m_ref ? *m_ref : true; }
    4949        bool operator=( bool val ) { return *m_ref = val; }
    5050
    5151private:
    5252
    53         template<typename pass>
    54         friend class PassVisitor;
    55 
    56         void set( bool & val ) { m_ref = &val; };
    57 
    58         bool * m_ref;
     53        friend class ChildrenGuard;
     54
     55        bool * set( bool & val ) {
     56                bool * prev = m_ref;
     57                m_ref = &val;
     58                return prev;
     59        }
     60
     61        bool * m_ref = nullptr;
    5962};
    6063
    61 template< typename TreeType, typename VisitorType >
    62 inline void indexerScopedAccept( TreeType * tree, VisitorType & visitor ) {
    63         auto guard = makeFuncGuard(
    64                 [&visitor]() { visitor.indexerScopeEnter(); },
    65                 [&visitor]() { visitor.indexerScopeLeave(); }
    66         );
    67         maybeAccept( tree, visitor );
    68 }
    69 
    70 template< typename TreeType, typename MutatorType >
    71 inline void indexerScopedMutate( TreeType *& tree, MutatorType & mutator ) {
    72         auto guard = makeFuncGuard(
    73                 [&mutator]() { mutator.indexerScopeEnter(); },
    74                 [&mutator]() { mutator.indexerScopeLeave(); }
    75         );
    76         tree = maybeMutate( tree, mutator );
    77 }
    78 
    79 template< typename TreeType, typename MutatorType >
    80 inline void maybeMutateRef( TreeType *& tree, MutatorType & mutator ) {
    81         tree = maybeMutate( tree, mutator );
    82 }
     64class ChildrenGuard {
     65public:
     66
     67        ChildrenGuard( bool_ref * ref )
     68                : m_val ( true )
     69                , m_prev( ref ? ref->set( m_val ) : nullptr )
     70                , m_ref ( ref )
     71        {}
     72
     73        ~ChildrenGuard() {
     74                if( m_ref ) {
     75                        m_ref->set( *m_prev );
     76                }
     77        }
     78
     79        operator bool() { return m_val; }
     80
     81private:
     82        bool       m_val;
     83        bool     * m_prev;
     84        bool_ref * m_ref;
     85};
    8386
    8487//-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
  • src/Common/utility.h

    rb96ec83 r6840e7c  
    2828#include <cassert>
    2929
     30#include "Common/Indenter.h"
     31
    3032template< typename T >
    3133static inline T * maybeClone( const T *orig ) {
     
    7577
    7678template< typename Container >
    77 void printAll( const Container &container, std::ostream &os, int indent = 0 ) {
     79void printAll( const Container &container, std::ostream &os, Indenter indent = {} ) {
    7880        for ( typename Container::const_iterator i = container.begin(); i != container.end(); ++i ) {
    7981                if ( *i ) {
    80                         os << std::string( indent,  ' ' );
    81                         (*i)->print( os, indent + 2 );
     82                        os << indent;
     83                        (*i)->print( os, indent );
    8284                        // need an endl after each element because it's not easy to know when each individual item should end
    8385                        os << std::endl;
     
    351353template< typename T1, typename T2 >
    352354struct group_iterate_t {
     355private:
     356        std::tuple<T1, T2> args;
     357public:
    353358        group_iterate_t( bool skipBoundsCheck, const T1 & v1, const T2 & v2 ) : args(v1, v2) {
    354359                assertf(skipBoundsCheck || v1.size() == v2.size(), "group iteration requires containers of the same size: <%zd, %zd>.", v1.size(), v2.size());
    355360        };
    356361
     362        typedef std::tuple<decltype(*std::get<0>(args).begin()), decltype(*std::get<1>(args).begin())> value_type;
     363        typedef decltype(std::get<0>(args).begin()) T1Iter;
     364        typedef decltype(std::get<1>(args).begin()) T2Iter;
     365
    357366        struct iterator {
    358                 typedef typename std::remove_reference<T1>::type T1val;
    359                 typedef typename std::remove_reference<T2>::type T2val;
    360                 typedef std::tuple<typename T1val::value_type &, typename T2val::value_type &> value_type;
    361                 typedef typename T1val::iterator T1Iter;
    362                 typedef typename T2val::iterator T2Iter;
    363367                typedef std::tuple<T1Iter, T2Iter> IterTuple;
    364368                IterTuple it;
     
    370374                value_type operator*() const { return std::tie( *std::get<0>(it), *std::get<1>(it) ); }
    371375        };
     376
    372377        iterator begin() { return iterator( std::get<0>(args).begin(), std::get<1>(args).begin() ); }
    373378        iterator end() { return iterator( std::get<0>(args).end(), std::get<1>(args).end() ); }
    374 
    375 private:
    376         std::tuple<T1, T2> args;
    377379};
    378380
  • src/Concurrency/Keywords.cc

    rb96ec83 r6840e7c  
    196196                std::list<DeclarationWithType*> findMutexArgs( FunctionDecl* );
    197197                void validate( DeclarationWithType * );
     198                void addDtorStatments( FunctionDecl* func, CompoundStmt *, const std::list<DeclarationWithType * > &);
    198199                void addStatments( FunctionDecl* func, CompoundStmt *, const std::list<DeclarationWithType * > &);
    199200
     
    206207                StructDecl* monitor_decl = nullptr;
    207208                StructDecl* guard_decl = nullptr;
     209                StructDecl* dtor_guard_decl = nullptr;
    208210
    209211                static std::unique_ptr< Type > generic_func;
     
    229231
    230232                void postvisit( FunctionDecl * decl );
     233                void previsit ( StructDecl   * decl );
    231234
    232235                void addStartStatement( FunctionDecl * decl, DeclarationWithType * param );
     
    236239                        acceptAll( translationUnit, impl );
    237240                }
     241
     242          private :
     243                bool thread_ctor_seen = false;
     244                StructDecl * thread_decl = nullptr;
    238245        };
    239246
     
    403410                if( mutexArgs.empty() ) return;
    404411
     412                if( CodeGen::isConstructor(decl->name) ) throw SemanticError( "constructors cannot have mutex parameters", decl );
     413
     414                bool isDtor = CodeGen::isDestructor( decl->name );
     415
     416                if( isDtor && mutexArgs.size() != 1 ) throw SemanticError( "destructors can only have 1 mutex argument", decl );
     417
    405418                for(auto arg : mutexArgs) {
    406419                        validate( arg );
     
    412425                if( !monitor_decl ) throw SemanticError( "mutex keyword requires monitors to be in scope, add #include <monitor>", decl );
    413426                if( !guard_decl ) throw SemanticError( "mutex keyword requires monitors to be in scope, add #include <monitor>", decl );
    414 
    415                 addStatments( decl, body, mutexArgs );
     427                if( !dtor_guard_decl ) throw SemanticError( "mutex keyword requires monitors to be in scope, add #include <monitor>", decl );
     428
     429                if( isDtor ) {
     430                        addDtorStatments( decl, body, mutexArgs );
     431                }
     432                else {
     433                        addStatments( decl, body, mutexArgs );
     434                }
    416435        }
    417436
     
    425444                        assert( !guard_decl );
    426445                        guard_decl = decl;
     446                }
     447                else if( decl->name == "monitor_dtor_guard_t" ) {
     448                        assert( !dtor_guard_decl );
     449                        dtor_guard_decl = decl;
    427450                }
    428451        }
     
    457480                //Make sure that typed isn't mutex
    458481                if( base->get_mutex() ) throw SemanticError( "mutex keyword may only appear once per argument ", arg );
     482        }
     483
     484        void MutexKeyword::addDtorStatments( FunctionDecl* func, CompoundStmt * body, const std::list<DeclarationWithType * > & args ) {
     485                Type * arg_type = args.front()->get_type()->clone();
     486                arg_type->set_mutex( false );
     487
     488                ObjectDecl * monitors = new ObjectDecl(
     489                        "__monitor",
     490                        noStorage,
     491                        LinkageSpec::Cforall,
     492                        nullptr,
     493                        new PointerType(
     494                                noQualifiers,
     495                                new StructInstType(
     496                                        noQualifiers,
     497                                        monitor_decl
     498                                )
     499                        ),
     500                        new SingleInit( new UntypedExpr(
     501                                new NameExpr( "get_monitor" ),
     502                                {  new CastExpr( new VariableExpr( args.front() ), arg_type ) }
     503                        ))
     504                );
     505
     506                assert(generic_func);
     507
     508                //in reverse order :
     509                // monitor_guard_t __guard = { __monitors, #, func };
     510                body->push_front(
     511                        new DeclStmt( noLabels, new ObjectDecl(
     512                                "__guard",
     513                                noStorage,
     514                                LinkageSpec::Cforall,
     515                                nullptr,
     516                                new StructInstType(
     517                                        noQualifiers,
     518                                        dtor_guard_decl
     519                                ),
     520                                new ListInit(
     521                                        {
     522                                                new SingleInit( new AddressExpr( new VariableExpr( monitors ) ) ),
     523                                                new SingleInit( new CastExpr( new VariableExpr( func ), generic_func->clone() ) )
     524                                        },
     525                                        noDesignators,
     526                                        true
     527                                )
     528                        ))
     529                );
     530
     531                //monitor_desc * __monitors[] = { get_monitor(a), get_monitor(b) };
     532                body->push_front( new DeclStmt( noLabels, monitors) );
    459533        }
    460534
     
    523597        // General entry routine
    524598        //=============================================================================================
     599        void ThreadStarter::previsit( StructDecl * decl ) {
     600                if( decl->name == "thread_desc" && decl->body ) {
     601                        assert( !thread_decl );
     602                        thread_decl = decl;
     603                }
     604        }
     605
    525606        void ThreadStarter::postvisit(FunctionDecl * decl) {
    526607                if( ! CodeGen::isConstructor(decl->name) ) return;
     608
     609                Type * typeof_this = InitTweak::getTypeofThis(decl->type);
     610                StructInstType * ctored_type = dynamic_cast< StructInstType * >( typeof_this );
     611                if( ctored_type && ctored_type->baseStruct == thread_decl ) {
     612                        thread_ctor_seen = true;
     613                }
    527614
    528615                DeclarationWithType * param = decl->get_functionType()->get_parameters().front();
    529616                auto type  = dynamic_cast< StructInstType * >( InitTweak::getPointerBase( param->get_type() ) );
    530617                if( type && type->get_baseStruct()->is_thread() ) {
     618                        if( !thread_decl || !thread_ctor_seen ) {
     619                                throw SemanticError("thread keyword requires threads to be in scope, add #include <thread>");
     620                        }
     621
    531622                        addStartStatement( decl, param );
    532623                }
  • src/Concurrency/Waitfor.cc

    rb96ec83 r6840e7c  
    190190
    191191                Statement * makeAccStatement( DeclarationWithType * object, unsigned long index, const std::string & member, Expression * value, const SymTab::Indexer & indexer ) {
    192                         std::unique_ptr< Expression > expr( makeOpAssign(
     192                        Expression * expr = makeOpAssign(
    193193                                makeOpMember(
    194194                                        makeOpIndex(
     
    199199                                ),
    200200                                value
    201                         ) );
    202 
    203                         return new ExprStmt( noLabels, ResolvExpr::findVoidExpression( expr.get(), indexer ) );
     201                        );
     202
     203                        ResolvExpr::findVoidExpression( expr, indexer );
     204
     205                        return new ExprStmt( noLabels, expr );
    204206                }
    205207
     
    313315                stmt->push_back( new DeclStmt( noLabels, acceptables) );
    314316
    315                 UntypedExpr * set = new UntypedExpr(
     317                Expression * set = new UntypedExpr(
    316318                        new NameExpr( "__builtin_memset" ),
    317319                        {
     
    322324                );
    323325
    324                 Expression * resolved_set = ResolvExpr::findVoidExpression( set, indexer );
    325                 delete set;
    326 
    327                 stmt->push_back( new ExprStmt( noLabels, resolved_set ) );
     326                ResolvExpr::findVoidExpression( set, indexer );
     327
     328                stmt->push_back( new ExprStmt( noLabels, set ) );
    328329
    329330                return acceptables;
     
    346347
    347348        Statement * GenerateWaitForPass::makeSetter( ObjectDecl * flag ) {
    348                 Expression * untyped = new UntypedExpr(
     349                Expression * expr = new UntypedExpr(
    349350                        new NameExpr( "?=?" ),
    350351                        {
     
    354355                );
    355356
    356                 Expression * expr = ResolvExpr::findVoidExpression( untyped, indexer );
    357                 delete untyped;
     357                ResolvExpr::findVoidExpression( expr, indexer );
    358358
    359359                return new ExprStmt( noLabels, expr );
     
    379379                        new ListInit(
    380380                                map_range < std::list<Initializer*> > ( clause.target.arguments, [this](Expression * expr ){
    381                                         Expression * untyped = new CastExpr(
     381                                        Expression * init = new CastExpr(
    382382                                                new UntypedExpr(
    383383                                                        new NameExpr( "get_monitor" ),
     
    393393                                        );
    394394
    395                                         Expression * init = ResolvExpr::findSingleExpression( untyped, indexer );
    396                                         delete untyped;
     395                                        ResolvExpr::findSingleExpression( init, indexer );
    397396                                        return new SingleInit( init );
    398397                                })
  • src/GenPoly/Box.cc

    rb96ec83 r6840e7c  
    600600
    601601                        // add size/align for generic types to parameter list
    602                         if ( ! appExpr->get_function()->has_result() ) return;
     602                        if ( ! appExpr->get_function()->result ) return;
    603603                        FunctionType *funcType = getFunctionType( appExpr->get_function()->get_result() );
    604604                        assert( funcType );
     
    714714
    715715                void Pass1::boxParam( Type *param, Expression *&arg, const TyVarMap &exprTyVars ) {
    716                         assertf( arg->has_result(), "arg does not have result: %s", toString( arg ).c_str() );
    717                         if ( isPolyType( param, exprTyVars ) ) {
    718                                 Type * newType = arg->get_result()->clone();
     716                        assertf( arg->result, "arg does not have result: %s", toString( arg ).c_str() );
     717                        if ( ! needsBoxing( param, arg->result, exprTyVars, env ) ) return;
     718
     719                        if ( arg->result->get_lvalue() ) {
     720                                // argument expression may be CFA lvalue, but not C lvalue -- apply generalizedLvalue transformations.
     721                                // if ( VariableExpr * varExpr = dynamic_cast< VariableExpr * >( arg ) ) {
     722                                //      if ( dynamic_cast<ArrayType *>( varExpr->var->get_type() ) ){
     723                                //              // temporary hack - don't box arrays, because &arr is not the same as &arr[0]
     724                                //              return;
     725                                //      }
     726                                // }
     727                                arg =  generalizedLvalue( new AddressExpr( arg ) );
     728                                if ( ! ResolvExpr::typesCompatible( param, arg->get_result(), SymTab::Indexer() ) ) {
     729                                        // silence warnings by casting boxed parameters when the actual type does not match up with the formal type.
     730                                        arg = new CastExpr( arg, param->clone() );
     731                                }
     732                        } else {
     733                                // use type computed in unification to declare boxed variables
     734                                Type * newType = param->clone();
    719735                                if ( env ) env->apply( newType );
    720                                 std::unique_ptr<Type> manager( newType );
    721                                 if ( isPolyType( newType ) ) {
    722                                         // if the argument's type is polymorphic, we don't need to box again!
    723                                         return;
    724                                 } else if ( arg->get_result()->get_lvalue() ) {
    725                                         // argument expression may be CFA lvalue, but not C lvalue -- apply generalizedLvalue transformations.
    726                                         // if ( VariableExpr * varExpr = dynamic_cast< VariableExpr * >( arg ) ) {
    727                                         //      if ( dynamic_cast<ArrayType *>( varExpr->var->get_type() ) ){
    728                                         //              // temporary hack - don't box arrays, because &arr is not the same as &arr[0]
    729                                         //              return;
    730                                         //      }
    731                                         // }
    732                                         arg =  generalizedLvalue( new AddressExpr( arg ) );
    733                                         if ( ! ResolvExpr::typesCompatible( param, arg->get_result(), SymTab::Indexer() ) ) {
    734                                                 // silence warnings by casting boxed parameters when the actual type does not match up with the formal type.
    735                                                 arg = new CastExpr( arg, param->clone() );
    736                                         }
    737                                 } else {
    738                                         // use type computed in unification to declare boxed variables
    739                                         Type * newType = param->clone();
    740                                         if ( env ) env->apply( newType );
    741                                         ObjectDecl *newObj = new ObjectDecl( tempNamer.newName(), Type::StorageClasses(), LinkageSpec::C, 0, newType, 0 );
    742                                         newObj->get_type()->get_qualifiers() = Type::Qualifiers(); // TODO: is this right???
    743                                         stmtsToAddBefore.push_back( new DeclStmt( noLabels, newObj ) );
    744                                         UntypedExpr *assign = new UntypedExpr( new NameExpr( "?=?" ) ); // TODO: why doesn't this just use initialization syntax?
    745                                         assign->get_args().push_back( new VariableExpr( newObj ) );
    746                                         assign->get_args().push_back( arg );
    747                                         stmtsToAddBefore.push_back( new ExprStmt( noLabels, assign ) );
    748                                         arg = new AddressExpr( new VariableExpr( newObj ) );
    749                                 } // if
     736                                ObjectDecl *newObj = new ObjectDecl( tempNamer.newName(), Type::StorageClasses(), LinkageSpec::C, 0, newType, 0 );
     737                                newObj->get_type()->get_qualifiers() = Type::Qualifiers(); // TODO: is this right???
     738                                stmtsToAddBefore.push_back( new DeclStmt( noLabels, newObj ) );
     739                                UntypedExpr *assign = new UntypedExpr( new NameExpr( "?=?" ) ); // TODO: why doesn't this just use initialization syntax?
     740                                assign->get_args().push_back( new VariableExpr( newObj ) );
     741                                assign->get_args().push_back( arg );
     742                                stmtsToAddBefore.push_back( new ExprStmt( noLabels, assign ) );
     743                                arg = new AddressExpr( new VariableExpr( newObj ) );
    750744                        } // if
    751745                }
     
    965959                                if ( varExpr->get_var()->get_linkage() == LinkageSpec::Intrinsic ) {
    966960                                        if ( varExpr->get_var()->get_name() == "?[?]" ) {
    967                                                 assert( appExpr->has_result() );
     961                                                assert( appExpr->result );
    968962                                                assert( appExpr->get_args().size() == 2 );
    969963                                                Type *baseType1 = isPolyPtr( appExpr->get_args().front()->get_result(), scopeTyVars, env );
     
    999993                                                } // if
    1000994                                        } else if ( varExpr->get_var()->get_name() == "*?" ) {
    1001                                                 assert( appExpr->has_result() );
     995                                                assert( appExpr->result );
    1002996                                                assert( ! appExpr->get_args().empty() );
    1003997                                                if ( isPolyType( appExpr->get_result(), scopeTyVars, env ) ) {
     
    10161010                                                } // if
    10171011                                        } else if ( varExpr->get_var()->get_name() == "?++" || varExpr->get_var()->get_name() == "?--" ) {
    1018                                                 assert( appExpr->has_result() );
     1012                                                assert( appExpr->result );
    10191013                                                assert( appExpr->get_args().size() == 1 );
    10201014                                                if ( Type *baseType = isPolyPtr( appExpr->get_result(), scopeTyVars, env ) ) {
     
    10361030                                                } // if
    10371031                                        } else if ( varExpr->get_var()->get_name() == "++?" || varExpr->get_var()->get_name() == "--?" ) {
    1038                                                 assert( appExpr->has_result() );
     1032                                                assert( appExpr->result );
    10391033                                                assert( appExpr->get_args().size() == 1 );
    10401034                                                if ( Type *baseType = isPolyPtr( appExpr->get_result(), scopeTyVars, env ) ) {
     
    10421036                                                } // if
    10431037                                        } else if ( varExpr->get_var()->get_name() == "?+?" || varExpr->get_var()->get_name() == "?-?" ) {
    1044                                                 assert( appExpr->has_result() );
     1038                                                assert( appExpr->result );
    10451039                                                assert( appExpr->get_args().size() == 2 );
    10461040                                                Type *baseType1 = isPolyPtr( appExpr->get_args().front()->get_result(), scopeTyVars, env );
     
    10681062                                                } // if
    10691063                                        } else if ( varExpr->get_var()->get_name() == "?+=?" || varExpr->get_var()->get_name() == "?-=?" ) {
    1070                                                 assert( appExpr->has_result() );
     1064                                                assert( appExpr->result );
    10711065                                                assert( appExpr->get_args().size() == 2 );
    10721066                                                Type *baseType = isPolyPtr( appExpr->get_result(), scopeTyVars, env );
     
    11621156                void Pass1::premutate( AddressExpr * ) { visit_children = false; }
    11631157                Expression * Pass1::postmutate( AddressExpr * addrExpr ) {
    1164                         assert( addrExpr->get_arg()->has_result() && ! addrExpr->get_arg()->get_result()->isVoid() );
     1158                        assert( addrExpr->get_arg()->result && ! addrExpr->get_arg()->get_result()->isVoid() );
    11651159
    11661160                        bool needs = false;
    11671161                        if ( UntypedExpr *expr = dynamic_cast< UntypedExpr *>( addrExpr->get_arg() ) ) {
    1168                                 if ( expr->has_result() && isPolyType( expr->get_result(), scopeTyVars, env ) ) {
     1162                                if ( expr->result && isPolyType( expr->get_result(), scopeTyVars, env ) ) {
    11691163                                        if ( NameExpr *name = dynamic_cast< NameExpr *>( expr->get_function() ) ) {
    11701164                                                if ( name->get_name() == "*?" ) {
    11711165                                                        if ( ApplicationExpr * appExpr = dynamic_cast< ApplicationExpr * >( expr->get_args().front() ) ) {
    1172                                                                 assert( appExpr->get_function()->has_result() );
     1166                                                                assert( appExpr->get_function()->result );
    11731167                                                                FunctionType *function = getFunctionType( appExpr->get_function()->get_result() );
    11741168                                                                assert( function );
  • src/GenPoly/FindFunction.cc

    rb96ec83 r6840e7c  
    1818#include <utility>                      // for pair
    1919
     20#include "Common/PassVisitor.h"         // for PassVisitor
    2021#include "Common/SemanticError.h"       // for SemanticError
    2122#include "GenPoly/ErasableScopedMap.h"  // for ErasableScopedMap<>::iterator
     
    2728
    2829namespace GenPoly {
    29         class FindFunction : public Mutator {
     30        class FindFunction : public WithGuards, public WithVisitorRef<FindFunction>, public WithShortCircuiting {
    3031          public:
    3132                FindFunction( std::list< FunctionType* > &functions, const TyVarMap &tyVars, bool replaceMode, FindFunctionPredicate predicate );
    3233
    33                 virtual Type *mutate( FunctionType *functionType );
    34                 virtual Type *mutate( PointerType *pointerType );
     34                void premutate( FunctionType * functionType );
     35                Type * postmutate( FunctionType * functionType );
     36                void premutate( PointerType * pointerType );
    3537          private:
    3638                void handleForall( const Type::ForallList &forall );
     
    4345
    4446        void findFunction( Type *type, std::list< FunctionType* > &functions, const TyVarMap &tyVars, FindFunctionPredicate predicate ) {
    45                 FindFunction finder( functions, tyVars, false, predicate );
     47                PassVisitor<FindFunction> finder( functions, tyVars, false, predicate );
    4648                type->acceptMutator( finder );
    4749        }
    4850
    4951        void findAndReplaceFunction( Type *&type, std::list< FunctionType* > &functions, const TyVarMap &tyVars, FindFunctionPredicate predicate ) {
    50                 FindFunction finder( functions, tyVars, true, predicate );
     52                PassVisitor<FindFunction> finder( functions, tyVars, true, predicate );
    5153                type = type->acceptMutator( finder );
    5254        }
     
    5759
    5860        void FindFunction::handleForall( const Type::ForallList &forall ) {
    59                 for ( Type::ForallList::const_iterator i = forall.begin(); i != forall.end(); ++i ) {
    60                         TyVarMap::iterator var = tyVars.find( (*i)->get_name() );
     61                for ( const Declaration * td : forall ) {
     62                        TyVarMap::iterator var = tyVars.find( td->name );
    6163                        if ( var != tyVars.end() ) {
    6264                                tyVars.erase( var->first );
     
    6567        }
    6668
    67         Type * FindFunction::mutate( FunctionType *functionType ) {
    68                 tyVars.beginScope();
     69        void FindFunction::premutate( FunctionType * functionType ) {
     70                visit_children = false;
     71                GuardScope( tyVars );
    6972                handleForall( functionType->get_forall() );
    70                 mutateAll( functionType->get_returnVals(), *this );
     73                mutateAll( functionType->get_returnVals(), *visitor );
     74        }
     75
     76        Type * FindFunction::postmutate( FunctionType * functionType ) {
    7177                Type *ret = functionType;
    7278                if ( predicate( functionType, tyVars ) ) {
     
    7783                        } // if
    7884                } // if
    79                 tyVars.endScope();
    8085                return ret;
    8186        }
    8287
    83         Type * FindFunction::mutate( PointerType *pointerType ) {
    84                 tyVars.beginScope();
     88        void FindFunction::premutate( PointerType * pointerType ) {
     89                GuardScope( tyVars );
    8590                handleForall( pointerType->get_forall() );
    86                 Type *ret = Mutator::mutate( pointerType );
    87                 tyVars.endScope();
    88                 return ret;
    8991        }
    9092} // namespace GenPoly
  • src/GenPoly/GenPoly.cc

    rb96ec83 r6840e7c  
    432432        }
    433433
     434        bool needsBoxing( Type * param, Type * arg, const TyVarMap &exprTyVars, TypeSubstitution * env ) {
     435                // is parameter is not polymorphic, don't need to box
     436                if ( ! isPolyType( param, exprTyVars ) ) return false;
     437                Type * newType = arg->clone();
     438                if ( env ) env->apply( newType );
     439                std::unique_ptr<Type> manager( newType );
     440                // if the argument's type is polymorphic, we don't need to box again!
     441                return ! isPolyType( newType );
     442        }
     443
     444        bool needsBoxing( Type * param, Type * arg, ApplicationExpr * appExpr, TypeSubstitution * env ) {
     445                FunctionType * function = getFunctionType( appExpr->function->result );
     446                assertf( function, "ApplicationExpr has non-function type: %s", toString( appExpr->function->result ).c_str() );
     447                TyVarMap exprTyVars( TypeDecl::Data{} );
     448                makeTyVarMap( function, exprTyVars );
     449                return needsBoxing( param, arg, exprTyVars, env );
     450        }
     451
    434452        void addToTyVarMap( TypeDecl * tyVar, TyVarMap &tyVarMap ) {
    435453                // xxx - should this actually be insert?
  • src/GenPoly/GenPoly.h

    rb96ec83 r6840e7c  
    8080        bool typesPolyCompatible( Type *aty, Type *bty );
    8181
     82        /// true if arg requires boxing given exprTyVars
     83        bool needsBoxing( Type * param, Type * arg, const TyVarMap &exprTyVars, TypeSubstitution * env );
     84
     85        /// true if arg requires boxing in the call to appExpr
     86        bool needsBoxing( Type * param, Type * arg, ApplicationExpr * appExpr, TypeSubstitution * env );
     87
    8288        /// Adds the type variable `tyVar` to `tyVarMap`
    8389        void addToTyVarMap( TypeDecl * tyVar, TyVarMap &tyVarMap );
  • src/GenPoly/Specialize.cc

    rb96ec83 r6840e7c  
    147147
    148148        Expression * Specialize::doSpecialization( Type *formalType, Expression *actual, InferredParams *inferParams ) {
    149                 assertf( actual->has_result(), "attempting to specialize an untyped expression" );
     149                assertf( actual->result, "attempting to specialize an untyped expression" );
    150150                if ( needsSpecialization( formalType, actual->get_result(), env ) ) {
    151151                        if ( FunctionType *funType = getFunctionType( formalType ) ) {
  • src/GenPoly/module.mk

    rb96ec83 r6840e7c  
    2020       GenPoly/Lvalue.cc \
    2121       GenPoly/Specialize.cc \
    22        GenPoly/CopyParams.cc \
    2322       GenPoly/FindFunction.cc \
    2423       GenPoly/InstantiateGeneric.cc
  • src/InitTweak/FixInit.cc

    rb96ec83 r6840e7c  
    9494                        /// true if type does not need to be copy constructed to ensure correctness
    9595                        bool skipCopyConstruct( Type * type );
    96                         void copyConstructArg( Expression *& arg, ImplicitCopyCtorExpr * impCpCtorExpr );
     96                        void copyConstructArg( Expression *& arg, ImplicitCopyCtorExpr * impCpCtorExpr, Type * formal );
    9797                        void destructRet( ObjectDecl * ret, ImplicitCopyCtorExpr * impCpCtorExpr );
    9898
     
    259259
    260260                GenStructMemberCalls::generate( translationUnit );
     261
    261262                // xxx - ctor expansion currently has to be after FixCopyCtors, because there is currently a
    262263                // hack in the way untyped assignments are generated, where the first argument cannot have
     
    288289                        for ( std::list< Declaration * >::iterator i = translationUnit.begin(); i != translationUnit.end(); ++i ) {
    289290                                try {
    290                                         *i = maybeMutate( *i, fixer );
     291                                        maybeMutate( *i, fixer );
    291292                                        translationUnit.splice( i, fixer.pass.staticDtorDecls );
    292293                                } catch( SemanticError &e ) {
     
    322323
    323324                Expression * InsertImplicitCalls::postmutate( ApplicationExpr * appExpr ) {
    324                         assert( appExpr );
    325 
    326325                        if ( VariableExpr * function = dynamic_cast< VariableExpr * > ( appExpr->get_function() ) ) {
    327                                 if ( LinkageSpec::isBuiltin( function->get_var()->get_linkage() ) ) {
     326                                if ( function->var->linkage.is_builtin ) {
    328327                                        // optimization: don't need to copy construct in order to call intrinsic functions
    329328                                        return appExpr;
     
    331330                                        FunctionType * ftype = dynamic_cast< FunctionType * >( GenPoly::getFunctionType( funcDecl->get_type() ) );
    332331                                        assertf( ftype, "Function call without function type: %s", toString( funcDecl ).c_str() );
    333                                         if ( CodeGen::isConstructor( funcDecl->get_name() ) && ftype->get_parameters().size() == 2 ) {
    334                                                 Type * t1 = getPointerBase( ftype->get_parameters().front()->get_type() );
    335                                                 Type * t2 = ftype->get_parameters().back()->get_type();
     332                                        if ( CodeGen::isConstructor( funcDecl->get_name() ) && ftype->parameters.size() == 2 ) {
     333                                                Type * t1 = getPointerBase( ftype->parameters.front()->get_type() );
     334                                                Type * t2 = ftype->parameters.back()->get_type();
    336335                                                assert( t1 );
    337336
     
    366365                        ImplicitCtorDtorStmt * stmt = genCtorDtor( fname, var, cpArg );
    367366                        ExprStmt * exprStmt = strict_dynamic_cast< ExprStmt * >( stmt->get_callStmt() );
    368                         Expression * untyped = exprStmt->get_expr();
     367                        Expression * resolved = exprStmt->expr;
     368                        exprStmt->expr = nullptr; // take ownership of expr
    369369
    370370                        // resolve copy constructor
    371371                        // should only be one alternative for copy ctor and dtor expressions, since all arguments are fixed
    372372                        // (VariableExpr and already resolved expression)
    373                         CP_CTOR_PRINT( std::cerr << "ResolvingCtorDtor " << untyped << std::endl; )
    374                         Expression * resolved = ResolvExpr::findVoidExpression( untyped, indexer );
     373                        CP_CTOR_PRINT( std::cerr << "ResolvingCtorDtor " << resolved << std::endl; )
     374                        ResolvExpr::findVoidExpression( resolved, indexer );
    375375                        assert( resolved );
    376376                        if ( resolved->get_env() ) {
     
    380380                                resolved->set_env( nullptr );
    381381                        } // if
    382 
    383382                        delete stmt;
    384383                        return resolved;
    385384                }
    386385
    387                 void ResolveCopyCtors::copyConstructArg( Expression *& arg, ImplicitCopyCtorExpr * impCpCtorExpr ) {
     386                void ResolveCopyCtors::copyConstructArg( Expression *& arg, ImplicitCopyCtorExpr * impCpCtorExpr, Type * formal ) {
    388387                        static UniqueName tempNamer("_tmp_cp");
    389388                        assert( env );
    390389                        CP_CTOR_PRINT( std::cerr << "Type Substitution: " << *env << std::endl; )
    391                         assert( arg->has_result() );
    392                         Type * result = arg->get_result();
     390                        assert( arg->result );
     391                        Type * result = arg->result;
    393392                        if ( skipCopyConstruct( result ) ) return; // skip certain non-copyable types
    394393
    395                         // type may involve type variables, so apply type substitution to get temporary variable's actual type
     394                        // type may involve type variables, so apply type substitution to get temporary variable's actual type.
     395                        // Use applyFree so that types bound in function pointers are not substituted, e.g. in forall(dtype T) void (*)(T).
    396396                        result = result->clone();
    397                         env->apply( result );
     397                        env->applyFree( result );
    398398                        ObjectDecl * tmp = ObjectDecl::newObject( "__tmp", result, nullptr );
    399399                        tmp->get_type()->set_const( false );
     
    406406                                // if the chosen constructor is intrinsic, the copy is unnecessary, so
    407407                                // don't create the temporary and don't call the copy constructor
    408                                 VariableExpr * function = dynamic_cast< VariableExpr * >( appExpr->get_function() );
    409                                 assert( function );
    410                                 if ( function->get_var()->get_linkage() == LinkageSpec::Intrinsic ) return;
     408                                VariableExpr * function = strict_dynamic_cast< VariableExpr * >( appExpr->function );
     409                                if ( function->var->linkage == LinkageSpec::Intrinsic ) {
     410                                        // arguments that need to be boxed need a temporary regardless of whether the copy constructor is intrinsic,
     411                                        // so that the object isn't changed inside of the polymorphic function
     412                                        if ( ! GenPoly::needsBoxing( formal, result, impCpCtorExpr->callExpr, env ) ) return;
     413                                }
    411414                        }
    412415
     
    416419                        // replace argument to function call with temporary
    417420                        arg = new CommaExpr( cpCtor, new VariableExpr( tmp ) );
    418                         impCpCtorExpr->get_tempDecls().push_back( tmp );
    419                         impCpCtorExpr->get_dtors().push_front( makeCtorDtor( "^?{}", tmp ) );
     421                        impCpCtorExpr->tempDecls.push_back( tmp );
     422                        impCpCtorExpr->dtors.push_front( makeCtorDtor( "^?{}", tmp ) );
    420423                }
    421424
     
    427430                        CP_CTOR_PRINT( std::cerr << "ResolveCopyCtors: " << impCpCtorExpr << std::endl; )
    428431
    429                         ApplicationExpr * appExpr = impCpCtorExpr->get_callExpr();
     432                        ApplicationExpr * appExpr = impCpCtorExpr->callExpr;
    430433
    431434