Index: doc/bibliography/pl.bib
===================================================================
--- doc/bibliography/pl.bib	(revision 97748eeb1df13abd9185514085cc78246db06552)
+++ doc/bibliography/pl.bib	(revision a00bc5b670e302caf7b7a128c32475c702fd7890)
@@ -990,4 +990,16 @@
 }
 
+@techreport{cfa-cc,
+    keywords	= {Cforall, cfa-cc, transpiler},
+    contributer	= {pabuhr@plg},
+    title	= {{\textsf{cfa-cc}} Developer's Reference Manual},
+    author	= {Fangren Yu},
+    institution	= {School of Computer Science},
+    address	= {University of Waterloo, Waterloo, Ontario, Canada},
+    month	= aug,
+    year	= {2020},
+    note	= {\href{https://cforall.uwaterloo.ca/doc/Fangren_Yu_Report_S20.pdf}{https://\-cforall.uwaterloo.ca/\-doc/\-Fangren\_Yu\_Report\_S20.pdf}},
+}
+
 @article{Moss18,
     keywords	= {type systems, polymorphism, tuples, Cforall},
@@ -1040,5 +1052,5 @@
     keywords 	= {type system, generic type, resolution algorithm, type environment, Cforall},
     author	= {Aaron Moss},
-    title	= {\textsf{C}$\mathbf{\forall}$ Type System Implementation},
+    title	= {\textsf{C}\,$\mathbf{\forall}$ Type System Implementation},
     school	= {School of Computer Science, University of Waterloo},
     year	= 2019,
@@ -1161,8 +1173,8 @@
     keywords	= {ctrie, concurrent map},
     contributer = {a3moss@uwaterloo.ca}, 
-    title	={Cache-aware lock-free concurrent hash tries},
-    author	={Prokopec, Aleksandar and Bagwell, Phil and Odersky, Martin},
-    institution	={EPFL},
-    year	={2011}
+    title	= {Cache-aware lock-free concurrent hash tries},
+    author	= {Prokopec, Aleksandar and Bagwell, Phil and Odersky, Martin},
+    institution	= {EPFL},
+    year	= {2011}
 }
 
@@ -3928,5 +3940,5 @@
     school	= {School of Computer Science, University of Waterloo},
     year	= 2003,
-    address	= {Waterloo, Ontario, Canada, N2L 3G1},
+    optaddress	= {Waterloo, Ontario, Canada, N2L 3G1},
     note	= {\href{http://plg.uwaterloo.ca/theses/BilsonThesis.pdf}{http://\-plg.uwaterloo.ca/\-theses/\-BilsonThesis.pdf}},
 }
@@ -5210,4 +5222,14 @@
 }
 
+@manual{gcc-nested-func,
+    keywords	= {gcc nested functions},
+    contributer	= {pabuhr@plg},
+    key		= {gcc nested functions},
+    title	= {Nested Functions},
+    organization= {{gcc} 9.3 Manual},
+    year	= 2019,
+    note	= {\href{https://gcc.gnu.org/onlinedocs/gcc-9.3.0/gcc/Nested-Functions.html}{https://\-gcc.gnu.org/\-onlinedocs/\-gcc-9.3.0/\-gcc/\-Nested-Functions.html}},
+}
+
 @article{Haddon77,
     keywords	= {monitors, nested monitor calls},
Index: doc/theses/andrew_beach_MMath/existing.tex
===================================================================
--- doc/theses/andrew_beach_MMath/existing.tex	(revision 97748eeb1df13abd9185514085cc78246db06552)
+++ doc/theses/andrew_beach_MMath/existing.tex	(revision a00bc5b670e302caf7b7a128c32475c702fd7890)
@@ -83,6 +83,6 @@
 the the call site.
 
-As an example, even if no function named \codeCFA{do\_once} is not defined
-near the definition of \codeCFA{do\_twice} the following code will work.
+As an example, even if no function named \codeCFA{do_once} is defined
+near the definition of \codeCFA{do_twice} the following code will work.
 \begin{lstlisting}
 int quadruple(int x) {
@@ -95,8 +95,8 @@
 \end{lstlisting}
 This is not the recommended way to implement a quadruple function but it
-does work. The complier will deduce that \codeCFA{do\_twice}'s T is an
+does work. The compiler will deduce that \codeCFA{do_twice}'s T is an
 integer from the argument. It will then look for a definition matching the
-assertion which is the \codeCFA{do\_once} defined within the function. That
-function will be passed in as a function pointer to \codeCFA{do\_twice} and
+assertion which is the \codeCFA{do_once} defined within the function. That
+function will be passed in as a function pointer to \codeCFA{do_twice} and
 called within it.
 
@@ -156,5 +156,5 @@
 In \CFA coroutines are created using the \codeCFA{coroutine} keyword which
 works just like \codeCFA{struct} except that the created structure will be
-modified by the compiler to satify the \codeCFA{is\_coroutine} trait.
+modified by the compiler to satisfy the \codeCFA{is_coroutine} trait.
 
 These structures act as the interface between callers and the coroutine,
Index: doc/theses/andrew_beach_MMath/implement.tex
===================================================================
--- doc/theses/andrew_beach_MMath/implement.tex	(revision a00bc5b670e302caf7b7a128c32475c702fd7890)
+++ doc/theses/andrew_beach_MMath/implement.tex	(revision a00bc5b670e302caf7b7a128c32475c702fd7890)
@@ -0,0 +1,480 @@
+\chapter{Implementation}
+% Goes over how all the features are implemented.
+
+\section{Virtual System}
+% Virtual table rules. Virtual tables, the pointer to them and the cast.
+The \CFA virtual system only has one public facing feature: virtual casts.
+However there is a lot of structure to support that and provide some other
+features for the standard library.
+
+All of this is accessed through a field inserted at the beginning of every
+virtual type. Currently it is called \codeC{virtual_table} but it is not
+meant to be accessed by the user. This field is a pointer to the type's
+virtual table instance. It is assigned once during the object's construction
+and left alone after that.
+
+\subsection{Virtual Table Construction}
+For each virtual type a virtual table is constructed. This is both a new type
+and an instance of that type. Other instances of the type could be created
+but the system doesn't use them. So this section will go over the creation of
+the type and the instance.
+
+Creating the single instance is actually very important. The address of the
+table acts as the unique identifier for the virtual type. Similarly the first
+field in every virtual table is the parent's id; a pointer to the parent
+virtual table instance.
+
+The remaining fields contain the type's virtual members. First come the ones
+present on the parent type, in the same order as they were in the parent, and
+then any that this type introduces. The types of the ones inherited from the
+parent may have a slightly modified type, in that references to the
+dispatched type are replaced with the current virtual type. These are always
+taken by pointer or reference.
+
+The structure itself is created where the virtual type is created. The name
+of the type is created by mangling the name of the base type. The name of the
+instance is also generated by name mangling.
+
+The fields are initialized automatically.
+The parent field is initialized by getting the type of the parent field and
+using that to calculate the mangled name of the parent's virtual table type.
+There are two special fields that are included like normal fields but have
+special initialization rules: the \codeC{size} field is the type's size and is
+initialized with a sizeof expression, the \codeC{align} field is the type's
+alignment and uses an alignof expression. The remaining fields are resolved
+to a name matching the field's name and type using the normal visibility
+and overload resolution rules of the type system.
+
+These operations are split up into several groups depending on where they
+take place which can vary for monomorphic and polymorphic types. The first
+division is between the declarations and the definitions. Declarations, such
+as a function signature or a structure's name, must always be visible but may
+be repeated so they go in headers. Definitions, such as function bodies and a
+structure's layout, don't have to be visible on use but must occur exactly
+once and go into source files.
+
+The declarations include the virtual type definition and forward declarations
+of the virtual table instance, constructor, message function and
+\codeCFA{get_exception_vtable}. The definition includes the storage and
+initialization of the virtual table instance and the bodies of the three
+functions.
+
+Monomorphic instances put all of these two groups in one place each.
+
+Polymorphic instances also split out the core declarations and definitions
+from the per-instance information. The virtual table type and most of the
+functions are polymorphic so they are all part of the core. The virtual table
+instance and the \codeCFA{get_exception_vtable} function form the per-instance part.
+
+Coroutines and threads need instances of \codeCFA{CoroutineCancelled} and
+\codeCFA{ThreadCancelled} respectively to use all of their functionality.
+When a new data type is declared with \codeCFA{coroutine} or \codeCFA{thread}
+the forward declaration for the instance is created as well. The definition
+of the virtual table is created at the definition of the main function.
+
+\subsection{Virtual Cast}
+Virtual casts are implemented as a function call that does the check and a
+old C-style cast to do the type conversion. The C-cast is just to make sure
+the generated code is correct so the rest of the section is about that
+function.
+
+The function is \codeC{__cfa__virtual_cast} and it is implemented in the
+standard library. It takes a pointer to the target type's virtual table and
+the object pointer being cast. The function is very simple, getting the
+object's virtual table pointer and then checking to see if it or any of
+its ancestors, by using the parent pointers, are the same as the target type
+virtual table pointer. It does this in a simple loop.
+
+For the generated code a forward declaration of the virtual cast works as follows.
+There is a forward declaration of \codeC{__cfa__virtual_cast} in every cfa
+file so it can just be used. The object argument is the expression being cast
+so that is just placed in the argument list.
+
+To build the target type parameter the compiler will create a mapping from
+concrete type-name -- so for polymorphic types the parameters are filled in
+-- to virtual table address. Every virtual table declaration is added to
+this table; repeats are ignored unless they have conflicting definitions.
+This does mean the declarations have to be in scope, but they should usually
+be introduced as part of the type definition.
+
+\section{Exceptions}
+% Anything about exception construction.
+
+\section{Unwinding}
+% Adapt the unwind chapter, just describe the sections of libunwind used.
+% Mention that termination and cancellation use it. Maybe go into why
+% resumption doesn't as well.
+
+Many modern languages work with an internal stack that functions push and pop
+their local data to. Stack unwinding removes large sections of the stack,
+often across functions.
+
+At a very basic level this can be done with \codeC{setjmp} \& \codeC{longjmp}
+which simply move the top of the stack, discarding everything on the stack
+above a certain point. However this ignores all the clean-up code that should
+be run when certain sections of the stack are removed (for \CFA these are from
+destructors and finally clauses) and also requires that the point to which the
+stack is being unwound is known ahead of time. libunwind is used to address
+both of these problems.
+
+Libunwind, provided in \texttt{unwind.h} on most platforms, is a C library
+that provides \CPP style stack unwinding. Its operation is divided into two
+phases. The search phase -- phase 1 -- is used to scan the stack and decide
+where the unwinding will stop, this allows for a dynamic target. The clean-up
+phase -- phase 2 -- does the actual unwinding and also runs any clean-up code
+as it goes.
+
+To use the libunwind each function must have a personality function and an
+LSDA (Language Specific Data Area). Libunwind actually does very little, it
+simply moves down the stack from function to function. Most of the actions are
+implemented by the personality function which libunwind calls on every
+function. Since this is shared across many functions or even every function in
+a language it will need a bit more information. This is provided by the LSDA
+which has the unique information for each function.
+
+Theoretically the LSDA can contain anything but conventionally it is a table
+with entries representing areas of the function and what has to be done there
+during unwinding. These areas are described in terms of where the instruction
+pointer is. If the current value of the instruction pointer is between two
+values representing the beginning and end of a region then that region is
+currently being executed. These are used to mark out try blocks and the
+scopes of objects with destructors to run.
+
+GCC will generate an LSDA and attach its personality function with the
+\texttt{-fexceptions} flag. However this only handles the cleanup attribute.
+This attribute is used on a variable and specifies a function that should be
+run when the variable goes out of scope. The function is passed a pointer to
+the object as well so it can be used to mimic destructors. It however cannot
+be used to mimic try statements.
+
+\subsection{Implementing Personality Functions}
+Personality functions have a complex interface specified by libunwind.
+This section will cover some of the important parts of that interface.
+
+\begin{lstlisting}
+typedef _Unwind_Reason_Code (*_Unwind_Personality_Fn)(
+    int version,
+    _Unwind_Action action,
+    _Unwind_Exception_Class exception_class,
+    _Unwind_Exception * exception,
+    struct _Unwind_Context * context);
+\end{lstlisting}
+
+The return value, the reason code, is an enumeration of possible messages
+that can be passed several places in libunwind. It includes a number of
+messages for special cases (some of which should never be used by the
+personality function) and error codes but unless otherwise noted the
+personality function should always return \codeC{_URC_CONTINUE_UNWIND}.
+
+The \codeC{version} argument is the version of the implementation that is
+calling the personality function. At this point it appears to always be 1 and
+it will likely stay that way until a new version of the API is released.
+
+The \codeC{action} argument is a set of flags that tell the personality
+function when it is being called and what it must do on this invocation.
+The flags are as follows:
+\begin{itemize}
+\item\codeC{_UA_SEARCH_PHASE}: This flag is set whenever the personality
+function is called during the search phase. The personality function should
+decide if unwinding will stop in this function or not. If it does then the
+personality function should return \codeC{_URC_HANDLER_FOUND}.
+\item\codeC{_UA_CLEANUP_PHASE}: This flag is set whenever the personality
+function is called during the cleanup phase. If no other flags are set this
+means the entire frame will be unwound and all cleanup code should be run.
+\item\codeC{_UA_HANDLER_FRAME}: This flag is set during the cleanup phase
+on the function frame that found the handler. The personality function must
+prepare to return to normal code execution and return
+\codeC{_URC_INSTALL_CONTEXT}.
+\item\codeC{_UA_FORCE_UNWIND}: This flag is set if the personality function
+is called through a forced unwind call. Forced unwind only performs the
+cleanup phase and uses a different means to decide when to stop. See its
+section below.
+\end{itemize}
+
+The \codeC{exception_class} argument is a copy of the \codeC{exception}'s
+\codeC{exception_class} field.
+
+The \codeC{exception} argument is a pointer to the user provided storage
+object. It has two public fields, the exception class which is actually just
+a number that identifies the exception handling mechanism that created it and
+the other is the clean-up function. The clean-up function is called if the
+exception needs to be destroyed by a different exception framework.
+
+The \codeC{context} argument is a pointer to an opaque type. This is passed
+to the many helper functions that can be called inside the personality
+function.
+
+\subsection{Raise Exception}
+This could be considered the central function of libunwind. It performs the
+two staged unwinding the library is built around and most of the rest of the
+interface of libunwind is here to support it. Its signature is as follows:
+
+\begin{lstlisting}
+_Unwind_Reason_Code _Unwind_RaiseException(_Unwind_Exception *);
+\end{lstlisting}
+
+When called the function begins the search phase, calling the personality
+function of the most recent stack frame. It will continue to call personality
+functions traversing the stack new-to-old until a function finds a handler or
+the end of the stack is reached. In the latter case raise exception will
+return with \codeC{_URC_END_OF_STACK}.
+
+Once a handler has been found raise exception continues onto the cleanup
+phase. Once again it will call the personality functions of each stack frame
+from newest to oldest. This pass will stop at the stack frame that found the
+handler last time, if that personality function does not install the handler
+it is an error.
+
+If an error is encountered raise exception will return either
+\codeC{_URC_FATAL_PHASE1_ERROR} or \codeC{_URC_FATAL_PHASE2_ERROR} depending
+on when the error occurred.
+
+\subsection{Forced Unwind}
+This is the second big function in libunwind. It also unwinds a stack but it
+does not use the search phase. Instead another function, the stop function,
+is used to decide when to stop.
+
+\begin{lstlisting}
+_Unwind_Reason_Code _Unwind_ForcedUnwind(
+    _Unwind_Exception *, _Unwind_Stop_Fn, void *);
+\end{lstlisting}
+
+The exception is the same as the one passed to raise exception. The extra
+arguments are the stop function and the stop parameter. The stop function has
+a similar interface as a personality function, except it is also passed the
+stop parameter.
+
+\begin{lstlisting}
+typedef _Unwind_Reason_Code (*_Unwind_Stop_Fn)(
+    int version,
+    _Unwind_Action action,
+    _Unwind_Exception_Class exception_class,
+    _Unwind_Exception * exception,
+    struct _Unwind_Context * context,
+    void * stop_parameter);
+\end{lstlisting}
+
+The stop function is called at every stack frame before the personality
+function is called and then once more after all frames of the stack have
+been unwound.
+
+Each time it is called the stop function should return \codeC{_URC_NO_REASON}
+or transfer control directly to other code outside of libunwind. The
+framework does not provide any assistance here.
+
+Its arguments are the same as the paired personality function.
+The actions \codeC{_UA_CLEANUP_PHASE} and \codeC{_UA_FORCE_UNWIND} are always
+set when it is called. By the official standard that is all but both GCC and
+Clang add an extra action on the last call at the end of the stack:
+\codeC{_UA_END_OF_STACK}.
+
+\section{Exception Context}
+% Should I have another independent section?
+% There are only two things in it, top_resume and current_exception. How it is
+% stored changes depending on whether or not the thread-library is linked.
+
+The exception context is a piece of global storage used to maintain data
+across different exception operations and to communicate between different
+components.
+
+Each stack has its own exception context. In a purely sequential program, using
+only core Cforall, there is only one stack and the context is global. However
+if the library \texttt{libcfathread} is linked then there can be multiple
+stacks so they will each need their own.
+
+To handle this code always gets the exception context from the function
+\codeC{this_exception_context}. The main exception handling code is in
+\texttt{libcfa} and that library also defines the function as a weak symbol
+so it acts as a default. Meanwhile in \texttt{libcfathread} the function is
+defined as a strong symbol that replaces it when the libraries are linked
+together.
+
+The version of the function defined in \texttt{libcfa} is very simple. It
+returns a pointer to a global static variable. With only one stack this
+global instance is associated with the only stack.
+
+The version of the function defined in \texttt{libcfathread} has to handle
+more as there are multiple stacks. The exception context is included as
+part of the per-stack data stored as part of coroutines. In the cold data
+section, stored at the base of each stack, is the exception context for that
+stack. The \codeC{this_exception_context} uses the concurrency library to get
+the current coroutine and through it the cold data section and the exception
+context.
+
+\section{Termination}
+% Memory management & extra information, the custom function used to implement
+% catches. Talk about GCC nested functions.
+
+Termination exceptions use libunwind quite heavily because it matches the
+intended use from \CPP exceptions very closely. The main complication is that
+since the \CFA compiler works by translating to C code it cannot generate the
+assembly to form the LSDA for try blocks or destructors.
+
+\subsection{Memory Management}
+The first step of termination is to copy the exception into memory managed by
+the exception system. Currently the system just uses malloc, without reserved
+memory or any ``small allocation'' optimizations. The exception handling
+mechanism manages memory for the exception as well as memory for libunwind
+and the system's own per-exception storage.
+
+Exceptions are stored in a variable sized block. The first component is a fixed
+sized data structure that contains the information for libunwind and the
+exception system. The second component is a blob of memory that is big enough
+to store the exception. Macros with pointer arithmetic and type casts are
+used to move between the components or go from the embedded
+\codeC{_Unwind_Exception} to the entire node.
+
+All of these nodes are strung together in a linked list. One linked list per
+stack, with the head stored in the exception context. Within each linked list
+the most recently thrown exception is at the head and the older exceptions
+are further down the list. This list format allows exceptions to be thrown
+while a different exception is being handled. Only the exception at the head
+of the list is currently being handled, the others will wait for the
+exceptions before them to be removed.
+
+Several pieces of information are stored in the exception's virtual table. The size of the
+exception, the copy function and the free function are all in the virtual
+table so they are decided per-exception type. The size and copy function are
+used right away when the exception is copied in to managed memory. After the
+exception is handled the free function is used to clean up the exception and
+then the entire node is passed to free.
+
+\subsection{Try Statements \& Catch Clauses}
+The try statements with termination handlers have a pretty complex conversion
+to compensate for the lack of assembly generation. Libunwind requires an LSDA
+(Language Specific Data Area) and personality function for a function to
+unwind across it. The LSDA in particular is hard to generate at the level of
+C which is what the \CFA compiler outputs so a work-around is used.
+
+This work around is a function called \codeC{__cfaehm_try_terminate} in the
+standard library. The contents of a try block and the termination handlers
+are converted into functions. These are then passed to the try terminate
+function and it calls them. This puts the try statements in their own
+functions so that no function has to deal with both termination handlers and
+destructors.
+
+This function has some custom embedded assembly that defines its personality
+function and LSDA. This is hand coded in C which is why there is only one
+version of it, the compiler has no capability to generate it. The personality
+function is structured so that it may be expanded, but really it only handles
+this one function. Notably it does not handle any destructors so the function
+is constructed so that it does not need to run any.
+
+The three functions passed to try terminate are:
+\begin{itemize}
+\item The try function: This function is the try block, all the code inside
+the try block is placed inside the try function. It takes no parameters and
+has no return value. This function is called during regular execution to run
+the try block.
+\item The match function: This function decides if this try statement should
+handle any given termination exception. It takes a pointer to the exception
+and returns 0 if the exception is not handled here. Otherwise the return value
+is the id of the handler that should handle the exception. It is called
+during the search phase.
+It is constructed from the conditional part of each handler. It runs each
+check in turn, first checking to see if the object matches the handler's type.
+\item The catch function: This function handles the exception. It takes a
+pointer to the exception and the handler's id and returns nothing. It is
+called after the clean-up phase.
+It is constructed by stitching together the bodies of each handler.
+\end{itemize}
+All three are created with GCC nested functions. GCC nested functions can be
+used to create closures, functions that can refer to the state of other
+functions on the stack. This allows the functions to refer to the main
+function and all the variables in scope.
+
+These nested functions and all other functions besides
+\codeC{__cfaehm_try_terminate} in \CFA use the GCC personality function and
+the \texttt{-fexceptions} flag to generate the LSDA. This allows destructors
+to be implemented with the cleanup attribute.
+
+\section{Resumption}
+% The stack-local data, the linked list of nodes.
+
+Resumption uses a list of nodes for its stack traversal. The head of the list
+is stored in the exception context. The nodes in the list just have a pointer
+to the next node and a pointer to the handler function.
+
+On a resumption throw this list is traversed. At each node the
+handler function is called and is passed the exception by pointer. It returns
+true if the exception was handled and false otherwise.
+
+The handler function does both the matching and catching. It tries each
+the condition of \codeCFA{catchResume} in order, top-to-bottom and until it
+finds a handler that matches. If no handler matches then the function returns
+false. Otherwise the matching handler is run, if it completes successfully
+the function returns true. Rethrows, through the \codeCFA{throwResume;}
+statement, cause the function to return true.
+
+\subsection{Libunwind Compatibility}
+Resumption does not use libunwind for two simple reasons. The first is that
+it does not have to unwind anything so would never need to use the clean-up
+phase. Still the search phase could be used to make it free to enter or exit
+a try statement with resumption handlers in the same way termination handlers
+are for the same trade off in the cost of the throw. This is where the second
+reason comes in, there is no way to return from a search without installing
+a handler or raising an error.
+
+Although work arounds could be created none seemed to be worth it for the
+prototype. This implementation has no difference in behaviour and is much
+simpler.
+% Seriously, just compare the size of the two chapters and then consider
+% that unwind is required knowledge for that chapter.
+
+\section{Finally}
+% Uses destructors and GCC nested functions.
+Finally clauses are a simple decomposition to some of the existing features.
+The code in the block is placed into a GCC nested function with a unique name,
+no arguments or return values. This nested function is then set as the
+clean-up function of an empty object that is declared at the beginning of a
+block placed around the contexts of the try statement.
+
+The rest is handled by GCC. The try block and all handlers are inside the
+block. When they are complete control exits the block and the empty object
+is cleaned up, which runs the function that contains the finally code.
+
+\section{Cancellation}
+% Stack selections, the three internal unwind functions.
+
+Cancellation also uses libunwind to do its stack traversal and unwinding,
+however it uses a different primary function \codeC{_Unwind_ForcedUnwind}.
+Details of its interface can be found in the unwind section.
+
+The first step of cancellation is to find the stack that was cancelled and which
+type of stack it is. Luckily the threads library stores the main thread
+pointer and the current thread pointer and every thread stores a pointer to
+its main coroutine and the coroutine it is currently executing.
+
+So if the current thread's main and current coroutine do not match, it is
+a coroutine cancellation. Otherwise if the main and current thread do not
+match, it is a thread cancellation. Otherwise it is a main thread
+cancellation.
+
+However if the threading library is not linked then execution must be on the
+main stack as that is the only one that exists. So the entire check is skipped
+using the linker and weak symbols. Instead the main thread cancellation is
+unconditionally performed.
+
+Regardless of how they are chosen, afterwards the stop function and the stop
+parameter are passed to the forced unwind function. The general pattern of all
+three stop functions is the same, they continue unwinding until the end of
+stack when they do their primary work.
+
+Main stack cancellation is very simple. The ``transfer'' is just an abort,
+the program stops executing.
+
+The coroutine cancellation stores the exception on the coroutine and then
+does a coroutine context switch. The rest is handled inside resume. Every time
+control returns from a resumed coroutine there is a check to see if it is
+cancelled. If it is the exception is retrieved and the CoroutineCancelled
+exception is constructed and loaded. It is then thrown as a regular exception
+with the default handler coming from the context of the resumption call.
+
+The thread cancellation stores the exception on the thread's main stack and
+then returns to the scheduler. The rest is handled by the joiner. The wait
+for the joined thread to finish works the same but after that it checks
+to see if there was a cancellation. If there was the exception is retrieved
+and the ThreadCancelled exception is constructed. The default handler is
+passed in as a function pointer. If it is null (as it is for the
+auto-generated joins on destructor call) it a default is used that simply
+calls abort, which gives the required handling on an implicit join.
Index: doc/theses/fangren_yu_COOP_F20/Makefile
===================================================================
--- doc/theses/fangren_yu_COOP_F20/Makefile	(revision a00bc5b670e302caf7b7a128c32475c702fd7890)
+++ doc/theses/fangren_yu_COOP_F20/Makefile	(revision a00bc5b670e302caf7b7a128c32475c702fd7890)
@@ -0,0 +1,87 @@
+## Define the configuration variables.
+
+Build = build
+Figures = figures
+Macros = ../../LaTeXmacros
+TeXLIB = .:${Macros}:${Build}:
+LaTeX  = TEXINPUTS=${TeXLIB} && export TEXINPUTS && latex -halt-on-error -output-directory=${Build}
+BibTeX = BIBINPUTS=../../bibliography: && export BIBINPUTS && bibtex
+
+MAKEFLAGS = --no-print-directory --silent #
+VPATH = ${Build} ${Figures}
+
+## Define the text source files.
+
+SOURCES = ${addsuffix .tex, \
+Report \
+}
+
+FIGURES = ${addsuffix .tex, \
+}
+
+PICTURES = ${addsuffix .pstex, \
+}
+
+PROGRAMS = ${addsuffix .tex, \
+}
+
+GRAPHS = ${addsuffix .tex, \
+}
+
+## Define the documents that need to be made.
+
+DOCUMENT = Report.pdf
+BASE = ${basename ${DOCUMENT}}
+
+# Directives #
+
+.PHONY : all clean					# not file names
+
+all : ${DOCUMENT}
+
+clean :
+	@rm -frv ${DOCUMENT} ${BASE}.ps ${Build}
+
+# File Dependencies #
+
+${DOCUMENT} : ${BASE}.ps
+	ps2pdf $<
+
+${BASE}.ps : ${BASE}.dvi
+	dvips ${Build}/$< -o $@
+
+${BASE}.dvi : Makefile ${GRAPHS} ${PROGRAMS} ${PICTURES} ${FIGURES} ${SOURCES} \
+		${Macros}/common.tex ${Macros}/lstlang.sty ${Macros}/indexstyle ../../bibliography/pl.bib | ${Build}
+	# Conditionally create an empty *.ind (index) file for inclusion until makeindex is run.
+	if [ ! -r ${basename $@}.ind ] ; then touch ${Build}/${basename $@}.ind ; fi
+	# Must have *.aux file containing citations for bibtex
+	if [ ! -r ${basename $@}.aux ] ; then ${LaTeX} ${basename $@}.tex ; fi
+	-${BibTeX} ${Build}/${basename $@}
+	# Some citations reference others so run again to resolve these citations
+	${LaTeX} ${basename $@}.tex
+	-${BibTeX} ${Build}/${basename $@}
+	# Make index from *.aux entries and input index at end of document
+	makeindex -s ${Macros}/indexstyle ${Build}/${basename $@}.idx
+	# Run again to finish citations
+	${LaTeX} ${basename $@}.tex
+	# Run again to get index title into table of contents
+	${LaTeX} ${basename $@}.tex
+
+## Define the default recipes.
+
+${Build} :
+	mkdir -p ${Build}
+
+%.tex : %.fig | ${Build}
+	fig2dev -L eepic $< > ${Build}/$@
+
+%.ps : %.fig | ${Build}
+	fig2dev -L ps $< > ${Build}/$@
+
+%.pstex : %.fig | ${Build}
+	fig2dev -L pstex $< > ${Build}/$@
+	fig2dev -L pstex_t -p ${Build}/$@ $< > ${Build}/$@_t
+
+# Local Variables: #
+# compile-command: "make" #
+# End: #
Index: doc/theses/fangren_yu_COOP_F20/Report.tex
===================================================================
--- doc/theses/fangren_yu_COOP_F20/Report.tex	(revision a00bc5b670e302caf7b7a128c32475c702fd7890)
+++ doc/theses/fangren_yu_COOP_F20/Report.tex	(revision a00bc5b670e302caf7b7a128c32475c702fd7890)
@@ -0,0 +1,525 @@
+\documentclass[twoside,12pt]{article}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+% Latex packages used in the document.
+\usepackage{fullpage,times,comment}
+\usepackage{epic,eepic}
+\usepackage{upquote}									% switch curled `'" to straight
+\usepackage{calc}
+\usepackage{varioref}									% extended references
+\usepackage[labelformat=simple,aboveskip=0pt,farskip=0pt]{subfig}
+\renewcommand{\thesubfigure}{\alph{subfigure})}
+\usepackage[flushmargin]{footmisc}						% support label/reference in footnote
+\usepackage{latexsym}                                   % \Box glyph
+\usepackage{mathptmx}                                   % better math font with "times"
+\usepackage[toc]{appendix}								% article does not have appendix
+\usepackage[usenames]{color}
+\input{common}                                          % common CFA document macros
+\usepackage[dvips,plainpages=false,pdfpagelabels,pdfpagemode=UseNone,colorlinks=true,pagebackref=true,linkcolor=blue,citecolor=blue,urlcolor=blue,pagebackref=true,breaklinks=true]{hyperref}
+\usepackage{breakurl}
+\urlstyle{sf}
+
+
+% reduce spacing
+\setlist[itemize]{topsep=5pt,parsep=0pt}% global
+\setlist[enumerate]{topsep=5pt,parsep=0pt}% global
+
+\usepackage[pagewise]{lineno}
+\renewcommand{\linenumberfont}{\scriptsize\sffamily}
+
+% Default underscore is too low and wide. Cannot use lstlisting "literate" as replacing underscore
+% removes it as a variable-name character so keywords in variables are highlighted. MUST APPEAR
+% AFTER HYPERREF.
+\renewcommand{\textunderscore}{\leavevmode\makebox[1.2ex][c]{\rule{1ex}{0.075ex}}}
+\newcommand{\NOTE}{\textbf{NOTE}}
+\newcommand{\TODO}[1]{{\color{Purple}#1}}
+
+\setlength{\topmargin}{-0.45in}							% move running title into header
+\setlength{\headsep}{0.25in}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\CFAStyle												% CFA code-style for all languages
+%\lstset{
+%language=C++,moredelim=**[is][\color{red}]{@}{@}		% make C++ the default language
+%}% lstset
+%\lstnewenvironment{C++}[1][]                            % use C++ style
+%{\lstset{language=C++,moredelim=**[is][\color{red}]{@}{@}}\lstset{#1}}{}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\setcounter{secnumdepth}{3}                             % number subsubsections
+\setcounter{tocdepth}{3}                                % subsubsections in table of contents
+\makeindex
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\title{\LARGE
+Optimization of \CFA Compiler with Case Studies
+}% title
+
+\author{\LARGE
+Fangren Yu -- University of Waterloo
+}% author
+
+\date{
+\today
+}% date
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+\begin{document}
+\pagestyle{headings}
+% changed after setting pagestyle
+\renewcommand{\sectionmark}[1]{\markboth{\thesection\quad #1}{\thesection\quad #1}}
+\renewcommand{\subsectionmark}[1]{\markboth{\thesubsection\quad #1}{\thesubsection\quad #1}}
+\pagenumbering{roman}
+\linenumbers                                            % comment out to turn off line numbering
+
+\maketitle
+\pdfbookmark[1]{Contents}{section}
+\tableofcontents
+
+\clearpage
+\thispagestyle{plain}
+\pagenumbering{arabic}
+
+\begin{abstract}
+\end{abstract}
+
+\section{Introduction}
+
+\section{Completed work}
+
+\subsection{Memory model with sharing}
+
+A major rework of the abstract syntax tree (AST) data structure in the compiler is completed as the first step of the project. The majority of the work is documented in the reference manual of the compiler~\cite{cfa-cc}. To summarize:
+\begin{itemize}
+\item
+AST nodes (and therefore subtrees) can be shared without copying when reused.
+\item
+Modifications apply the functional programming principle, making copies for local changes without affecting the original data shared by other owners. In-place mutations are permitted as a special case when sharing does not happen. The logic is implemented by reference counting.
+\item
+Memory allocation and freeing are performed automatically using smart pointers.
+\end{itemize}
+The resolver algorithm designed for overload resolution naturally introduces a significant amount of reused intermediate representations, especially in the following two places:
+\begin{itemize}
+\item
+Function overload candidates are computed by combining the argument candidates bottom-up, with many of them being a common term. For example, if $n$ overloads of a function @f@ all take an integer for the first parameter but different types for the second (@f( int, int )@, @f( int, double )@, etc.) the first term is reused $n$ times for each of the generated candidate expressions. This effect is particularly bad for deep expression trees. 
+\item
+In the unification algorithm and candidate elimination step, actual types are obtained by substituting the type parameters by their bindings. Let $n$ be the complexity (\ie number of nodes in representation) of the original type, $m$ be the complexity of bound type for parameters, and $k$ be the number of occurrences of type parameters in the original type. If everything needs to be deep-copied, the substitution step takes $O(n+mk)$ time and memory, while using shared nodes it is reduced to $O(n)$ time and $O(k)$ memory.
+\end{itemize}
+One of the worst examples for the old compiler is a long chain of I/O operations
+\begin{cfa}
+sout | 1 | 2 | 3 | 4 | ...
+\end{cfa}
+The pipe operator is overloaded by \CFA I/O library for every primitive type in C language, as well as I/O manipulators defined by the library. In total there are around 50 overloads for the output stream operation. On resolving the $n$-th pipe operator in the sequence, the first term, which is the result of sub-expression containing $n-1$ pipe operators, is reused to resolve every overload. Therefore at least $O(n^2)$ copies of expression nodes are made during resolution, not even counting type unification cost; combined with two large factors from number of overloads of pipe operators, and that the ``output stream type'' in \CFA is a trait with 27 assertions (which adds to complexity of the pipe operator's type) this makes compiling a long output sequence extremely slow. In new AST representation only $O(n)$ copies are required and type of pipe operator is not copied at all.
+
+Reduction in space complexity is especially important, as preliminary profiling results on the old compiler build show that over half of the time spent in expression resolution is on memory allocations. 
+ 
+
+\subsection{Merged resolver calls}
+
+The pre-resolve phase of compilation, inadequately called ``validate'' in the compiler source code, does more than just simple syntax validation, as it also normalizes the input program. Some of these normalizations, however, require type information on expressions and therefore need to call the resolver before the general resolve phase. There are three notable places where the resolver is invoked:
+\begin{itemize}
+\item
+Attempt to generate default constructor, copy constructor and destructor for user-defined @struct@ types
+\item
+Resolve @with@ statements (the same as in Python, which introduces fields of a structure directly in scope)
+\item
+Resolve @typeof@ expressions (cf. @decltype@ in \CC); note that this step may depend on symbols introduced by @with@ statements.
+\end{itemize}
+Since the compiler codebase is large and the new memory model mostly only benefits expression resolution, the old data structure is still kept, and a conversion pass happens before and after resolve phase. Rewriting every compiler module will take a long time, and whether the new model is correct is still unknown when started, therefore only the resolver is implemented with the new data structure. 
+
+Since the constructor calls were among the most expensive to resolve (the reason will be shown in the next section), the pre-resolve phase was taking more time after the resolver moved to the more efficient new implementation. To better facilitate the new resolver, every step that requires type information is reintegrated as part of the resolver.
+
+A by-product of this work is that the reversed dependence of @with@ statement and @typeof@ can now be handled. Previously, the compiler is unable to handle cases such as
+\begin{cfa}
+struct S { int x; };
+S foo();
+typeof( foo() ) s; // type is S
+with (s) { 
+	x; // refers to s.x
+}
+\end{cfa}
+since type of @s@ is still unresolved when handling @with@ expressions. Instead, the new (and correct) approach is to evaluate @typeof@ expressions when the declaration is first seen, and it suffices because of the declaration-before-use rule.
+
+
+\subsection{Special function lookup}
+
+Reducing the number of functions looked up for overload resolution is an effective way to gain performance when there are many overloads but most of them are trivially wrong. In practice, most functions have few (if any) overloads but there are notable exceptions. Most importantly, constructor @?{}@, destructor @^?{}@, and assignment @?=?@ are generated for every user-defined type, and in a large source file there can be hundreds of them. Furthermore, many calls to them are generated for initializing variables and passing arguments. This fact makes them the most overloaded and most called functions.
+
+In an object-oriented programming language, an object has methods declared with its type, so a call such as @obj.f()@ only needs to perform lookup in the method table corresponding to the type of @obj@. \CFA on the other hand, does not have methods, and all types are open (\ie new operations can be defined on them), so a similar approach will not work in general. However, the ``big 3'' operators have a unique property enforced by the language rules, such that the first parameter must have a reference type. Since \CFA does not have class inheritance, the reference type must always match exactly. Therefore, argument-dependent lookup can be implemented for these operators, by using a dedicated symbol table.
+
+The lookup key used for the special functions is the mangled type name of the first parameter, which acts as the @this@ parameter in an object-oriented language. To handle generic types, the type parameters are stripped off, and only the base type is matched. Note that a constructor (destructor, assignment operator) taking arbitrary @this@ argument, for example @forall( dtype T ) void ?{}( T & );@ is not allowed, and it guarantees that if the @this@ type is known, all possible overloads can be found by searching with the given type. In case that the @this@ argument itself is overloaded, it is resolved first and all possible result types are used for lookup.
+
+Note that for the generated expressions, the particular variable for @this@ argument is fully known, without overloads, so the majority of constructor call resolutions only need to check for one given object type. Explicit constructor calls and assignment statements sometimes may require lookup for multiple types. In the extremely rare case that type of @this@ argument is yet unbound, everything will have to be checked, just like without the argument-dependent lookup algorithm; fortunately, this case almost never happens in practice. An example is found in the library function @new@:
+\begin{cfa}
+forall( dtype T | sized( T ), ttype TT | { void ?{}( T &, TT ); } )
+T * new( TT p ) { return &(*malloc()){ p }; }
+\end{cfa}
+as @malloc@ may return a pointer to any type, depending on context. 
+
+Interestingly, this particular line of code actually caused another complicated issue, where the unusually massive work of checking every constructor in presence makes the case even worse. Section~\ref{s:TtypeResolutionInfiniteRecursion} presents a detailed analysis for the problem.
+
+The ``callable'' operator @?()@ (cf. @operator()@ in \CC) could also be included in the special operator list, as it is usually only on user-defined types, and the restriction that first argument must be a reference seems reasonable in this case. 
+
+
+\subsection{Improvement of function type representation}
+
+Since substituting type parameters with their bound types is one fundamental operation in many parts of resolver algorithm (particularly unification and environment binding), making as few copies of type nodes as possible helps reducing memory complexity. Even with the new memory management model, allocation is still a significant factor of resolver performance. Conceptually, operations on type nodes of AST should be performed in functional programming style, treating the data structure as immutable and only copy when necessary. The in-place mutation is a mere optimization that does not change logic of operations.
+The model was broken on function types by an inappropriate design. Function types require some special treatment due to the existence of assertions. In particular, it must be able to distinguish two different kinds of type parameter usage:
+\begin{cfa}
+forall( dtype T ) void foo( T * t ) {
+	forall( dtype U ) void bar( T * t, U * u ) { ... }
+}
+\end{cfa}
+Here, only @U@ is a free parameter in declaration of @bar@, as it appears in the function's own forall clause; while @T@ is not free. 
+
+Moreover, the resolution algorithm also has to distinguish type bindings of multiple calls to the same function, for example with
+\begin{cfa}
+forall( dtype T ) int foo( T x );
+foo( foo( 1.0 ) );
+\end{cfa}
+The inner call has binding (T: double) while the outer call has binding (T: int). Therefore a unique representation of free parameters in each expression is required. This was previously done by creating a copy of the parameter declarations inside function type, and fixing references afterwards. However, fixing references is an inherently deep operation that does not work well with functional programming model, as it must be evaluated eagerly on the entire syntax tree representing the function type.
+
+The revised approach generates a unique ID value for each function call expression instance and represents an occurrence of free parameter type with a pair of generated ID and the original parameter declaration, so that references do not need to be fixed, and a shallow copy of function type is possible. 
+
+Note that after the change, all declaration nodes in syntax tree representation maps one-to-one with the actual declarations in the program, and therefore are guaranteed to be unique. Such property can potentially enable more optimizations, and some related ideas are presented after Section~\ref{s:SharedSub-ExpressionCaseUniqueExpressions}.
+
+
+\subsection{Improvement of pruning steps}
+
+A minor improvement for candidate elimination is to skip the step on the function overloads themselves and only perform it on the results of function application. As function calls are usually by name, the name resolution rule dictates that every function candidate necessarily has a different type; indirect function calls are rare, and when they do appear, they usually will not have many possible interpretations, and those rarely match exactly in argument type. Since function types have a much more complex representation than data types (with multiple parameters and assertions), checking equality on them also takes longer. 
+
+A brief test of this approach shows that the number of function overloads considered in expression resolution increases by a negligible amount of less than 1 percent, while type comparisons in candidate elimination are cut by more than half. Improvement is consistent over all \CFA source files in the test suite.
+
+
+\subsection{Shared sub-expression case: Unique expressions}
+\label{s:SharedSub-ExpressionCaseUniqueExpressions}
+
+Unique expression denotes an expression that must be evaluated only once, to prevent unwanted side effects. It is currently only a compiler artifact, generated on tuple member expression of the form
+\begin{cfa}
+struct S { int a; int b; };
+S s;
+s.[a, b]; // tuple member expression, type is [int, int]
+\end{cfa}
+If the aggregate expression contains function calls, it cannot be evaluated multiple times:
+\begin{cfa}
+S makeS();
+makeS().[a, b]; // this should only make one S
+\end{cfa}
+Before code generation, the above expression is internally represented as
+\begin{cfa}
+[uniqueExpr{makeS()}.a, uniqueExpr{makeS()}.b];
+\end{cfa}
+and the unique expression is expanded to\footnote{This uses gcc's statement expression syntax, where \lstinline|(\{block\})| evaluates to the value of the last expression in the block.}
+\begin{cfa}
+({
+	if ( !_unique_var_evaluated ) {
+		_unique_var_evaluated = true;
+		_unique_var = makeS();
+	}
+	_unique_var;
+})
+\end{cfa}
+at code generation, where @_unique_var@ and @_unique_var_evaluated@ are generated variables whose scope covers all appearances of the same expression.
+
+Note that although the unique expression is only used for tuple expansion now, it is a generally useful construction, and can be seen in other languages, such as Scala's @lazy val@~\cite{Scala}; therefore it could be worthwhile to introduce the unique expression to a broader context in \CFA and even make it directly available to programmers.
+
+In the compiler's visitor pattern, however, this creates a problem where multiple paths to a logically unique expression exist, so it may be modified more than once and become ill-formed; some specific intervention is required to ensure that unique expressions are only visited once. Furthermore, a unique expression appearing in more than one place will be copied on mutation so its representation is no longer unique. Some hacks are required to keep it in sync, and the methods are different when mutating the unique expression instance itself or its underlying expression.
+
+Example when mutating the underlying expression (visit-once guard)
+\begin{cfa}
+void InsertImplicitCalls::previsit( const ast::UniqueExpr * unqExpr ) {
+	if ( visitedIds.count( unqExpr->id ) ) visit_children = false;
+	else visitedIds.insert( unqExpr->id );
+}
+\end{cfa}
+Example when mutating the unique instance itself, which actually creates copies
+\begin{cfa}
+auto mutExpr = mutate( unqExpr ); // internally calls copy when shared
+if ( ! unqMap.count( unqExpr->id ) ) {
+	...
+} else {
+	mutExpr->expr = unqMap[mutExpr->id]->expr;
+	mutExpr->result = mutExpr->expr->result;
+}
+\end{cfa}
+Such a workaround seems difficult to fit into a common visitor template. This suggests the memory model may need different kinds of nodes to accurately represent the syntax tree.
+
+Together with the fact that declaration nodes are always unique, it is possible that AST nodes can be classified by three different types:
+\begin{itemize}
+\item
+\textbf{Strictly unique} with only one owner (declarations);
+\item
+\textbf{Logically unique} with (possibly) many owners but should not be copied (unique expression example presented here);
+\item
+\textbf{Shared} by the functional programming model, which assumes immutable data structures that are copied on mutation.
+\end{itemize}
+The boilerplate code can potentially handle these three cases differently.
+
+
+\section{Analysis of resolver algorithm complexity}
+
+The focus of this chapter is to identify and analyze some realistic cases that cause resolver algorithm to have an exponential run time. As previous work has shown [3], the overload resolution problem in \CFA has worst-case exponential complexity; however, only few specific patterns can trigger the exponential complexity in practice. Implementing heuristic-based optimization for those selected cases is helpful to alleviate the problem.
+
+
+\subsection{Unbound return type}
+\label{s:UnboundReturnType}
+
+The interaction of return type overloading and polymorphic functions creates this problem of function calls with unbound return type, and is further complicated by the presence of assertions.
+The prime example of a function with unbound return type is the type-safe version of C @malloc@:
+\begin{cfa}
+// size deduced from type, so no need to provide the size argument
+forall( dtype T | sized( T ) ) T * malloc( void ); 
+\end{cfa}
+Unbound return type can be problematic in resolver algorithm complexity because a single match of function call with unbound return type may create multiple candidates. In the worst case, consider a function declared to return any @otype@:
+\begin{cfa}
+forall( otype T ) T anyObj( void );
+\end{cfa}
+As the resolver attempts to satisfy the otype constraint on @T@, a single call to @anyObj()@ without the result type known creates at least as many candidates as the number of complete types currently in scope; with generic types it becomes even worse, for example, assuming a declaration of generic pair is available at that point:
+\begin{cfa}
+forall( otype T, otype U ) struct pair { T first; U second; };
+\end{cfa}
+Then an @anyObj()@ call can result in arbitrarily complex types, such as @pair( pair( int,int ), pair( int,int ) )@, and the depth can grow indefinitely until the specified parameter depth limit, thus creating exponentially many candidates. However, the expected types allowed by parent expressions are practically very few, so most of those interpretations are invalid; if the result type is never bound up to top level, by the semantic rules it is ambiguous if there are more than one valid bindings, and resolution can fail fast. It is therefore reasonable to delay resolving assertions on an unbound parameter in return type; however, with the current cost model, such behavior may further cause irregularities in candidate selection, such that the presence of assertions can change the preferred candidate, even when order of expression costs are supposed to stay the same. Detailed analysis of this issue will be presented later, in the correctness part.
+
+
+\subsection[ttype resolution infinite recursion]{\lstinline|ttype| resolution infinite recursion}
+\label{s:TtypeResolutionInfiniteRecursion}
+
+@ttype@ (``tuple type'') is a relatively new addition to the language that attempts to provide type-safe variadic argument semantics. Unlike regular @dtype@ parameters, @ttype@ is only valid in function parameter list, and may only appear once as the type of last parameter. At the call site, a @ttype@ parameter is bound to the tuple type of all remaining function call arguments.
+
+There are two kinds of idiomatic @ttype@ usage: one is to provide flexible argument forwarding, similar to the variadic template in \CC (\lstinline[language=C++]|template<typename... args>|), as shown below in the implementation of @unique_ptr@
+\begin{cfa}
+forall( dtype T )
+struct unique_ptr {
+	T * data;
+};
+forall( dtype T | sized( T ), ttype Args | { void ?{}( T &, Args ); })
+void ?{}( unique_ptr( T ) & this, Args args ) {
+	this.data = new( args );
+}
+\end{cfa}
+the other is to implement structural recursion in the first-rest manner:
+\begin{cfa}
+forall( otype T, ttype Params | { void process( T ); void func( Params ); })
+void func( T arg1, Params p ) {
+	process( arg1 );
+	func( p );
+}
+\end{cfa}
+For the second use case, it is important that the number of parameters in the recursive call go down, since the call site must deduce all assertion candidates, and that is only possible if by just looking at argument types (and not their values), the recursion is known to be completed in a finite number of steps.
+
+In recent experiments, however, a flaw in the type binding rules can lead the first kind of @ttype@ use case to produce an invalid candidate, such that the resolver enters an infinite loop.
+
+This bug was discovered in an attempt to raise the assertion recursion depth limit, when one of the library programs took exponentially longer to compile. The cause of the problem was identified to be the following set of functions.
+File @memory.cfa@ contains
+\begin{cfa}
+#include "memory.hfa"
+#include "stdlib.hfa"
+\end{cfa}
+where file @memory.hfa@ contains the @unique_ptr@ declaration above, and two other similar functions with @ttype@ parameter:
+\begin{cfa}
+forall( dtype T | sized( T ), ttype Args | { void ?{}( T &, Args ); }) {
+	void ?{}( counter_data( T ) & this, Args args );
+	void ?{}( counter_ptr( T ) & this, Args args );
+	void ?{}( unique_ptr( T ) & this, Args args );
+}
+\end{cfa}
+File @stdlib.hfa@ contains
+\begin{cfa}
+forall( dtype T | sized( T ), ttype TT | { void ?{}( T &, TT ); } )
+T * new( TT p ) { return &(*malloc()){ p }; }
+\end{cfa}
+
+In the expression @(*malloc()){p}@, the type of object being constructed is yet unknown, since the return type information is not immediately provided. That caused every constructor to be searched, and while normally a bound @ttype@ cannot be unified with any free parameter, it is possible with another free @ttype@. Therefore in addition to the correct option provided by assertion, 3 wrong options are examined, each of which again requires the same assertion, for an unknown base type T and @ttype@ arguments, and that becomes an infinite loop, until the specified recursion limit and resolution is forced to fail. Moreover, during the recursion steps, number of candidates grows exponentially, since there are always 3 options at each step.
+
+Unfortunately, @ttype@ to @ttype@ binding is necessary, to allow calling the function provided by assertion indirectly.
+\begin{cfa}
+forall( dtype T | sized( T ), ttype Args | { void ?{}( T &, Args ); })
+void ?{}( unique_ptr( T ) & this, Args args ) { this.data = (T * )new( args ); }
+\end{cfa}
+Here the constructor assertion is used for the @new( args )@ call.
+Therefore, it is hard, perhaps impossible, to solve this problem by tweaking the type binding rules. An assertion caching algorithm can help improve this case by detecting cycles in recursion.
+
+Meanwhile, without the caching algorithm implemented, some changes in the \CFA source code are enough to eliminate this problem, at least in the current codebase. Note that the issue only happens with an overloaded variadic function, which rarely appears in practice, since the idiomatic use cases are for argument forwarding and self-recursion. The only overloaded @ttype@ function so far discovered in all of \CFA standard library code is the constructor, and by utilizing the argument-dependent lookup process described in Section~\ref{s:UnboundReturnType}, adding a cast before constructor call gets rid of the issue. 
+\begin{cfa}
+T * new( TT p ) { return &(*(T * )malloc()){ p }; }
+\end{cfa}
+
+
+\subsection{Reused assertions in nested generic type}
+
+The following test of deeply nested dynamic generic type reveals that locally caching reused assertions is necessary, rather than just a resolver optimization, because recomputing assertions can result in bloated generated code size:
+\begin{cfa}
+struct nil {};
+forall( otype L, otype R )
+struct cons { L l; R r; };
+
+int main() {
+	#if   N==0
+	nil x;   
+	#elif N==1
+	cons( size_t, nil ) x; 
+	#elif N==2
+	cons( size_t, cons( size_t, nil ) ) x;
+	#elif N==3
+	cons( size_t, cons( size_t, cons( size_t, nil ) ) ) x;
+	// similarly for N=4,5,6
+	#endif
+}
+\end{cfa}
+At the declaration of @x@, it is implicitly initialized by generated constructor call, whose signature is given by
+\begin{cfa}
+forall( otype L, otype R ) void ?{}( cons( L, R ) & );
+\end{cfa}
+Note that the @otype@ constraint contains 4 assertions:
+\begin{cfa}
+void ?{}( L & ); // default constructor
+void ?{}( L &, L & ); // copy constructor
+void ^?{}( L & ); // destructor
+L & ?=?( L &, L & ); // assignment
+\end{cfa}
+Now since the right hand side of outermost cons is again a cons, recursive assertions are required. When the compiler cannot cache and reuse already resolved assertions, it becomes a problem, as each of those 4 pending assertions again asks for 4 more assertions one level below. Without any caching, number of resolved assertions grows exponentially, while that is obviously unnecessary since there are only $n+1$ different types involved. Even worse, this causes exponentially many wrapper functions generated later at the codegen step, and results in huge compiled binary.
+
+\begin{table}[h]
+\caption{Compilation results of nested cons test}
+\begin{tabular}{|r|r|r|}
+\hline
+N	& Assertions resolved	& Binary size (KB) \\
+\hline
+3	& 170		& 70	\\
+\hline
+4	& 682		& 219	\\
+\hline
+5	& 2,730		& 829	\\
+\hline
+6	& 10,922	& 3,261	\\
+\hline
+\end{tabular}
+\end{table}
+
+As the local functions are implemented by emitting executable code on the stack~\cite{gcc-nested-func}, it eventually means that compiled code also has exponential run time. This problem has evident practical implications, as nested collection types are frequently used in real production code.
+
+
+\section{Analysis of type system correctness}
+
+In Moss' thesis~\cite[\S~4.1.2,~p.~45]{Moss19}, the author presents the following example:
+\begin{quote}
+The cost model of \CFA precludes a greedy bottom-up resolution pass, as constraints and costs introduced by calls higher in the expression tree can change the interpretation of those lower in the tree, as in the following example:
+\begin{cfa}
+void f( int );
+double g$_1$( int );
+int g$_2$( long );
+f( g( 42 ) );
+\end{cfa}
+Considered independently, @g$_1$( 42 )@ is the cheapest interpretation of @g( 42 )@, with cost (0, 0, 0, 0, 0, 0, 0) since the argument type is an exact match. However, in context, an unsafe conversion is required to downcast the return type of @g1@ to an @int@ suitable for @f@, for a total cost of (1, 0, 0, 0, 0, 0, 0) for @f( g1( 42 ) )@. If @g2@ is chosen, on the other hand, there is a safe upcast from the int type of 42 to @long@, but no cast on the return of @g2@, for a total cost of (0, 0, 1, 0, 0, 0, 0) for @f( g2( 42 ) );@ as this is cheaper, @g2@ is chosen. Due to this design, all valid interpretations of subexpressions must in general be propagated to the top ...
+\end{quote}
+The cost model in the thesis sums up all costs in resolving sub-expressions and chooses the global optimal option. However, the current implementation in @cfa-cc@, based on the original Bilson model~\cite[\S~2.2.4,~p.~32]{Bilson03}, does eager resolution and only allows local optimal candidate selections to propagate upwards:
+\begin{quote}
+From the set of candidates whose parameter and argument types have been unified and whose assertions have been satisfied, those whose sub-expression interpretations have the smallest total cost of conversion are selected ... The total cost of conversion for each of these candidates is then calculated based on the implicit conversions and polymorphism involved in adapting the types of the sub-expression interpretations to the formal parameter types.
+\end{quote}
+With this model, the algorithm picks @g1@ in resolving the @f( g( 42 ) )@ call, which seems to be undesirable. 
+
+There is further evidence showing that the Bilson model is fundamentally incorrect, following the discussion of unbound return type in Section~\ref{s:UnboundReturnType}. By the conversion cost specification, a binding from a polymorphic type parameter to a concrete type incurs a polymorphic cost of 1. It remains unspecified \emph{when} the type parameters should become bound. When the parameterized types appear in the function parameters, they can be deduced from the argument type, and there is no ambiguity. In the unbound return case, however, the binding may happen at any stage in expression resolution, therefore it is impossible to define a unique local conversion cost. Note that type binding happens exactly once per parameter in resolving the entire expression, so the global binding cost is unambiguously 1. 
+
+The current compiler implementation has a notable inconsistency in handling such cases. Any unbound parameter that does \emph{not} come with an associated assertion remains unbound to the parent expression; those that do, however, are immediately bound in the assertion resolution step, and the concrete result types are used in the parent expressions. 
+
+Consider the following example:
+\begin{cfa}
+forall( dtype T ) T * f( void );
+void h( int * );
+\end{cfa}
+The expression @h( f() )@ eventually has a total cost of 1 from binding (T: int), but in the eager resolution model, the cost of 1 may occur either at call to @f@ or at call to @h@, and with the assertion resolution triggering a binding, the local cost of @f()@ is (0 poly, 0 spec) with no assertions, but (1 poly, -1 spec) with an assertion:
+\begin{cfa}
+forall( dtype T | { void g( T * ); } ) T * f( void );
+void g( int * );
+void h( int * );
+\end{cfa}
+and that contradicts the principle that adding assertions should make expression cost lower. Furthermore, the time at which type binding and assertion resolution happens is an implementation detail of the compiler, but not a part of language definition. That means two compliant \CFA compilers, one performing immediate assertion resolution at each step, and one delaying assertion resolution on unbound types, can produce different expression costs and therefore different candidate selection, making the language rule itself partially undefined and therefore unsound. By the above reasoning, the updated cost model using global sum of costs should be accepted as the standard. It also allows the compiler to freely choose when to resolve assertions, as the sum of total costs is independent of that choice; more optimizations regarding assertion resolution can also be implemented.
+
+
+\section{Timing results}
+
+For the timing results presented here, the \CFA compiler is built with gcc 9.3.0 and tested on a server machine running Ubuntu 20.04, with 64 GB RAM and a 32-core 2.2 GHz CPU; results are reported by the time command, using only 8 cores in parallel such that the time is close to the case with 100\% CPU utilization on a single thread.
+
+On the most recent build, the \CFA standard library ($\sim$1.3 MB of source code) compiles in 4 minutes 47 seconds total processor time (single thread equivalent), with the slowest file taking 13 seconds. The test suite (178 test cases, $\sim$2.2 MB of source code) completes within 25 minutes total processor time,\footnote{Including a few runtime tests; total time spent in compilation is approximately 21 minutes.} with the slowest file taking 23 seconds. In contrast, the library build on the old compiler takes 85 minutes total, with 5 minutes for the slowest file. The full test suite takes too long with the old compiler build and is therefore not run, but the slowest test cases take approximately 5 minutes. Overall, the most recent build, compared to the old build from April 2020, before the project started, is consistently faster by a factor of 20.
+
+Additionally, 6 selected \CFA source files with distinct features from library and test suite are used to test compiler performance after each of the optimizations are implemented. Test files are from the most recent build and run through C preprocessor to eliminate the factor of header file changes. The selected tests are:
+\begin{itemize}
+\item
+@lib/fstream@ (112 KB)\footnote{File sizes are after preprocessing, with no line information (\lstinline|gcc -E -P|).}: implementation of I/O library
+\item
+@lib/mutex@ (166 KB): implementation of concurrency primitive
+\item
+@lib/vector@ (43 KB): container example, similar to \CC vector
+\item
+@lib/stdlib@ (64 KB): type-safe wrapper to @void *@-based C standard library functions
+\item
+@test/io2@ (55 KB): application of I/O library
+\item
+@test/thread@ (188 KB): application of threading library
+\end{itemize}
+
+The \CFA compiler builds are picked from git commit history that passed the test suite, and implement the optimizations incrementally:
+\begin{itemize}
+\item
+\#0 is the first working build of new AST data structure
+\item
+\#1 implements special symbol table and argument-dependent lookup
+\item
+\#2 implements late assertion satisfaction
+\item
+\#3 implements revised function type representation
+\item
+\#4 skips pruning on expressions with function type (most recent build)
+\end{itemize}
+The old resolver with no memory sharing and none of the optimizations above is also tested.
+\begin{table}
+\caption{Compile time of selected files by compiler build, in seconds}
+\begin{tabular}{|l|r|r|r|r|r|r|}
+\hline
+Test case		& old	& \#0	& \#1	& \#2	& \#3	& \#4	\\
+\hline
+@lib/fstream@	& 86.4	& 25.2	& 10.8	& 9.5	& 7.8	& 7.1	\\
+\hline
+@lib/mutex@		& 210.4	& 77.4	& 16.7	& 15.1	& 12.6	& 11.7	\\
+\hline
+@lib/vector@	& 17.2	& 8.9	& 3.1	& 2.8	& 2.4	& 2.2	\\
+\hline
+@lib/stdlib@	& 16.6	& 8.3	& 3.2	& 2.9	& 2.6	& 2.4	\\
+\hline
+@test/io2@		& 300.8	& 53.6	& 43.2	& 27.9	& 19.1	& 16.3	\\
+\hline
+@test/thread@	& 210.9	& 73.5	& 17.0	& 15.1	& 12.6	& 11.8	\\
+\hline
+\end{tabular}
+\smallskip
+\newline
+Results are average of 5 runs (3 runs if time exceeds 100 seconds)
+\end{table}
+
+
+\section{Conclusion}
+
+Over the course of 8 months of active research and development in \CFA type system and compiler algorithm, performance of the reference \CFA compiler, cfa-cc, has been greatly improved, allowing mid-sized \CFA programs to be compiled and built reasonably fast. As there are also ongoing efforts in the team on building a standard library, evaluating the runtime performance, and attempting to incorporate \CFA with existing software written in C, this project is especially meaningful for practical purposes.
+
+The analysis conducted in the project was based significantly on heuristics and practical evidence, as the theoretical bounds and average cases for the expression resolution problem differ. This approach was difficult to follow at the start, with an unacceptably slow compiler, since running the program through debugging and validation tools (\eg @gdb@, @valgrind@) adds another order of magnitude to the run time, which was already in minutes. However, near the end of the project, many significant improvements had already been made and new optimizations could be tested immediately. The positive feedback in the development cycle benefits the \CFA team as a whole, beyond just the compiler optimizations.
+
+Some potential issues of the language that may happen frequently in practice have been identified. Due to the time constraint and complex nature of these problems, a handful of them remain unsolved, but some constructive proposals are made. Notably, introducing a local assertion cache in the resolver is a common solution for a few remaining problems, so that should be the focus of work soon.
+
+The \CFA team are planning on a public alpha release of the language as the compiler performance becomes promising, and other parts of the system, such as a standard library, are also being enhanced. Ideally, the remaining problems should be resolved before release, and the solutions will also be integral to drafting a formal specification. 
+
+\addcontentsline{toc}{section}{\refname}
+\bibliographystyle{plain}
+\bibliography{pl}
+
+\end{document}
+
+% Local Variables: %
+% tab-width: 4 %
+% fill-column: 100 %
+% compile-command: "make" %
+% End: %
Index: doc/theses/fangren_yu_COOP_S20/Report.tex
===================================================================
--- doc/theses/fangren_yu_COOP_S20/Report.tex	(revision 97748eeb1df13abd9185514085cc78246db06552)
+++ doc/theses/fangren_yu_COOP_S20/Report.tex	(revision a00bc5b670e302caf7b7a128c32475c702fd7890)
@@ -56,5 +56,5 @@
 
 \title{\Huge
-cfa-cc Developer's Reference
+\lstinline|cfa-cc| Developer's Reference
 }% title
 
@@ -728,4 +728,5 @@
 The \CFA compiler sets a limit on assertion depth and reports an error if assertion resolution does not terminate within the limit (as for \lstinline[language=C++]@templates@ in \CC).
 
+\addcontentsline{toc}{section}{\refname}
 \bibliographystyle{plain}
 \bibliography{pl}
Index: doc/theses/fangren_yu_COOP_S20/figures/DeepNodeSharing.fig.bak
===================================================================
--- doc/theses/fangren_yu_COOP_S20/figures/DeepNodeSharing.fig.bak	(revision 97748eeb1df13abd9185514085cc78246db06552)
+++ 	(revision )
@@ -1,23 +1,0 @@
-#FIG 3.2  Produced by xfig version 3.2.5c
-Landscape
-Center
-Inches
-Letter  
-100.00
-Single
--2
-1200 2
-1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 1800 1950 150 150 1800 1950 1950 1950
-1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 1800 2550 150 150 1800 2550 1950 2550
-1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 2250 1350 150 150 2250 1350 2400 1350
-1 3 0 1 0 7 50 -1 -1 0.000 1 0.0000 1350 1350 150 150 1350 1350 1500 1350
-2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
-	 1350 1500 1800 1800
-2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
-	 2250 1500 1800 1800
-2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2
-	 1800 2100 1800 2400
-4 1 0 50 -1 0 12 0.0000 2 135 210 1350 1425 P1\001
-4 1 0 50 -1 0 12 0.0000 2 135 210 2250 1425 P2\001
-4 1 0 50 -1 0 12 0.0000 2 135 135 1800 2025 A\001
-4 1 0 50 -1 0 12 0.0000 2 135 135 1800 2625 B\001
Index: libcfa/src/heap.cfa
===================================================================
--- libcfa/src/heap.cfa	(revision 97748eeb1df13abd9185514085cc78246db06552)
+++ libcfa/src/heap.cfa	(revision a00bc5b670e302caf7b7a128c32475c702fd7890)
@@ -10,6 +10,6 @@
 // Created On       : Tue Dec 19 21:58:35 2017
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Wed Dec 16 12:28:25 2020
-// Update Count     : 1023
+// Last Modified On : Sun Jan 10 11:20:49 2021
+// Update Count     : 1031
 //
 
@@ -262,21 +262,21 @@
 #ifdef __STATISTICS__
 // Heap statistics counters.
-static unsigned int malloc_calls;
+static unsigned int malloc_zero_calls, malloc_calls;
 static unsigned long long int malloc_storage;
-static unsigned int aalloc_calls;
+static unsigned int aalloc_zero_calls, aalloc_calls;
 static unsigned long long int aalloc_storage;
-static unsigned int calloc_calls;
+static unsigned int calloc_zero_calls, calloc_calls;
 static unsigned long long int calloc_storage;
-static unsigned int memalign_calls;
+static unsigned int memalign_zero_calls, memalign_calls;
 static unsigned long long int memalign_storage;
-static unsigned int amemalign_calls;
+static unsigned int amemalign_zero_calls, amemalign_calls;
 static unsigned long long int amemalign_storage;
-static unsigned int cmemalign_calls;
+static unsigned int cmemalign_zero_calls, cmemalign_calls;
 static unsigned long long int cmemalign_storage;
-static unsigned int resize_calls;
+static unsigned int resize_zero_calls, resize_calls;
 static unsigned long long int resize_storage;
-static unsigned int realloc_calls;
+static unsigned int realloc_zero_calls, realloc_calls;
 static unsigned long long int realloc_storage;
-static unsigned int free_calls;
+static unsigned int free_zero_calls, free_calls;
 static unsigned long long int free_storage;
 static unsigned int mmap_calls;
@@ -287,5 +287,5 @@
 static unsigned long long int sbrk_storage;
 // Statistics file descriptor (changed by malloc_stats_fd).
-static int stat_fd = STDERR_FILENO;						// default stderr
+static int stats_fd = STDERR_FILENO;					// default stderr
 
 // Use "write" because streams may be shutdown when calls are made.
@@ -293,29 +293,29 @@
 	char helpText[1024];
 	__cfaabi_bits_print_buffer( STDERR_FILENO, helpText, sizeof(helpText),
-									"\nHeap statistics:\n"
-									"  malloc: calls %u / storage %llu\n"
-									"  aalloc: calls %u / storage %llu\n"
-									"  calloc: calls %u / storage %llu\n"
-									"  memalign: calls %u / storage %llu\n"
-									"  amemalign: calls %u / storage %llu\n"
-									"  cmemalign: calls %u / storage %llu\n"
-									"  resize: calls %u / storage %llu\n"
-									"  realloc: calls %u / storage %llu\n"
-									"  free: calls %u / storage %llu\n"
-									"  mmap: calls %u / storage %llu\n"
-									"  munmap: calls %u / storage %llu\n"
-									"  sbrk: calls %u / storage %llu\n",
-									malloc_calls, malloc_storage,
-									aalloc_calls, aalloc_storage,
-									calloc_calls, calloc_storage,
-									memalign_calls, memalign_storage,
-									amemalign_calls, amemalign_storage,
-									cmemalign_calls, cmemalign_storage,
-									resize_calls, resize_storage,
-									realloc_calls, realloc_storage,
-									free_calls, free_storage,
-									mmap_calls, mmap_storage,
-									munmap_calls, munmap_storage,
-									sbrk_calls, sbrk_storage
+								"\nHeap statistics:\n"
+								"  malloc    0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
+								"  aalloc    0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
+								"  calloc    0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
+								"  memalign  0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
+								"  amemalign 0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
+								"  cmemalign 0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
+								"  resize    0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
+								"  realloc   0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
+								"  free      0-calls %'u; >0-calls %'u; storage %'llu bytes\n"
+								"  mmap      calls %'u; storage %'llu bytes\n"
+								"  munmap    calls %'u; storage %'llu bytes\n"
+								"  sbrk      calls %'u; storage %'llu bytes\n",
+								malloc_zero_calls, malloc_calls, malloc_storage,
+								aalloc_zero_calls, aalloc_calls, aalloc_storage,
+								calloc_zero_calls, calloc_calls, calloc_storage,
+								memalign_zero_calls, memalign_calls, memalign_storage,
+								amemalign_zero_calls, amemalign_calls, amemalign_storage,
+								cmemalign_zero_calls, cmemalign_calls, cmemalign_storage,
+								resize_zero_calls, resize_calls, resize_storage,
+								realloc_zero_calls, realloc_calls, realloc_storage,
+								free_zero_calls, free_calls, free_storage,
+								mmap_calls, mmap_storage,
+								munmap_calls, munmap_storage,
+								sbrk_calls, sbrk_storage
 		);
 } // printStats
@@ -328,26 +328,26 @@
 						"<sizes>\n"
 						"</sizes>\n"
-						"<total type=\"malloc\" count=\"%u\" size=\"%llu\"/>\n"
-						"<total type=\"aalloc\" count=\"%u\" size=\"%llu\"/>\n"
-						"<total type=\"calloc\" count=\"%u\" size=\"%llu\"/>\n"
-						"<total type=\"memalign\" count=\"%u\" size=\"%llu\"/>\n"
-						"<total type=\"amemalign\" count=\"%u\" size=\"%llu\"/>\n"
-						"<total type=\"cmemalign\" count=\"%u\" size=\"%llu\"/>\n"
-						"<total type=\"resize\" count=\"%u\" size=\"%llu\"/>\n"
-						"<total type=\"realloc\" count=\"%u\" size=\"%llu\"/>\n"
-						"<total type=\"free\" count=\"%u\" size=\"%llu\"/>\n"
-						"<total type=\"mmap\" count=\"%u\" size=\"%llu\"/>\n"
-						"<total type=\"munmap\" count=\"%u\" size=\"%llu\"/>\n"
-						"<total type=\"sbrk\" count=\"%u\" size=\"%llu\"/>\n"
+						"<total type=\"malloc\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
+						"<total type=\"aalloc\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
+						"<total type=\"calloc\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
+						"<total type=\"memalign\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
+						"<total type=\"amemalign\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
+						"<total type=\"cmemalign\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
+						"<total type=\"resize\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
+						"<total type=\"realloc\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
+						"<total type=\"free\" 0 count=\"%'u;\" >0 count=\"%'u;\" size=\"%'llu\"/> bytes\n"
+						"<total type=\"mmap\" count=\"%'u;\" size=\"%'llu\"/> bytes\n"
+						"<total type=\"munmap\" count=\"%'u;\" size=\"%'llu\"/> bytes\n"
+						"<total type=\"sbrk\" count=\"%'u;\" size=\"%'llu\"/> bytes\n"
 						"</malloc>",
-						malloc_calls, malloc_storage,
-						aalloc_calls, aalloc_storage,
-						calloc_calls, calloc_storage,
-						memalign_calls, memalign_storage,
-						amemalign_calls, amemalign_storage,
-						cmemalign_calls, cmemalign_storage,
-						resize_calls, resize_storage,
-						realloc_calls, realloc_storage,
-						free_calls, free_storage,
+						malloc_zero_calls, malloc_calls, malloc_storage,
+						aalloc_zero_calls, aalloc_calls, aalloc_storage,
+						calloc_zero_calls, calloc_calls, calloc_storage,
+						memalign_zero_calls, memalign_calls, memalign_storage,
+						amemalign_zero_calls, amemalign_calls, amemalign_storage,
+						cmemalign_zero_calls, cmemalign_calls, cmemalign_storage,
+						resize_zero_calls, resize_calls, resize_storage,
+						realloc_zero_calls, realloc_calls, realloc_storage,
+						free_zero_calls, free_calls, free_storage,
 						mmap_calls, mmap_storage,
 						munmap_calls, munmap_storage,
@@ -466,21 +466,21 @@
 } // headers
 
-#ifdef __CFA_DEBUG__
-#if __SIZEOF_POINTER__ == 4
-#define MASK 0xdeadbeef
-#else
-#define MASK 0xdeadbeefdeadbeef
-#endif
-#define STRIDE size_t
-
-static void * Memset( void * addr, STRIDE size ) {		// debug only
-	if ( size % sizeof(STRIDE) != 0 ) abort( "Memset() : internal error, size %zd not multiple of %zd.", size, sizeof(STRIDE) );
-	if ( (STRIDE)addr % sizeof(STRIDE) != 0 ) abort( "Memset() : internal error, addr %p not multiple of %zd.", addr, sizeof(STRIDE) );
-
-	STRIDE * end = (STRIDE *)addr + size / sizeof(STRIDE);
-	for ( STRIDE * p = (STRIDE *)addr; p < end; p += 1 ) *p = MASK;
-	return addr;
-} // Memset
-#endif // __CFA_DEBUG__
+// #ifdef __CFA_DEBUG__
+// #if __SIZEOF_POINTER__ == 4
+// #define MASK 0xdeadbeef
+// #else
+// #define MASK 0xdeadbeefdeadbeef
+// #endif
+// #define STRIDE size_t
+
+// static void * Memset( void * addr, STRIDE size ) {		// debug only
+// 	if ( size % sizeof(STRIDE) != 0 ) abort( "Memset() : internal error, size %zd not multiple of %zd.", size, sizeof(STRIDE) );
+// 	if ( (STRIDE)addr % sizeof(STRIDE) != 0 ) abort( "Memset() : internal error, addr %p not multiple of %zd.", addr, sizeof(STRIDE) );
+
+// 	STRIDE * end = (STRIDE *)addr + size / sizeof(STRIDE);
+// 	for ( STRIDE * p = (STRIDE *)addr; p < end; p += 1 ) *p = MASK;
+// 	return addr;
+// } // Memset
+// #endif // __CFA_DEBUG__
 
 
@@ -498,6 +498,7 @@
 			unlock( extlock );
 			__cfaabi_bits_print_nolock( STDERR_FILENO, NO_MEMORY_MSG, size );
-			_exit( EXIT_FAILURE );
-		} // if
+			_exit( EXIT_FAILURE );						// give up
+		} // if
+		// Make storage executable for thunks.
 		if ( mprotect( (char *)heapEnd + heapRemaining, increase, __map_prot ) ) {
 			unlock( extlock );
@@ -770,30 +771,4 @@
 
 
-static inline void * callocNoStats( size_t dim, size_t elemSize ) {
-	size_t size = dim * elemSize;
-  if ( unlikely( size ) == 0 ) return 0p;				// 0 BYTE ALLOCATION RETURNS NULL POINTER
-	char * addr = (char *)mallocNoStats( size );
-
-	HeapManager.Storage.Header * header;
-	HeapManager.FreeHeader * freeElem;
-	size_t bsize, alignment;
-	#ifndef __CFA_DEBUG__
-	bool mapped =
-	#endif // __CFA_DEBUG__
-		headers( "calloc", addr, header, freeElem, bsize, alignment );
-	#ifndef __CFA_DEBUG__
-
-	// Mapped storage is zero filled, but in debug mode mapped memory is scrubbed in doMalloc, so it has to be reset to zero.
-	if ( ! mapped )
-	#endif // __CFA_DEBUG__
-		// <-------0000000000000000000000000000UUUUUUUUUUUUUUUUUUUUUUUUU> bsize (bucket size) U => undefined
-		// `-header`-addr                      `-size
-		memset( addr, '\0', size );						// set to zeros
-
-	header->kind.real.blockSize |= 2;					// mark as zero filled
-	return addr;
-} // callocNoStats
-
-
 static inline void * memalignNoStats( size_t alignment, size_t size ) {
   if ( unlikely( size ) == 0 ) return 0p;				// 0 BYTE ALLOCATION RETURNS NULL POINTER
@@ -834,30 +809,4 @@
 
 
-static inline void * cmemalignNoStats( size_t alignment, size_t dim, size_t elemSize ) {
-	size_t size = dim * elemSize;
-  if ( unlikely( size ) == 0 ) return 0p;				// 0 BYTE ALLOCATION RETURNS NULL POINTER
-	char * addr = (char *)memalignNoStats( alignment, size );
-
-	HeapManager.Storage.Header * header;
-	HeapManager.FreeHeader * freeElem;
-	size_t bsize;
-	#ifndef __CFA_DEBUG__
-	bool mapped =
-	#endif // __CFA_DEBUG__
-		headers( "cmemalign", addr, header, freeElem, bsize, alignment );
-
-	// Mapped storage is zero filled, but in debug mode mapped memory is scrubbed in doMalloc, so it has to be reset to zero.
-	#ifndef __CFA_DEBUG__
-	if ( ! mapped )
-	#endif // __CFA_DEBUG__
-		// <-------0000000000000000000000000000UUUUUUUUUUUUUUUUUUUUUUUUU> bsize (bucket size) U => undefined
-		// `-header`-addr                      `-size
-		memset( addr, '\0', size );						// set to zeros
-
-	header->kind.real.blockSize |= 2;					// mark as zero filled
-	return addr;
-} // cmemalignNoStats
-
-
 extern "C" {
 	// Allocates size bytes and returns a pointer to the allocated memory.  The contents are undefined. If size is 0,
@@ -865,6 +814,10 @@
 	void * malloc( size_t size ) {
 		#ifdef __STATISTICS__
-		__atomic_add_fetch( &malloc_calls, 1, __ATOMIC_SEQ_CST );
-		__atomic_add_fetch( &malloc_storage, size, __ATOMIC_SEQ_CST );
+		if ( likely( size > 0 ) ) {
+			__atomic_add_fetch( &malloc_calls, 1, __ATOMIC_SEQ_CST );
+			__atomic_add_fetch( &malloc_storage, size, __ATOMIC_SEQ_CST );
+		} else {
+			__atomic_add_fetch( &malloc_zero_calls, 1, __ATOMIC_SEQ_CST );
+		} // if
 		#endif // __STATISTICS__
 
@@ -877,6 +830,10 @@
 		size_t size = dim * elemSize;
 		#ifdef __STATISTICS__
-		__atomic_add_fetch( &aalloc_calls, 1, __ATOMIC_SEQ_CST );
-		__atomic_add_fetch( &aalloc_storage, size, __ATOMIC_SEQ_CST );
+		if ( likely( size > 0 ) ) {
+			__atomic_add_fetch( &aalloc_calls, 1, __ATOMIC_SEQ_CST );
+			__atomic_add_fetch( &aalloc_storage, size, __ATOMIC_SEQ_CST );
+		} else {
+			__atomic_add_fetch( &aalloc_zero_calls, 1, __ATOMIC_SEQ_CST );
+		} // if
 		#endif // __STATISTICS__
 
@@ -887,4 +844,11 @@
 	// Same as aalloc() with memory set to zero.
 	void * calloc( size_t dim, size_t elemSize ) {
+		size_t size = dim * elemSize;
+	  if ( unlikely( size ) == 0 ) {			// 0 BYTE ALLOCATION RETURNS NULL POINTER
+			#ifdef __STATISTICS__
+			__atomic_add_fetch( &calloc_zero_calls, 1, __ATOMIC_SEQ_CST );
+			#endif // __STATISTICS__
+			return 0p;
+		} // if
 		#ifdef __STATISTICS__
 		__atomic_add_fetch( &calloc_calls, 1, __ATOMIC_SEQ_CST );
@@ -892,5 +856,25 @@
 		#endif // __STATISTICS__
 
-		return callocNoStats( dim, elemSize );
+		char * addr = (char *)mallocNoStats( size );
+
+		HeapManager.Storage.Header * header;
+		HeapManager.FreeHeader * freeElem;
+		size_t bsize, alignment;
+
+		#ifndef __CFA_DEBUG__
+		bool mapped =
+			#endif // __CFA_DEBUG__
+			headers( "calloc", addr, header, freeElem, bsize, alignment );
+
+		#ifndef __CFA_DEBUG__
+		// Mapped storage is zero filled, but in debug mode mapped memory is scrubbed in doMalloc, so it has to be reset to zero.
+		if ( ! mapped )
+		#endif // __CFA_DEBUG__
+			// <-------0000000000000000000000000000UUUUUUUUUUUUUUUUUUUUUUUUU> bsize (bucket size) U => undefined
+			// `-header`-addr                      `-size
+			memset( addr, '\0', size );					// set to zeros
+
+		header->kind.real.blockSize |= 2;				// mark as zero filled
+		return addr;
 	} // calloc
 
@@ -901,10 +885,16 @@
 	// call to malloc(), alloc(), calloc() or realloc(). If the area pointed to was moved, a free(oaddr) is done.
 	void * resize( void * oaddr, size_t size ) {
+		// If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
+	  if ( unlikely( size == 0 ) ) {					// special cases
+			#ifdef __STATISTICS__
+			__atomic_add_fetch( &resize_zero_calls, 1, __ATOMIC_SEQ_CST );
+			#endif // __STATISTICS__
+			free( oaddr );
+			return 0p;
+		} // if
 		#ifdef __STATISTICS__
 		__atomic_add_fetch( &resize_calls, 1, __ATOMIC_SEQ_CST );
 		#endif // __STATISTICS__
 
-		// If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
-	  if ( unlikely( size == 0 ) ) { free( oaddr ); return 0p; } // special cases
 	  if ( unlikely( oaddr == 0p ) ) {
 			#ifdef __STATISTICS__
@@ -918,6 +908,6 @@
 		size_t bsize, oalign;
 		headers( "resize", oaddr, header, freeElem, bsize, oalign );
+
 		size_t odsize = dataStorage( bsize, oaddr, header ); // data storage available in bucket
-
 		// same size, DO NOT preserve STICKY PROPERTIES.
 		if ( oalign == libAlign() && size <= odsize && odsize <= size * 2 ) { // allow 50% wasted storage for smaller size
@@ -940,10 +930,16 @@
 	// the old and new sizes.
 	void * realloc( void * oaddr, size_t size ) {
+		// If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
+	  if ( unlikely( size == 0 ) ) {					// special cases
+			#ifdef __STATISTICS__
+			__atomic_add_fetch( &realloc_zero_calls, 1, __ATOMIC_SEQ_CST );
+			#endif // __STATISTICS__
+			free( oaddr );
+			return 0p;
+		} // if
 		#ifdef __STATISTICS__
 		__atomic_add_fetch( &realloc_calls, 1, __ATOMIC_SEQ_CST );
 		#endif // __STATISTICS__
 
-		// If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
-	  if ( unlikely( size == 0 ) ) { free( oaddr ); return 0p; } // special cases
 	  if ( unlikely( oaddr == 0p ) ) {
 			#ifdef __STATISTICS__
@@ -999,6 +995,10 @@
 	void * memalign( size_t alignment, size_t size ) {
 		#ifdef __STATISTICS__
-		__atomic_add_fetch( &memalign_calls, 1, __ATOMIC_SEQ_CST );
-		__atomic_add_fetch( &memalign_storage, size, __ATOMIC_SEQ_CST );
+		if ( likely( size > 0 ) ) {
+			__atomic_add_fetch( &memalign_calls, 1, __ATOMIC_SEQ_CST );
+			__atomic_add_fetch( &memalign_storage, size, __ATOMIC_SEQ_CST );
+		} else {
+			__atomic_add_fetch( &memalign_zero_calls, 1, __ATOMIC_SEQ_CST );
+		} // if
 		#endif // __STATISTICS__
 
@@ -1011,6 +1011,10 @@
 		size_t size = dim * elemSize;
 		#ifdef __STATISTICS__
-		__atomic_add_fetch( &cmemalign_calls, 1, __ATOMIC_SEQ_CST );
-		__atomic_add_fetch( &cmemalign_storage, size, __ATOMIC_SEQ_CST );
+		if ( likely( size > 0 ) ) {
+			__atomic_add_fetch( &cmemalign_calls, 1, __ATOMIC_SEQ_CST );
+			__atomic_add_fetch( &cmemalign_storage, size, __ATOMIC_SEQ_CST );
+		} else {
+			__atomic_add_fetch( &cmemalign_zero_calls, 1, __ATOMIC_SEQ_CST );
+		} // if
 		#endif // __STATISTICS__
 
@@ -1021,4 +1025,11 @@
 	// Same as calloc() with memory alignment.
 	void * cmemalign( size_t alignment, size_t dim, size_t elemSize ) {
+		size_t size = dim * elemSize;
+	  if ( unlikely( size ) == 0 ) {					// 0 BYTE ALLOCATION RETURNS NULL POINTER
+			#ifdef __STATISTICS__
+			__atomic_add_fetch( &cmemalign_zero_calls, 1, __ATOMIC_SEQ_CST );
+			#endif // __STATISTICS__
+			return 0p;
+		} // if
 		#ifdef __STATISTICS__
 		__atomic_add_fetch( &cmemalign_calls, 1, __ATOMIC_SEQ_CST );
@@ -1026,5 +1037,25 @@
 		#endif // __STATISTICS__
 
-		return cmemalignNoStats( alignment, dim, elemSize );
+		char * addr = (char *)memalignNoStats( alignment, size );
+
+		HeapManager.Storage.Header * header;
+		HeapManager.FreeHeader * freeElem;
+		size_t bsize;
+
+		#ifndef __CFA_DEBUG__
+		bool mapped =
+			#endif // __CFA_DEBUG__
+			headers( "cmemalign", addr, header, freeElem, bsize, alignment );
+
+		// Mapped storage is zero filled, but in debug mode mapped memory is scrubbed in doMalloc, so it has to be reset to zero.
+		#ifndef __CFA_DEBUG__
+		if ( ! mapped )
+		#endif // __CFA_DEBUG__
+			// <-------0000000000000000000000000000UUUUUUUUUUUUUUUUUUUUUUUUU> bsize (bucket size) U => undefined
+			// `-header`-addr                      `-size
+			memset( addr, '\0', size );					// set to zeros
+
+		header->kind.real.blockSize |= 2;				// mark as zero filled
+		return addr;
 	} // cmemalign
 
@@ -1065,9 +1096,9 @@
 	// 0p, no operation is performed.
 	void free( void * addr ) {
-		#ifdef __STATISTICS__
-		__atomic_add_fetch( &free_calls, 1, __ATOMIC_SEQ_CST );
-		#endif // __STATISTICS__
-
 	  if ( unlikely( addr == 0p ) ) {					// special case
+			#ifdef __STATISTICS__
+			__atomic_add_fetch( &free_zero_calls, 1, __ATOMIC_SEQ_CST );
+			#endif // __STATISTICS__
+
 			// #ifdef __CFA_DEBUG__
 			// if ( traceHeap() ) {
@@ -1182,6 +1213,6 @@
 	int malloc_stats_fd( int fd __attribute__(( unused )) ) {
 		#ifdef __STATISTICS__
-		int temp = stat_fd;
-		stat_fd = fd;
+		int temp = stats_fd;
+		stats_fd = fd;
 		return temp;
 		#else
@@ -1214,5 +1245,5 @@
 	// The string is printed on the file stream stream.  The exported string includes information about all arenas (see
 	// malloc).
-	int malloc_info( int options, FILE * stream ) {
+	int malloc_info( int options, FILE * stream __attribute__(( unused )) ) {
 	  if ( options != 0 ) { errno = EINVAL; return -1; }
 		#ifdef __STATISTICS__
@@ -1243,18 +1274,21 @@
 // Must have CFA linkage to overload with C linkage realloc.
 void * resize( void * oaddr, size_t nalign, size_t size ) {
-	#ifdef __STATISTICS__
-	__atomic_add_fetch( &resize_calls, 1, __ATOMIC_SEQ_CST );
-	#endif // __STATISTICS__
+	// If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
+  if ( unlikely( size == 0 ) ) {						// special cases
+		#ifdef __STATISTICS__
+		__atomic_add_fetch( &resize_zero_calls, 1, __ATOMIC_SEQ_CST );
+		#endif // __STATISTICS__
+		free( oaddr );
+		return 0p;
+	} // if
 
 	if ( unlikely( nalign < libAlign() ) ) nalign = libAlign(); // reset alignment to minimum
 	#ifdef __CFA_DEBUG__
-	else
-		checkAlign( nalign );							// check alignment
+	else checkAlign( nalign );							// check alignment
 	#endif // __CFA_DEBUG__
 
-	// If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
-  if ( unlikely( size == 0 ) ) { free( oaddr ); return 0p; } // special cases
   if ( unlikely( oaddr == 0p ) ) {
 		#ifdef __STATISTICS__
+		__atomic_add_fetch( &resize_calls, 1, __ATOMIC_SEQ_CST );
 		__atomic_add_fetch( &resize_storage, size, __ATOMIC_SEQ_CST );
 		#endif // __STATISTICS__
@@ -1302,12 +1336,18 @@
 
 void * realloc( void * oaddr, size_t nalign, size_t size ) {
+	// If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
+  if ( unlikely( size == 0 ) ) {						// special cases
+		#ifdef __STATISTICS__
+		__atomic_add_fetch( &realloc_zero_calls, 1, __ATOMIC_SEQ_CST );
+		#endif // __STATISTICS__
+		free( oaddr );
+		return 0p;
+	} // if
+
 	if ( unlikely( nalign < libAlign() ) ) nalign = libAlign(); // reset alignment to minimum
 	#ifdef __CFA_DEBUG__
-	else
-		checkAlign( nalign );							// check alignment
+	else checkAlign( nalign );							// check alignment
 	#endif // __CFA_DEBUG__
 
-	// If size is equal to 0, either NULL or a pointer suitable to be passed to free() is returned.
-  if ( unlikely( size == 0 ) ) { free( oaddr ); return 0p; } // special cases
   if ( unlikely( oaddr == 0p ) ) {
 		#ifdef __STATISTICS__
Index: libcfa/src/startup.cfa
===================================================================
--- libcfa/src/startup.cfa	(revision 97748eeb1df13abd9185514085cc78246db06552)
+++ libcfa/src/startup.cfa	(revision a00bc5b670e302caf7b7a128c32475c702fd7890)
@@ -10,10 +10,11 @@
 // Created On       : Tue Jul 24 16:21:57 2018
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Tue Feb  4 13:03:18 2020
-// Update Count     : 30
+// Last Modified On : Sat Jan  9 23:18:23 2021
+// Update Count     : 34
 //
 
-#include <time.h>	         // tzset
-#include <locale.h>        // setlocale
+#include <time.h>										// tzset
+#include <locale.h>										// setlocale
+#include <stdlib.h>										// getenv
 #include "startup.hfa"
 
@@ -22,5 +23,5 @@
     void __cfaabi_appready_startup( void ) {
 		tzset();										// initialize time global variables
-		setlocale(LC_NUMERIC, "");
+		setlocale( LC_NUMERIC, getenv("LANG") );
 		#ifdef __CFA_DEBUG__
 		extern void heapAppStart();
Index: src/AST/Decl.cpp
===================================================================
--- src/AST/Decl.cpp	(revision 97748eeb1df13abd9185514085cc78246db06552)
+++ src/AST/Decl.cpp	(revision a00bc5b670e302caf7b7a128c32475c702fd7890)
@@ -10,6 +10,6 @@
 // Created On       : Thu May 9 10:00:00 2019
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Fri Dec 13 16:23:15 2019
-// Update Count     : 20
+// Last Modified On : Tue Jan 12 16:54:55 2021
+// Update Count     : 23
 //
 
@@ -78,6 +78,6 @@
 
 const char * TypeDecl::typeString() const {
-	static const char * kindNames[] = { "sized data type", "sized object type", "sized function type", "sized tuple type" };
-	static_assert( sizeof(kindNames)/sizeof(kindNames[0]) == TypeDecl::NUMBER_OF_KINDS, "typeString: kindNames is out of sync." );
+	static const char * kindNames[] = { "sized data type", "sized data type", "sized object type", "sized function type", "sized tuple type", "sized array length type" };
+	static_assert( sizeof(kindNames) / sizeof(kindNames[0]) == TypeDecl::NUMBER_OF_KINDS, "typeString: kindNames is out of sync." );
 	assertf( kind < TypeDecl::NUMBER_OF_KINDS, "TypeDecl kind is out of bounds." );
 	return sized ? kindNames[ kind ] : &kindNames[ kind ][ sizeof("sized") ]; // sizeof includes '\0'
@@ -85,6 +85,6 @@
 
 const char * TypeDecl::genTypeString() const {
-	static const char * kindNames[] = { "dtype", "otype", "ftype", "ttype" };
-	static_assert( sizeof(kindNames)/sizeof(kindNames[0]) == TypeDecl::NUMBER_OF_KINDS, "genTypeString: kindNames is out of sync." );
+	static const char * kindNames[] = { "T &", "T *", "T", "(*)", "T ...", "[T]" };
+	static_assert( sizeof(kindNames) / sizeof(kindNames[0]) == TypeDecl::NUMBER_OF_KINDS, "genTypeString: kindNames is out of sync." );
 	assertf( kind < TypeDecl::NUMBER_OF_KINDS, "TypeDecl kind is out of bounds." );
 	return kindNames[ kind ];
Index: src/AST/Decl.hpp
===================================================================
--- src/AST/Decl.hpp	(revision 97748eeb1df13abd9185514085cc78246db06552)
+++ src/AST/Decl.hpp	(revision a00bc5b670e302caf7b7a128c32475c702fd7890)
@@ -10,6 +10,6 @@
 // Created On       : Thu May 9 10:00:00 2019
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Fri Dec 13 17:38:33 2019
-// Update Count     : 29
+// Last Modified On : Mon Jan 11 20:48:38 2021
+// Update Count     : 30
 //
 
@@ -175,5 +175,5 @@
 class TypeDecl final : public NamedTypeDecl {
   public:
-	enum Kind { Dtype, Otype, Ftype, Ttype, NUMBER_OF_KINDS };
+	enum Kind { Dtype, DStype, Otype, Ftype, Ttype, ALtype, NUMBER_OF_KINDS };
 
 	Kind kind;
Index: src/Parser/DeclarationNode.cc
===================================================================
--- src/Parser/DeclarationNode.cc	(revision 97748eeb1df13abd9185514085cc78246db06552)
+++ src/Parser/DeclarationNode.cc	(revision a00bc5b670e302caf7b7a128c32475c702fd7890)
@@ -10,6 +10,6 @@
 // Created On       : Sat May 16 12:34:05 2015
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Thu Oct  8 08:03:38 2020
-// Update Count     : 1135
+// Last Modified On : Mon Jan 11 20:58:07 2021
+// Update Count     : 1137
 //
 
@@ -1075,6 +1075,6 @@
 	if ( variable.tyClass != TypeDecl::NUMBER_OF_KINDS ) {
 		// otype is internally converted to dtype + otype parameters
-		static const TypeDecl::Kind kindMap[] = { TypeDecl::Dtype, TypeDecl::Dtype, TypeDecl::Ftype, TypeDecl::Ttype };
-		static_assert( sizeof(kindMap)/sizeof(kindMap[0]) == TypeDecl::NUMBER_OF_KINDS, "DeclarationNode::build: kindMap is out of sync." );
+		static const TypeDecl::Kind kindMap[] = { TypeDecl::Dtype, TypeDecl::DStype, TypeDecl::Dtype, TypeDecl::Ftype, TypeDecl::Ttype, TypeDecl::ALtype };
+		static_assert( sizeof(kindMap) / sizeof(kindMap[0]) == TypeDecl::NUMBER_OF_KINDS, "DeclarationNode::build: kindMap is out of sync." );
 		assertf( variable.tyClass < sizeof(kindMap)/sizeof(kindMap[0]), "Variable's tyClass is out of bounds." );
 		TypeDecl * ret = new TypeDecl( *name, Type::StorageClasses(), nullptr, kindMap[ variable.tyClass ], variable.tyClass == TypeDecl::Otype, variable.initializer ? variable.initializer->buildType() : nullptr );
Index: src/Parser/ParseNode.h
===================================================================
--- src/Parser/ParseNode.h	(revision 97748eeb1df13abd9185514085cc78246db06552)
+++ src/Parser/ParseNode.h	(revision a00bc5b670e302caf7b7a128c32475c702fd7890)
@@ -10,6 +10,6 @@
 // Created On       : Sat May 16 13:28:16 2015
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Sat Oct 24 03:53:54 2020
-// Update Count     : 895
+// Last Modified On : Sun Jan  3 18:23:01 2021
+// Update Count     : 896
 //
 
@@ -39,6 +39,6 @@
 struct DeclarationNode;
 class DeclarationWithType;
+class Initializer;
 class ExpressionNode;
-class Initializer;
 struct StatementNode;
 
Index: src/Parser/parser.yy
===================================================================
--- src/Parser/parser.yy	(revision 97748eeb1df13abd9185514085cc78246db06552)
+++ src/Parser/parser.yy	(revision a00bc5b670e302caf7b7a128c32475c702fd7890)
@@ -10,6 +10,6 @@
 // Created On       : Sat Sep  1 20:22:55 2001
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Sat Oct 24 08:21:14 2020
-// Update Count     : 4624
+// Last Modified On : Mon Jan 11 21:32:10 2021
+// Update Count     : 4633
 //
 
@@ -329,5 +329,5 @@
 %type<en> conditional_expression		constant_expression			assignment_expression		assignment_expression_opt
 %type<en> comma_expression				comma_expression_opt
-%type<en> argument_expression_list_opt	argument_expression			default_initialize_opt
+%type<en> argument_expression_list_opt	argument_expression			default_initializer_opt
 %type<ifctl> if_control_expression
 %type<fctl> for_control_expression		for_control_expression_list
@@ -424,5 +424,5 @@
 %type<decl> sue_declaration_specifier sue_declaration_specifier_nobody sue_type_specifier sue_type_specifier_nobody
 
-%type<tclass> type_class
+%type<tclass> type_class new_type_class
 %type<decl> type_declarator type_declarator_name type_declaring_list
 
@@ -1545,4 +1545,5 @@
 	| cfa_function_declaration
 	| type_declaring_list
+		{ SemanticError( yylloc, "otype declaration is currently unimplemented." ); $$ = nullptr; }
 	| trait_specifier
 	;
@@ -2223,5 +2224,5 @@
 	;
 
-cfa_parameter_ellipsis_list_opt:							// CFA, abstract + real
+cfa_parameter_ellipsis_list_opt:						// CFA, abstract + real
 	// empty
 		{ $$ = DeclarationNode::newBasicType( DeclarationNode::Void ); }
@@ -2280,10 +2281,10 @@
 cfa_parameter_declaration:								// CFA, new & old style parameter declaration
 	parameter_declaration
-	| cfa_identifier_parameter_declarator_no_tuple identifier_or_type_name default_initialize_opt
+	| cfa_identifier_parameter_declarator_no_tuple identifier_or_type_name default_initializer_opt
 		{ $$ = $1->addName( $2 ); }
-	| cfa_abstract_tuple identifier_or_type_name default_initialize_opt
+	| cfa_abstract_tuple identifier_or_type_name default_initializer_opt
 		// To obtain LR(1), these rules must be duplicated here (see cfa_abstract_declarator).
 		{ $$ = $1->addName( $2 ); }
-	| type_qualifier_list cfa_abstract_tuple identifier_or_type_name default_initialize_opt
+	| type_qualifier_list cfa_abstract_tuple identifier_or_type_name default_initializer_opt
 		{ $$ = $2->addName( $3 )->addQualifiers( $1 ); }
 	| cfa_function_specifier
@@ -2302,14 +2303,14 @@
 parameter_declaration:
 		// No SUE declaration in parameter list.
-	declaration_specifier_nobody identifier_parameter_declarator default_initialize_opt
+	declaration_specifier_nobody identifier_parameter_declarator default_initializer_opt
 		{ $$ = $2->addType( $1 )->addInitializer( $3 ? new InitializerNode( $3 ) : nullptr ); }
-	| declaration_specifier_nobody type_parameter_redeclarator default_initialize_opt
+	| declaration_specifier_nobody type_parameter_redeclarator default_initializer_opt
 		{ $$ = $2->addType( $1 )->addInitializer( $3 ? new InitializerNode( $3 ) : nullptr ); }
 	;
 
 abstract_parameter_declaration:
-	declaration_specifier_nobody default_initialize_opt
+	declaration_specifier_nobody default_initializer_opt
 		{ $$ = $1->addInitializer( $2 ? new InitializerNode( $2 ) : nullptr ); }
-	| declaration_specifier_nobody abstract_parameter_declarator default_initialize_opt
+	| declaration_specifier_nobody abstract_parameter_declarator default_initializer_opt
 		{ $$ = $2->addType( $1 )->addInitializer( $3 ? new InitializerNode( $3 ) : nullptr ); }
 	;
@@ -2441,9 +2442,29 @@
 	type_class identifier_or_type_name
 		{ typedefTable.addToScope( *$2, TYPEDEFname, "9" ); }
-	type_initializer_opt assertion_list_opt
+	  type_initializer_opt assertion_list_opt
 		{ $$ = DeclarationNode::newTypeParam( $1, $2 )->addTypeInitializer( $4 )->addAssertions( $5 ); }
-	| type_specifier identifier_parameter_declarator
+	| identifier_or_type_name new_type_class
+		{ typedefTable.addToScope( *$1, TYPEDEFname, "9" ); }
+	  type_initializer_opt assertion_list_opt
+		{ $$ = DeclarationNode::newTypeParam( $2, $1 )->addTypeInitializer( $4 )->addAssertions( $5 ); }
+	| '[' identifier_or_type_name ']'
+		{
+			typedefTable.addToScope( *$2, TYPEDEFname, "9" );
+			$$ = DeclarationNode::newTypeParam( TypeDecl::ALtype, $2 );
+		}
+	// | type_specifier identifier_parameter_declarator
 	| assertion_list
 		{ $$ = DeclarationNode::newTypeParam( TypeDecl::Dtype, new string( DeclarationNode::anonymous.newName() ) )->addAssertions( $1 ); }
+	;
+
+new_type_class:											// CFA
+	// empty
+		{ $$ = TypeDecl::Otype; }
+	| '&'
+		{ $$ = TypeDecl::Dtype; }
+	| '*'
+		{ $$ = TypeDecl::DStype; }						// dtype + sized
+	| ELLIPSIS
+		{ $$ = TypeDecl::Ttype; }
 	;
 
@@ -3476,5 +3497,5 @@
 	;
 
-default_initialize_opt:
+default_initializer_opt:
 	// empty
 		{ $$ = nullptr; }
Index: src/SymTab/Demangle.cc
===================================================================
--- src/SymTab/Demangle.cc	(revision 97748eeb1df13abd9185514085cc78246db06552)
+++ src/SymTab/Demangle.cc	(revision a00bc5b670e302caf7b7a128c32475c702fd7890)
@@ -10,6 +10,6 @@
 // Created On       : Thu Jul 19 12:52:41 2018
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Tue Feb 11 15:09:18 2020
-// Update Count     : 10
+// Last Modified On : Mon Jan 11 21:28:27 2021
+// Update Count     : 11
 //
 
@@ -367,5 +367,5 @@
 				// type variable types
 				for (size_t k = 0; k < TypeDecl::NUMBER_OF_KINDS; ++k) {
-					static const std::string typeVariableNames[] = { "DT", "OT", "FT", "TT", };
+					static const std::string typeVariableNames[] = { "DT", "DST", "OT", "FT", "TT", "ALT", };
 					static_assert(
 						sizeof(typeVariableNames)/sizeof(typeVariableNames[0]) == TypeDecl::NUMBER_OF_KINDS,
Index: src/SymTab/Mangler.cc
===================================================================
--- src/SymTab/Mangler.cc	(revision 97748eeb1df13abd9185514085cc78246db06552)
+++ src/SymTab/Mangler.cc	(revision a00bc5b670e302caf7b7a128c32475c702fd7890)
@@ -10,6 +10,6 @@
 // Created On       : Sun May 17 21:40:29 2015
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Wed Nov 18 12:01:38 2020
-// Update Count     : 64
+// Last Modified On : Mon Jan 11 21:56:06 2021
+// Update Count     : 74
 //
 #include "Mangler.h"
@@ -313,5 +313,5 @@
 				// and the case has not yet come up in practice. Alternatively, if not then this code can be removed
 				// aside from the assert false.
-				assertf(false, "Mangler_old should not visit typedecl: %s", toCString(decl));
+				assertf( false, "Mangler_old should not visit typedecl: %s", toCString(decl));
 				assertf( decl->kind < TypeDecl::NUMBER_OF_KINDS, "Unhandled type variable kind: %d", decl->kind );
 				mangleName += Encoding::typeVariables[ decl->kind ] + std::to_string( decl->name.length() ) + decl->name;
@@ -343,5 +343,5 @@
 							break;
 						  default:
-							assert( false );
+							assertf( false, "unimplemented kind for type variable %s", SymTab::Mangler::Encoding::typeVariables[i->kind].c_str() );
 						} // switch
 						varNums[ i->name ] = std::make_pair( nextVarNum, (int)i->kind );
@@ -673,15 +673,15 @@
 					for ( auto & decl : ptype->forall ) {
 						switch ( decl->kind ) {
-						case ast::TypeDecl::Kind::Dtype:
+						  case ast::TypeDecl::Kind::Dtype:
 							dcount++;
 							break;
-						case ast::TypeDecl::Kind::Ftype:
+						  case ast::TypeDecl::Kind::Ftype:
 							fcount++;
 							break;
-						case ast::TypeDecl::Kind::Ttype:
+						  case ast::TypeDecl::Kind::Ttype:
 							vcount++;
 							break;
-						default:
-							assert( false );
+						  default:
+							assertf( false, "unimplemented kind for type variable %s", SymTab::Mangler::Encoding::typeVariables[decl->kind].c_str() );
 						} // switch
 						varNums[ decl->name ] = std::make_pair( nextVarNum, (int)decl->kind );
Index: src/SymTab/ManglerCommon.cc
===================================================================
--- src/SymTab/ManglerCommon.cc	(revision 97748eeb1df13abd9185514085cc78246db06552)
+++ src/SymTab/ManglerCommon.cc	(revision a00bc5b670e302caf7b7a128c32475c702fd7890)
@@ -10,6 +10,6 @@
 // Created On       : Sun May 17 21:44:03 2015
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Fri Dec 13 14:54:38 2019
-// Update Count     : 28
+// Last Modified On : Mon Jan 11 21:23:10 2021
+// Update Count     : 29
 //
 
@@ -104,10 +104,12 @@
 			const std::string typeVariables[] = {
 				"BD", // dtype
+				"BDS", // dtype + sized
 				"BO", // otype
 				"BF", // ftype
 				"BT", // ttype
+				"BAL", // array length type
 			};
 			static_assert(
-				sizeof(typeVariables)/sizeof(typeVariables[0]) == TypeDecl::NUMBER_OF_KINDS,
+				sizeof(typeVariables) / sizeof(typeVariables[0]) == TypeDecl::NUMBER_OF_KINDS,
 				"Each type variable kind should have a corresponding mangler prefix"
 			);
Index: src/SynTree/Declaration.h
===================================================================
--- src/SynTree/Declaration.h	(revision 97748eeb1df13abd9185514085cc78246db06552)
+++ src/SynTree/Declaration.h	(revision a00bc5b670e302caf7b7a128c32475c702fd7890)
@@ -10,6 +10,6 @@
 // Created On       : Mon May 18 07:44:20 2015
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Fri Dec 13 23:11:22 2019
-// Update Count     : 157
+// Last Modified On : Mon Jan 11 20:48:39 2021
+// Update Count     : 158
 //
 
@@ -201,5 +201,5 @@
 	typedef NamedTypeDecl Parent;
   public:
-	enum Kind { Dtype, Otype, Ftype, Ttype, NUMBER_OF_KINDS };
+	enum Kind { Dtype, DStype, Otype, Ftype, Ttype, ALtype, NUMBER_OF_KINDS };
 
 	Kind kind;
Index: src/SynTree/TypeDecl.cc
===================================================================
--- src/SynTree/TypeDecl.cc	(revision 97748eeb1df13abd9185514085cc78246db06552)
+++ src/SynTree/TypeDecl.cc	(revision a00bc5b670e302caf7b7a128c32475c702fd7890)
@@ -10,6 +10,6 @@
 // Created On       : Mon May 18 07:44:20 2015
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Thu Oct  8 18:18:55 2020
-// Update Count     : 22
+// Last Modified On : Tue Jan 12 16:07:33 2021
+// Update Count     : 26
 //
 
@@ -33,6 +33,6 @@
 
 const char * TypeDecl::typeString() const {
-	static const char * kindNames[] = { "sized data type", "sized object type", "sized function type", "sized tuple type" };
-	static_assert( sizeof(kindNames)/sizeof(kindNames[0]) == TypeDecl::NUMBER_OF_KINDS, "typeString: kindNames is out of sync." );
+	static const char * kindNames[] = { "sized data type", "sized data type", "sized object type", "sized function type", "sized tuple type", "sized array length type" };
+	static_assert( sizeof(kindNames) / sizeof(kindNames[0]) == TypeDecl::NUMBER_OF_KINDS, "typeString: kindNames is out of sync." );
 	assertf( kind < TypeDecl::NUMBER_OF_KINDS, "TypeDecl kind is out of bounds." );
 	return isComplete() ? kindNames[ kind ] : &kindNames[ kind ][ sizeof("sized") ]; // sizeof includes '\0'
@@ -40,6 +40,6 @@
 
 const char * TypeDecl::genTypeString() const {
-	static const char * kindNames[] = { "dtype", "otype", "ftype", "ttype" };
-	static_assert( sizeof(kindNames)/sizeof(kindNames[0]) == TypeDecl::NUMBER_OF_KINDS, "genTypeString: kindNames is out of sync." );
+	static const char * kindNames[] = { "T &", "T *", "T", "(*)", "T ...", "[T]" };
+	static_assert( sizeof(kindNames) / sizeof(kindNames[0]) == TypeDecl::NUMBER_OF_KINDS, "genTypeString: kindNames is out of sync." );
 	assertf( kind < TypeDecl::NUMBER_OF_KINDS, "TypeDecl kind is out of bounds." );
 	return kindNames[ kind ];
