Index: doc/bibliography/pl.bib
===================================================================
--- doc/bibliography/pl.bib	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ doc/bibliography/pl.bib	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -688,5 +688,5 @@
     title	= {Asynchronous Exception Propagation in Blocked Tasks},
     booktitle	= {4th International Workshop on Exception Handling (WEH.08)},
-    organization= {16th International Symposium on the Foundations of Software Engineering (FSE 16)},
+    optorganization= {16th International Symposium on the Foundations of Software Engineering (FSE 16)},
     address	= {Atlanta, U.S.A},
     month	= nov,
@@ -7246,5 +7246,5 @@
 
 @inproceedings{Edelson92,
-    keywords	= {persistence, pointers},
+    keywords	= {persistence, smart pointers},
     contributer	= {pabuhr@plg},
     author	= {Daniel R. Edelson},
@@ -7256,4 +7256,16 @@
     year	= 1992,
     pages	= {1-19},
+}
+
+@incollection{smartpointers,
+    keywords	= {smart pointers},
+    contributer	= {pabuhr@plg},
+    author	= {Andrei Alexandrescu},
+    title	= {Smart Pointers},
+    booktitle	= {Modern C++ Design: Generic Programming and Design Patterns Applied},
+    publisher	= {Addison-Wesley},
+    year	= 2001,
+    chapter	= 7,
+    optpages	= {?-?},
 }
 
@@ -8245,4 +8257,15 @@
 }
 
+@misc{vistorpattern,
+    keywords	= {visitor pattern},
+    contributer	= {pabuhr@plg},
+    key		= {visitor pattern},
+    title	= {visitor pattern},
+    year	= 2020,
+    note	= {WikipediA},
+    howpublished= {\href{https://en.wikipedia.org/wiki/Visitor\_pattern}
+		  {https://\-en.wikipedia.org/\-wiki/\-Visitor\_pattern}},
+}
+
 % W
 
Index: doc/theses/andrew_beach_MMath/Makefile
===================================================================
--- doc/theses/andrew_beach_MMath/Makefile	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ doc/theses/andrew_beach_MMath/Makefile	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -1,5 +1,5 @@
 ### Makefile for Andrew Beach's Masters Thesis
 
-DOC=thesis.pdf
+DOC=uw-ethesis.pdf
 BUILD=out
 TEXSRC=$(wildcard *.tex)
@@ -7,5 +7,5 @@
 STYSRC=$(wildcard *.sty)
 CLSSRC=$(wildcard *.cls)
-TEXLIB= .:${BUILD}:
+TEXLIB= .:../../LaTeXmacros:${BUILD}:
 BIBLIB= .:../../bibliography
 
@@ -29,4 +29,5 @@
 	${LATEX} ${BASE}
 	${BIBTEX} ${BUILD}/${BASE}
+	${LATEX} ${BASE}
 	${GLOSSARY} ${BUILD}/${BASE}
 	${LATEX} ${BASE}
Index: doc/theses/andrew_beach_MMath/existing.tex
===================================================================
--- doc/theses/andrew_beach_MMath/existing.tex	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ doc/theses/andrew_beach_MMath/existing.tex	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -1,7 +1,10 @@
-\chapter{\CFA{} Existing Features}
+\chapter{\CFA Existing Features}
+
+\CFA (C-for-all)~\cite{Cforall} is an open-source project extending ISO C with modern safety and productivity features, while still ensuring backwards compatibility with C and its programmers.
+\CFA is designed to have an orthogonal feature-set based closely on the C programming paradigm (non-object-oriented) and these features can be added incrementally to an existing C code-base allowing programmers to learn \CFA on an as-needed basis.
 
 \section{Overloading and extern}
 Cforall has overloading, allowing multiple definitions of the same name to
-be defined.
+be defined~\cite{Moss18}.
 
 This also adds name mangling so that the assembly symbols are unique for
@@ -11,19 +14,19 @@
 
 The syntax for disabling mangling is:
-\begin{lstlisting}
+\begin{cfa}
 extern "C" {
     ...
 }
-\end{lstlisting}
+\end{cfa}
 
 To re-enable mangling once it is disabled the syntax is:
-\begin{lstlisting}
+\begin{cfa}
 extern "Cforall" {
     ...
 }
-\end{lstlisting}
+\end{cfa}
 
 Both should occur at the declaration level and effect all the declarations
-in \texttt{...}. Neither care about the state of mangling when they begin
+in @...@. Neither care about the state of mangling when they begin
 and will return to that state after the group is finished. So re-enabling
 is only used to nest areas of mangled and unmangled declarations.
@@ -31,8 +34,8 @@
 \section{References}
 \CFA adds references to C. These are auto-dereferencing pointers and use the
-same syntax as pointers except they use ampersand (\codeCFA{\&}) instead of
-the asterisk (\codeCFA{*}). They can also be constaint or mutable, if they
+same syntax as pointers except they use ampersand (@&@) instead of
+the asterisk (@*@). They can also be constant or mutable; if they
 are mutable they may be assigned to by using the address-of operator
-(\codeCFA\&) which converts them into a pointer.
+(@&@) which converts them into a pointer.
 
 \section{Constructors and Destructors}
@@ -41,22 +44,22 @@
 functions with special names. The special names are used to define them and
 may be used to call the functions expicately. The \CFA special names are
-constructed by taking the tokens in the operators and putting \texttt{?} where
-the arguments would go. So multiplication is \texttt{?*?} while dereference
-is \texttt{*?}. This also make it easy to tell the difference between
-pre-fix operations (such as \texttt{++?}) and post-fix operations
-(\texttt{?++}).
-
-The special name for contructors is \texttt{?\{\}}, which comes from the
+constructed by taking the tokens in the operators and putting @?@ where
+the arguments would go. So multiplication is @?*?@ while dereference
+is @*?@. This also makes it easy to tell the difference between
+pre-fix operations (such as @++?@) and post-fix operations
+(@?++@).
+
+The special name for constructors is @?{}@, which comes from the
 initialization syntax in C. The special name for destructors is
-\texttt{\^{}?\{\}}. % I don't like the \^{} symbol but $^\wedge$ isn't better.
+@^?{}@. % I don't like the \^{} symbol but $^\wedge$ isn't better.
 
 Any time a type T goes out of scope the destructor matching
-\codeCFA{void ^?\{\}(T \&);} is called. In theory this is also true of
-primitive types such as \codeCFA{int}, but in practice those are no-ops and
+@void ^?{}(T &);@ is called. In theory this is also true of
+primitive types such as @int@, but in practice those are no-ops and
 are usually omitted for optimization.
 
 \section{Polymorphism}
 \CFA uses polymorphism to create functions and types that are defined over
-different types. \CFA polymorphic declarations serve the same role as \CPP
+different types. \CFA polymorphic declarations serve the same role as \CC
 templates or Java generics.
 
@@ -65,9 +68,9 @@
 except that you may use the names introduced by the forall clause in them.
 
-Forall clauses are written \codeCFA{forall( ... )} where \codeCFA{...} becomes
+Forall clauses are written @forall( ... )@ where @...@ becomes
 the list of polymorphic variables (local type names) and assertions, which
 repersent required operations on those types.
 
-\begin{lstlisting}
+\begin{cfa}
 forall(dtype T | { void do_once(T &); })
 void do_twice(T & value) {
@@ -75,5 +78,5 @@
     do_once(value);
 }
-\end{lstlisting}
+\end{cfa}
 
 A polymorphic function can be used in the same way normal functions are.
@@ -83,7 +86,7 @@
 the the call site.
 
-As an example, even if no function named \codeCFA{do_once} is not defined
-near the definition of \codeCFA{do_twice} the following code will work.
-\begin{lstlisting}
+As an example, even if no function named @do_once@ is defined
+near the definition of @do_twice@ the following code will work.
+\begin{cfa}
 int quadruple(int x) {
     void do_once(int & y) {
@@ -93,10 +96,10 @@
     return x;
 }
-\end{lstlisting}
+\end{cfa}
 This is not the recommended way to implement a quadruple function but it
-does work. The complier will deduce that \codeCFA{do_twice}'s T is an
+does work. The compiler will deduce that @do_twice@'s T is an
 integer from the argument. It will then look for a definition matching the
-assertion which is the \codeCFA{do_once} defined within the function. That
-function will be passed in as a function pointer to \codeCFA{do_twice} and
+assertion which is the @do_once@ defined within the function. That
+function will be passed in as a function pointer to @do_twice@ and
 called within it.
 
@@ -104,15 +107,15 @@
 traits which collect assertions into convenent packages that can then be used
 in assertion lists instead of all of their components.
-\begin{lstlisting}
+\begin{cfa}
 trait done_once(dtype T) {
     void do_once(T &);
 }
-\end{lstlisting}
+\end{cfa}
 
 After this the forall list in the previous example could instead be written
 with the trait instead of the assertion itself.
-\begin{lstlisting}
+\begin{cfa}
 forall(dtype T | done_once(T))
-\end{lstlisting}
+\end{cfa}
 
 Traits can have arbitrary number of assertions in them and are usually used to
@@ -124,5 +127,5 @@
 are now used in field declaractions instead of parameters and local variables.
 
-\begin{lstlisting}
+\begin{cfa}
 forall(dtype T)
 struct node {
@@ -130,7 +133,7 @@
     T * data;
 }
-\end{lstlisting}
-
-The \codeCFA{node(T)} is a use of a polymorphic structure. Polymorphic types
+\end{cfa}
+
+The @node(T)@ is a use of a polymorphic structure. Polymorphic types
 must be provided their polymorphic parameters.
 
@@ -140,8 +143,8 @@
 \section{Concurrency}
 
-\CFA has a number of concurrency features, \codeCFA{thread}s,
-\codeCFA{monitor}s and \codeCFA{mutex} parameters, \codeCFA{coroutine}s and
-\codeCFA{generator}s. The two features that interact with the exception system
-are \codeCFA{thread}s and \codeCFA{coroutine}s; they and their supporting
+\CFA has a number of concurrency features, @thread@s,
+@monitor@s and @mutex@ parameters, @coroutine@s and
+@generator@s. The two features that interact with the exception system
+are @thread@s and @coroutine@s; they and their supporting
 constructs will be described here.
 
@@ -154,16 +157,16 @@
 library.
 
-In \CFA coroutines are created using the \codeCFA{coroutine} keyword which
-works just like \codeCFA{struct} except that the created structure will be
-modified by the compiler to satify the \codeCFA{is_coroutine} trait.
+In \CFA coroutines are created using the @coroutine@ keyword which
+works just like @struct@ except that the created structure will be
+modified by the compiler to satisfy the @is_coroutine@ trait.
 
 These structures act as the interface between callers and the coroutine,
 the fields are used to pass information in and out. Here is a simple example
 where the single field is used to pass the next number in a sequence out.
-\begin{lstlisting}
+\begin{cfa}
 coroutine CountUp {
     unsigned int next;
 }
-\end{lstlisting}
+\end{cfa}
 
 The routine part of the coroutine is a main function for the coroutine. It
@@ -173,5 +176,5 @@
 function it continue from that same suspend statement instead of at the top
 of the function.
-\begin{lstlisting}
+\begin{cfa}
 void main(CountUp & this) {
     unsigned int next = 0;
@@ -182,5 +185,5 @@
     }
 }
-\end{lstlisting}
+\end{cfa}
 
 Control is passed to the coroutine with the resume function. This includes the
@@ -189,5 +192,5 @@
 return value is for easy access to communication variables. For example the
 next value from a count-up can be generated and collected in a single
-expression: \codeCFA{resume(count).next}.
+expression: @resume(count).next@.
 
 \subsection{Monitors and Mutex}
@@ -198,6 +201,6 @@
 parameters.
 
-Function parameters can have the \codeCFA{mutex} qualifiers on reference
-arguments, for example \codeCFA{void example(a_monitor & mutex arg);}. When the
+Function parameters can have the @mutex@ qualifiers on reference
+arguments, for example @void example(a_monitor & mutex arg);@. When the
 function is called it will acquire the lock on all of the mutex parameters.
 
@@ -214,5 +217,5 @@
 
 Threads are created like coroutines except the keyword is changed:
-\begin{lstlisting}
+\begin{cfa}
 thread StringWorker {
     const char * input;
@@ -225,5 +228,5 @@
     this.result = result;
 }
-\end{lstlisting}
+\end{cfa}
 The main function will start executing after the fork operation and continue
 executing until it is finished. If another thread joins with this one it will
@@ -233,5 +236,5 @@
 From the outside this is the creation and destruction of the thread object.
 Fork happens after the constructor is run and join happens before the
-destructor runs. Join also happens during the \codeCFA{join} function which
+destructor runs. Join also happens during the @join@ function which
 can be used to join a thread earlier. If it is used the destructor does not
 join as that has already been completed.
Index: doc/theses/andrew_beach_MMath/features.tex
===================================================================
--- doc/theses/andrew_beach_MMath/features.tex	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ doc/theses/andrew_beach_MMath/features.tex	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -54,8 +54,8 @@
 returns a reference to the virtual table instance. Defining this function
 also establishes the virtual type and virtual table pair to the resolver
-and promises that \codeCFA{exceptT} is a virtual type and a child of the
+and promises that @exceptT@ is a virtual type and a child of the
 base exception type.
 
-One odd thing about \codeCFA{get_exception_vtable} is that it should always
+One odd thing about @get_exception_vtable@ is that it should always
 be a constant function, returning the same value regardless of its argument.
 A pointer or reference to the virtual table instance could be used instead,
@@ -66,7 +66,7 @@
 
 Also note the use of the word ``promise" in the trait description. \CFA
-cannot currently check to see if either \codeCFA{exceptT} or
-\codeCFA{virtualT} match the layout requirements. Currently this is
-considered part of \codeCFA{get_exception_vtable}'s correct implementation.
+cannot currently check to see if either @exceptT@ or
+@virtualT@ match the layout requirements. Currently this is
+considered part of @get_exception_vtable@'s correct implementation.
 
 \begin{lstlisting}
@@ -92,6 +92,6 @@
 
 Finally there are three additional macros that can be used to refer to the
-these traits. They are \codeCFA{IS_EXCEPTION},
-\codeCFA{IS_TERMINATION_EXCEPTION} and \codeCFA{IS_RESUMPTION_EXCEPTION}.
+these traits. They are @IS_EXCEPTION@,
+@IS_TERMINATION_EXCEPTION@ and @IS_RESUMPTION_EXCEPTION@.
 Each takes the virtual type's name and, for polymorphic types only, the
 parenthesized list of polymorphic arguments. These do the name mangling to
@@ -113,5 +113,5 @@
 The expression must evaluate to a reference to a termination exception. A
 termination exception is any exception with a
-\codeCFA{void defaultTerminationHandler(T &);} (the default handler) defined
+@void defaultTerminationHandler(T &);@ (the default handler) defined
 on it. The handler is taken from the call sight with \CFA's trait system and
 passed into the exception system along with the exception itself.
@@ -169,5 +169,5 @@
 
 You can also re-throw the most recent termination exception with
-\codeCFA{throw;}. % This is terrible and you should never do it.
+@throw;@. % This is terrible and you should never do it.
 This can be done in a handler or any function that could be called from a
 handler.
@@ -193,5 +193,5 @@
 The result of EXPRESSION must be a resumption exception type. A resumption
 exception type is any type that satisfies the assertion
-\codeCFA{void defaultResumptionHandler(T &);} (the default handler). When the
+@void defaultResumptionHandler(T &);@ (the default handler). When the
 statement is executed the expression is evaluated and the result is thrown.
 
@@ -260,6 +260,6 @@
 \paragraph{Re-Throwing}
 
-You may also re-throw resumptions with a \codeCFA{throwResume;} statement.
-This can only be done from inside of a \codeCFA{catchResume} block.
+You may also re-throw resumptions with a @throwResume;@ statement.
+This can only be done from inside of a @catchResume@ block.
 
 Outside of any side effects of any code already run in the handler this will
@@ -269,5 +269,5 @@
 \section{Finally Clauses}
 
-A \codeCFA{finally} clause may be placed at the end of a try statement after
+A @finally@ clause may be placed at the end of a try statement after
 all the handler clauses. In the simply case, with no handlers, it looks like
 this:
@@ -294,6 +294,6 @@
 
 Because of this local control flow out of the finally block is forbidden.
-The compiler rejects uses of \codeCFA{break}, \codeCFA{continue},
-\codeCFA{fallthru} and \codeCFA{return} that would cause control to leave
+The compiler rejects uses of @break@, @continue@,
+@fallthru@ and @return@ that would cause control to leave
 the finally block. Other ways to leave the finally block - such as a long
 jump or termination - are much harder to check, at best requiring additional
@@ -307,5 +307,5 @@
 
 There is no special statement for starting a cancellation, instead you call
-the standard library function \codeCFA{cancel\_stack} which takes an exception.
+the standard library function @cancel_stack@ which takes an exception.
 Unlike in a throw this exception is not used in control flow but is just there
 to pass information about why the cancellation happened.
@@ -323,6 +323,6 @@
 
 \item Thread Stack:
-Thread stacks are those created \codeCFA{thread} or otherwise satisfy the
-\codeCFA{is\_thread} trait.
+Thread stacks are those created with @thread@ or otherwise satisfy the
+@is_thread@ trait.
 
 Threads only have two structural points of communication that must happen,
@@ -333,9 +333,9 @@
 and wait for another thread to join with it. The other thread, when it joins,
 checks for a cancellation. If so it will throw the resumption exception
-\codeCFA{ThreadCancelled}.
-
-There is a difference here in how explicate joins (with the \codeCFA{join}
+@ThreadCancelled@.
+
+There is a difference here in how explicate joins (with the @join@
 function) and implicate joins (from a destructor call). Explicate joins will
-take the default handler (\codeCFA{defaultResumptionHandler}) from the context
+take the default handler (@defaultResumptionHandler@) from the context
 and use like a regular through does if the exception is not caught. The
 implicate join does a program abort instead.
@@ -349,6 +349,6 @@
 
 \item Coroutine Stack:
-Coroutine stacks are those created with \codeCFA{coroutine} or otherwise
-satisfy the \codeCFA{is\_coroutine} trait.
+Coroutine stacks are those created with @coroutine@ or otherwise
+satisfy the @is_coroutine@ trait.
 
 A coroutine knows of two other coroutines, its starter and its last resumer.
@@ -356,8 +356,8 @@
 
 After the stack is unwound control goes to the last resumer.
-Resume will resume throw a \codeCFA{CoroutineCancelled} exception, which is
+Resume will resume throw a @CoroutineCancelled@ exception, which is
 polymorphic over the coroutine type and has a pointer to the coroutine being
 canceled and the canceling exception. The resume function also has an
+assertion requiring the @defaultResumptionHandler@ for the exception. So it
+assertion that the @defaultResumptionHandler@ for the exception. So it
 will use the default handler like a regular throw.
 
Index: doc/theses/andrew_beach_MMath/future.tex
===================================================================
--- doc/theses/andrew_beach_MMath/future.tex	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ doc/theses/andrew_beach_MMath/future.tex	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -20,6 +20,6 @@
 
 \section{Additional Throws}
-Several other kinds of throws, beyond the termination throw (\codeCFA{throw}),
-the resumption throw (\codeCFA{throwResume}) and the re-throws, were considered.
+Several other kinds of throws, beyond the termination throw (@throw@),
+the resumption throw (@throwResume@) and the re-throws, were considered.
 None were as useful as the core throws but they would likely be worth
 revising.
@@ -114,5 +114,5 @@
 is no reason not to allow it. It is however a small improvement; giving a bit
 of flexibility to the user in what style they want to use.
-\item Enabling local control flow (by \codeCFA{break}, \codeCFA{return} and
+\item Enabling local control flow (by @break@, @return@ and
 similar statements) out of a termination handler. The current set-up makes
 this very difficult but the catch function that runs the handler after it has
Index: doc/theses/andrew_beach_MMath/implement.tex
===================================================================
--- doc/theses/andrew_beach_MMath/implement.tex	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ doc/theses/andrew_beach_MMath/implement.tex	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -9,5 +9,5 @@
 
 All of this is accessed through a field inserted at the beginning of every
-virtual type. Currently it is called \codeC{virtual_table} but it is not
+virtual type. Currently it is called @virtual_table@ but it is not
 ment to be accessed by the user. This field is a pointer to the type's
 virtual table instance. It is assigned once during the object's construction
@@ -40,6 +40,6 @@
 using that to calculate the mangled name of the parent's virtual table type.
 There are two special fields that are included like normal fields but have
-special initialization rules: the \codeC{size} field is the type's size and is
-initialized with a sizeof expression, the \codeC{align} field is the type's
+special initialization rules: the @size@ field is the type's size and is
+initialized with a sizeof expression, the @align@ field is the type's
 alignment and uses an alignof expression. The remaining fields are resolved
 to a name matching the field's name and type using the normal visibility
@@ -56,5 +56,5 @@
 The declarations include the virtual type definition and forward declarations
 of the virtual table instance, constructor, message function and
-\codeCFA{get_exception_vtable}. The definition includes the storage and
+@get_exception_vtable@. The definition includes the storage and
 initialization of the virtual table instance and the bodies of the three
 functions.
@@ -65,9 +65,9 @@
 from the per-instance information. The virtual table type and most of the
 functions are polymorphic so they are all part of the core. The virtual table
-instance and the \codeCFA{get_exception_vtable} function.
-
-Coroutines and threads need instances of \codeCFA{CoroutineCancelled} and
-\codeCFA{ThreadCancelled} respectively to use all of their functionality.
-When a new data type is declared with \codeCFA{coroutine} or \codeCFA{thread}
+instance and the @get_exception_vtable@ function.
+
+Coroutines and threads need instances of @CoroutineCancelled@ and
+@ThreadCancelled@ respectively to use all of their functionality.
+When a new data type is declared with @coroutine@ or @thread@
 the forward declaration for the instance is created as well. The definition
 of the virtual table is created at the definition of the main function.
@@ -79,5 +79,5 @@
 function.
 
-The function is \codeC{__cfa__virtual_cast} and it is implemented in the
+The function is @__cfa__virtual_cast@ and it is implemented in the
 standard library. It takes a pointer to the target type's virtual table and
 the object pointer being cast. The function is very simple, getting the
@@ -87,5 +87,5 @@
 
 For the generated code a forward decaration of the virtual works as follows.
-There is a forward declaration of \codeC{__cfa__virtual_cast} in every cfa
+There is a forward declaration of @__cfa__virtual_cast@ in every cfa
 file so it can just be used. The object argument is the expression being cast
 so that is just placed in the argument list.
@@ -110,5 +110,5 @@
 often across functions.
 
-At a very basic level this can be done with \codeC{setjmp} \& \codeC{longjmp}
+At a very basic level this can be done with @setjmp@ \& @longjmp@
 which simply move the top of the stack, discarding everything on the stack
 above a certain point. However this ignores all the clean-up code that should
@@ -118,5 +118,5 @@
 both of these problems.
 
-Libunwind, provided in \texttt{unwind.h} on most platorms, is a C library
+Libunwind, provided in @unwind.h@ on most platforms, is a C library
 that provides \CPP style stack unwinding. Its operation is divided into two
 phases. The search phase -- phase 1 -- is used to scan the stack and decide
@@ -142,5 +142,5 @@
 
 GCC will generate an LSDA and attach its personality function with the
-\texttt{-fexceptions} flag. However this only handles the cleanup attribute.
+@-fexceptions@ flag. However this only handles the cleanup attribute.
 This attribute is used on a variable and specifies a function that should be
 run when the variable goes out of scope. The function is passed a pointer to
@@ -165,26 +165,26 @@
 messages for special cases (some of which should never be used by the
 personality function) and error codes but unless otherwise noted the
-personality function should always return \codeC{_URC_CONTINUE_UNWIND}.
-
-The \codeC{version} argument is the verson of the implementation that is
+personality function should always return @_URC_CONTINUE_UNWIND@.
+
+The @version@ argument is the version of the implementation that is
 calling the personality function. At this point it appears to always be 1 and
 it will likely stay that way until a new version of the API is updated.
 
-The \codeC{action} argument is set of flags that tell the personality
+The @action@ argument is a set of flags that tell the personality
 function when it is being called and what it must do on this invocation.
 The flags are as follows:
 \begin{itemize}
-\item\codeC{_UA_SEARCH_PHASE}: This flag is set whenever the personality
+\item@_UA_SEARCH_PHASE@: This flag is set whenever the personality
 function is called during the search phase. The personality function should
 decide if unwinding will stop in this function or not. If it does then the
-personality function should return \codeC{_URC_HANDLER_FOUND}.
-\item\codeC{_UA_CLEANUP_PHASE}: This flag is set whenever the personality
+personality function should return @_URC_HANDLER_FOUND@.
+\item@_UA_CLEANUP_PHASE@: This flag is set whenever the personality
 function is called during the cleanup phase. If no other flags are set this
 means the entire frame will be unwound and all cleanup code should be run.
-\item\codeC{_UA_HANDLER_FRAME}: This flag is set during the cleanup phase
+\item@_UA_HANDLER_FRAME@: This flag is set during the cleanup phase
 on the function frame that found the handler. The personality function must
 prepare to return to normal code execution and return
-\codeC{_URC_INSTALL_CONTEXT}.
-\item\codeC{_UA_FORCE_UNWIND}: This flag is set if the personality function
+@_URC_INSTALL_CONTEXT@.
+\item@_UA_FORCE_UNWIND@: This flag is set if the personality function
 is called through a forced unwind call. Forced unwind only performs the
 cleanup phase and uses a different means to decide when to stop. See its
@@ -192,8 +192,8 @@
 \end{itemize}
 
-The \codeC{exception_class} argument is a copy of the \codeC{exception}'s
-\codeC{exception_class} field.
-
-The \codeC{exception} argument is a pointer to the user provided storage
+The @exception_class@ argument is a copy of the @exception@'s
+@exception_class@ field.
+
+The @exception@ argument is a pointer to the user provided storage
 object. It has two public fields, the exception class which is actually just
 a number that identifies the exception handling mechanism that created it and
@@ -201,5 +201,5 @@
 exception needs to 
 
-The \codeC{context} argument is a pointer to an opaque type. This is passed
+The @context@ argument is a pointer to an opaque type. This is passed
 to the many helper functions that can be called inside the personality
 function.
@@ -218,5 +218,5 @@
 functions traversing the stack new-to-old until a function finds a handler or
 the end of the stack is reached. In the latter case raise exception will
-return with \codeC{_URC_END_OF_STACK}.
+return with @_URC_END_OF_STACK@.
 
 Once a handler has been found raise exception continues onto the the cleanup
@@ -227,5 +227,5 @@
 
 If an error is encountered raise exception will return either
-\codeC{_URC_FATAL_PHASE1_ERROR} or \codeC{_URC_FATAL_PHASE2_ERROR} depending
+@_URC_FATAL_PHASE1_ERROR@ or @_URC_FATAL_PHASE2_ERROR@ depending
 on when the error occured.
 
@@ -259,13 +259,13 @@
 been unwound.
 
-Each time it is called the stop function should return \codeC{_URC_NO_REASON}
+Each time it is called the stop function should return @_URC_NO_REASON@
 or transfer control directly to other code outside of libunwind. The
 framework does not provide any assistance here.
 
 Its arguments are the same as the paired personality function.
-The actions \codeC{_UA_CLEANUP_PHASE} and \codeC{_UA_FORCE_UNWIND} are always
+The actions @_UA_CLEANUP_PHASE@ and @_UA_FORCE_UNWIND@ are always
 set when it is called. By the official standard that is all but both GCC and
 Clang add an extra action on the last call at the end of the stack:
-\codeC{_UA_END_OF_STACK}.
+@_UA_END_OF_STACK@.
 
 \section{Exception Context}
@@ -280,23 +280,23 @@
 Each stack has its own exception context. In a purely sequental program, using
 only core Cforall, there is only one stack and the context is global. However
-if the library \texttt{libcfathread} is linked then there can be multiple
+if the library @libcfathread@ is linked then there can be multiple
 stacks so they will each need their own.
 
 To handle this code always gets the exception context from the function
-\codeC{this_exception_context}. The main exception handling code is in
-\texttt{libcfa} and that library also defines the function as a weak symbol
-so it acts as a default. Meanwhile in \texttt{libcfathread} the function is
+@this_exception_context@. The main exception handling code is in
+@libcfa@ and that library also defines the function as a weak symbol
+so it acts as a default. Meanwhile in @libcfathread@ the function is
 defined as a strong symbol that replaces it when the libraries are linked
 together.
 
-The version of the function defined in \texttt{libcfa} is very simple. It
+The version of the function defined in @libcfa@ is very simple. It
 returns a pointer to a global static variable. With only one stack this
 global instance is associated with the only stack.
 
-The version of the function defined in \texttt{libcfathread} has to handle
+The version of the function defined in @libcfathread@ has to handle
 more as there are multiple stacks. The exception context is included as
 part of the per-stack data stored as part of coroutines. In the cold data
 section, stored at the base of each stack, is the exception context for that
-stack. The \codeC{this_exception_context} uses the concurrency library to get
+stack. The @this_exception_context@ uses the concurrency library to get
 the current coroutine and through it the cold data section and the exception
 context.
@@ -323,5 +323,5 @@
 to store the exception. Macros with pointer arthritic and type cast are
 used to move between the components or go from the embedded
-\codeC{_Unwind_Exception} to the entire node.
+@_Unwind_Exception@ to the entire node.
 
 All of these nodes are strung together in a linked list. One linked list per
@@ -347,5 +347,5 @@
 C which is what the \CFA compiler outputs so a work-around is used.
 
-This work around is a function called \codeC{__cfaehm_try_terminate} in the
+This work-around is a function called @__cfaehm_try_terminate@ in the
 standard library. The contents of a try block and the termination handlers
 are converted into functions. These are then passed to the try terminate
@@ -385,6 +385,6 @@
 
 These nested functions and all other functions besides
-\codeC{__cfaehm_try_terminate} in \CFA use the GCC personality function and
-the \texttt{-fexceptions} flag to generate the LSDA. This allows destructors
+@__cfaehm_try_terminate@ in \CFA use the GCC personality function and
+the @-fexceptions@ flag to generate the LSDA. This allows destructors
 to be implemented with the cleanup attribute.
 
@@ -401,9 +401,25 @@
 
 The handler function does both the matching and catching. It tries each
-the condition of \codeCFA{catchResume} in order, top-to-bottom and until it
+of the conditions of @catchResume@ in order, top-to-bottom, until it
 finds a handler that matches. If no handler matches then the function returns
 false. Otherwise the matching handler is run, if it completes successfully
-the function returns true. Rethrows, through the \codeCFA{throwResume;}
+the function returns true. Rethrows, through the @throwResume;@
 statement, cause the function to return true.
+
+% Recursive Resumption Stuff:
+Blocking out part of the stack is accomplished by updating the front of the
+list as the search continues. Before the handler at a node is called the head
+of the list is updated to the next node of the current node. After the search
+is complete, successful or not, the head of the list is reset.
+
+This means the current handler and every handler that has already been
+checked are not on the list while a handler is run. If a resumption is thrown
+during the handling of another resumption the active handlers and all the
+other handlers checked up to this point will not be checked again.
+
+This structure also supports new handlers added while the resumption is being
+handled. These are added to the front of the list, pointing back along the
+stack -- the first one will point over all the checked handlers -- and the
+ordering is maintained.
 
 \subsection{Libunwind Compatibility}
@@ -438,5 +454,5 @@
 
 Cancellation also uses libunwind to do its stack traversal and unwinding,
-however it uses a different primary function \codeC{_Unwind_ForcedUnwind}.
+however it uses a different primary function @_Unwind_ForcedUnwind@.
 Details of its interface can be found in the unwind section.
 
Index: doc/theses/andrew_beach_MMath/unwinding.tex
===================================================================
--- doc/theses/andrew_beach_MMath/unwinding.tex	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ doc/theses/andrew_beach_MMath/unwinding.tex	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -10,7 +10,7 @@
 Even this is fairly simple if nothing needs to happen when the stack unwinds.
 Traditional C can unwind the stack by saving and restoring state (with
-\codeC{setjmp} \& \codeC{longjmp}). However many languages define actions that
+@setjmp@ \& @longjmp@). However many languages define actions that
 have to be taken when something is removed from the stack, such as running
-a variable's destructor or a \codeCFA{try} statement's \codeCFA{finally}
+a variable's destructor or a @try@ statement's @finally@
 clause. Handling this requires walking the stack going through each stack
 frame.
@@ -29,6 +29,6 @@
 
 \CFA uses two primary functions in libunwind to create most of its
-exceptional control-flow: \codeC{_Unwind_RaiseException} and
-\codeC{_Unwind_ForcedUnwind}.
+exceptional control-flow: @_Unwind_RaiseException@ and
+@_Unwind_ForcedUnwind@.
 Their operation is divided into two phases: search and clean-up. The search
 phase -- phase 1 -- is used to scan the stack but not unwinding it. The
@@ -44,25 +44,25 @@
 A personality function performs three tasks, although not all have to be
 present. The tasks performed are decided by the actions provided.
-\codeC{_Unwind_Action} is a bitmask of possible actions and an argument of
+@_Unwind_Action@ is a bitmask of possible actions and an argument of
 this type is passed into the personality function.
 \begin{itemize}
-\item\codeC{_UA_SEARCH_PHASE} is passed in search phase and tells the
+\item@_UA_SEARCH_PHASE@ is passed in search phase and tells the
 personality function to check for handlers. If there is a handler in this
 stack frame, as defined by the language, the personality function should
-return \codeC{_URC_HANDLER_FOUND}. Otherwise it should return
-\codeC{_URC_CONTINUE_UNWIND}.
-\item\codeC{_UA_CLEANUP_PHASE} is passed in during the clean-up phase and
+return @_URC_HANDLER_FOUND@. Otherwise it should return
+@_URC_CONTINUE_UNWIND@.
+\item@_UA_CLEANUP_PHASE@ is passed in during the clean-up phase and
 means part or all of the stack frame is removed. The personality function
 should do whatever clean-up the language defines
 (such as running destructors/finalizers) and then generally returns
-\codeC{_URC_CONTINUE_UNWIND}.
-\item\codeC{_UA_HANDLER_FRAME} means the personality function must install
+@_URC_CONTINUE_UNWIND@.
+\item@_UA_HANDLER_FRAME@ means the personality function must install
 a handler. It is also passed in during the clean-up phase and is in addition
 to the clean-up action. libunwind provides several helpers for the personality
 function here. Once it is done, the personality function must return
-\codeC{_URC_INSTALL_CONTEXT}.
+@_URC_INSTALL_CONTEXT@.
 \end{itemize}
 The personality function is given a number of other arguments. Some are for
-compatability and there is the \codeC{struct _Unwind_Context} pointer which
+compatibility and there is the @struct _Unwind_Context@ pointer which is
 passed to many helpers to get information about the current stack frame.
 
@@ -72,5 +72,5 @@
 raise-exception but with some extras.
 The first it passes in an extra action to the personality function on each
-stack frame, \codeC{_UA_FORCE_UNWIND}, which means a handler cannot be
+stack frame, @_UA_FORCE_UNWIND@, which means a handler cannot be
 installed.
 
@@ -83,8 +83,8 @@
 stack frames have been removed. By the standard API this is marked by setting
 the stack pointer inside the context passed to the stop function. However both
-GCC and Clang add an extra action for this case \codeC{_UA_END_OF_STACK}.
+GCC and Clang add an extra action for this case @_UA_END_OF_STACK@.
 
 Each time function the stop function is called it can do one or two things.
-When it is not the end of the stack it can return \codeC{_URC_NO_REASON} to
+When it is not the end of the stack it can return @_URC_NO_REASON@ to
 continue unwinding.
 % Is there a reason that NO_REASON is used instead of CONTINUE_UNWIND?
@@ -113,5 +113,5 @@
 
 The stop function is very simple. It checks the end of stack flag to see if
-it is finished unwinding. If so, it calls \codeC{exit} to end the process,
+it is finished unwinding. If so, it calls @exit@ to end the process,
 otherwise it returns with no-reason to continue unwinding.
 % Yeah, this is going to have to change.
@@ -128,18 +128,18 @@
 location of the instruction pointer and stack layout, which varies with
 compiler and optimization levels. So for frames where there are only
-destructors, GCC's attribute cleanup with the \texttt{-fexception} flag is
+destructors, GCC's attribute cleanup with the @-fexceptions@ flag is
 sufficient to handle unwinding.
 
 The only functions that require more than that are those that contain
-\codeCFA{try} statements. A \codeCFA{try} statement has a \codeCFA{try}
-clause, some number of \codeCFA{catch} clauses and \codeCFA{catchResume}
-clauses and may have a \codeCFA{finally} clause. Of these only \codeCFA{try}
-statements with \codeCFA{catch} clauses need to be transformed and only they
-and the \codeCFA{try} clause are involved.
+@try@ statements. A @try@ statement has a @try@
+clause, some number of @catch@ clauses and @catchResume@
+clauses and may have a @finally@ clause. Of these only @try@
+statements with @catch@ clauses need to be transformed and only they
+and the @try@ clause are involved.
 
-The \codeCFA{try} statement is converted into a series of closures which can
+The @try@ statement is converted into a series of closures which can
 access other parts of the function according to scoping rules but can be
-passed around. The \codeCFA{try} clause is converted into the try functions,
-almost entirely unchanged. The \codeCFA{catch} clauses are converted into two
+passed around. The @try@ clause is converted into the try functions,
+almost entirely unchanged. The @catch@ clauses are converted into two
 functions; the match function and the catch function.
 
@@ -153,5 +153,5 @@
 runs the handler's body.
 
-These three functions are passed to \codeC{try_terminate}. This is an
+These three functions are passed to @try_terminate@. This is an
 % Maybe I shouldn't quote that, it isn't its actual name.
 internal hand-written function that has its own personality function and
@@ -167,7 +167,7 @@
 handler was found in this frame. If it was then the personality function
 installs the handler, which is setting the instruction pointer in
-\codeC{try_terminate} to an otherwise unused section that calls the catch
+@try_terminate@ to an otherwise unused section that calls the catch
 function, passing it the current exception and handler index.
-\codeC{try_terminate} returns as soon as the catch function returns.
+@try_terminate@ returns as soon as the catch function returns.
 
 At this point control has returned to normal control flow.
Index: doc/theses/andrew_beach_MMath/uw-ethesis-frontpgs.tex
===================================================================
--- doc/theses/andrew_beach_MMath/uw-ethesis-frontpgs.tex	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
+++ doc/theses/andrew_beach_MMath/uw-ethesis-frontpgs.tex	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -0,0 +1,173 @@
+% T I T L E   P A G E
+% -------------------
+% Last updated October 23, 2020, by Stephen Carr, IST-Client Services
+% The title page is counted as page `i' but we need to suppress the
+% page number. Also, we don't want any headers or footers.
+\pagestyle{empty}
+\pagenumbering{roman}
+
+% The contents of the title page are specified in the "titlepage"
+% environment.
+\begin{titlepage}
+        \begin{center}
+        \vspace*{1.0cm}
+
+        \Huge
+        {\bf Exception Handling in \CFA}
+
+        \vspace*{1.0cm}
+
+        \normalsize
+        by \\
+
+        \vspace*{1.0cm}
+
+        \Large
+        Andrew James Beach \\
+
+        \vspace*{3.0cm}
+
+        \normalsize
+        A thesis \\
+        presented to the University of Waterloo \\ 
+        in fulfillment of the \\
+        thesis requirement for the degree of \\
+        Master of Mathematics \\
+        in \\
+        Computer Science \\
+
+        \vspace*{2.0cm}
+
+        Waterloo, Ontario, Canada, \the\year \\
+
+        \vspace*{1.0cm}
+
+        \copyright\ Andrew James Beach \the\year \\
+        \end{center}
+\end{titlepage}
+
+% The rest of the front pages should contain no headers and be numbered using Roman numerals starting with `ii'
+\pagestyle{plain}
+\setcounter{page}{2}
+
+\cleardoublepage % Ends the current page and causes all figures and tables that have so far appeared in the input to be printed.
+% In a two-sided printing style, it also makes the next page a right-hand (odd-numbered) page, producing a blank page if necessary.
+
+\begin{comment} 
+% E X A M I N I N G   C O M M I T T E E (Required for Ph.D. theses only)
+% Remove or comment out the lines below to remove this page
+\begin{center}\textbf{Examining Committee Membership}\end{center}
+  \noindent
+The following served on the Examining Committee for this thesis. The decision of the Examining Committee is by majority vote.
+  \bigskip
+  
+  \noindent
+\begin{tabbing}
+Internal-External Member: \=  \kill % using longest text to define tab length
+External Examiner: \>  Bruce Bruce \\ 
+\> Professor, Dept. of Philosophy of Zoology, University of Wallamaloo \\
+\end{tabbing} 
+  \bigskip
+  
+  \noindent
+\begin{tabbing}
+Internal-External Member: \=  \kill % using longest text to define tab length
+Supervisor(s): \> Ann Elk \\
+\> Professor, Dept. of Zoology, University of Waterloo \\
+\> Andrea Anaconda \\
+\> Professor Emeritus, Dept. of Zoology, University of Waterloo \\
+\end{tabbing}
+  \bigskip
+  
+  \noindent
+  \begin{tabbing}
+Internal-External Member: \=  \kill % using longest text to define tab length
+Internal Member: \> Pamela Python \\
+\> Professor, Dept. of Zoology, University of Waterloo \\
+\end{tabbing}
+  \bigskip
+  
+  \noindent
+\begin{tabbing}
+Internal-External Member: \=  \kill % using longest text to define tab length
+Internal-External Member: \> Meta Meta \\
+\> Professor, Dept. of Philosophy, University of Waterloo \\
+\end{tabbing}
+  \bigskip
+  
+  \noindent
+\begin{tabbing}
+Internal-External Member: \=  \kill % using longest text to define tab length
+Other Member(s): \> Leeping Fang \\
+\> Professor, Dept. of Fine Art, University of Waterloo \\
+\end{tabbing}
+
+\cleardoublepage
+\end{comment}
+
+% D E C L A R A T I O N   P A G E
+% -------------------------------
+  % The following is a sample Declaration Page as provided by the GSO
+  % December 13th, 2006.  It is designed for an electronic thesis.
+ \begin{center}\textbf{Author's Declaration}\end{center}
+  
+ \noindent
+I hereby declare that I am the sole author of this thesis. This is a true copy of the thesis, including any required final revisions, as accepted by my examiners.
+
+  \bigskip
+  
+  \noindent
+I understand that my thesis may be made electronically available to the public.
+
+\cleardoublepage
+
+% A B S T R A C T
+% ---------------
+
+\begin{center}\textbf{Abstract}\end{center}
+
+This is the abstract.
+
+\cleardoublepage
+
+% A C K N O W L E D G E M E N T S
+% -------------------------------
+
+\begin{center}\textbf{Acknowledgements}\end{center}
+
+I would like to thank all the little people who made this thesis possible.
+\cleardoublepage
+
+\begin{comment}
+% D E D I C A T I O N
+% -------------------
+
+\begin{center}\textbf{Dedication}\end{center}
+
+This is dedicated to the one I love.
+\cleardoublepage
+\end{comment}
+
+% T A B L E   O F   C O N T E N T S
+% ---------------------------------
+\renewcommand\contentsname{Table of Contents}
+\tableofcontents
+\cleardoublepage
+\phantomsection    % allows hyperref to link to the correct page
+
+% L I S T   O F   F I G U R E S
+% -----------------------------
+\addcontentsline{toc}{chapter}{List of Figures}
+\listoffigures
+\cleardoublepage
+\phantomsection		% allows hyperref to link to the correct page
+
+% L I S T   O F   T A B L E S
+% ---------------------------
+\addcontentsline{toc}{chapter}{List of Tables}
+\listoftables
+\cleardoublepage
+\phantomsection		% allows hyperref to link to the correct page
+
+% Change page numbering back to Arabic numerals
+\pagenumbering{arabic}
Index: doc/theses/andrew_beach_MMath/uw-ethesis.bib
===================================================================
--- doc/theses/andrew_beach_MMath/uw-ethesis.bib	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
+++ doc/theses/andrew_beach_MMath/uw-ethesis.bib	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -0,0 +1,28 @@
+% Bibliography of key references for "LaTeX for Thesis and Large Documents"
+% For use with BibTeX
+
+@book{goossens.book,
+	author =	"Michel Goossens and Frank Mittelbach and 
+			 Alexander Samarin",
+	title =		"The \LaTeX\ Companion",
+	year = 		"1994",
+	publisher =	"Addison-Wesley",
+	address = 	"Reading, Massachusetts"
+}
+
+@book{knuth.book,
+        author =        "Donald Knuth",
+        title =         "The \TeX book",
+        year =          "1986",
+        publisher =     "Addison-Wesley",
+        address =       "Reading, Massachusetts"
+}
+
+@book{lamport.book,
+	author =        "Leslie Lamport",
+	title =         "\LaTeX\ --- A Document Preparation System",
+        edition =       "Second",
+	year = 		"1994",
+	publisher = 	"Addison-Wesley",
+	address =       "Reading, Massachusetts"
+}
Index: doc/theses/andrew_beach_MMath/uw-ethesis.tex
===================================================================
--- doc/theses/andrew_beach_MMath/uw-ethesis.tex	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
+++ doc/theses/andrew_beach_MMath/uw-ethesis.tex	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -0,0 +1,240 @@
+%======================================================================
+% University of Waterloo Thesis Template for LaTeX 
+% Last Updated November, 2020 
+% by Stephen Carr, IST Client Services, 
+% University of Waterloo, 200 University Ave. W., Waterloo, Ontario, Canada
+% FOR ASSISTANCE, please send mail to request@uwaterloo.ca
+
+% DISCLAIMER
+% To the best of our knowledge, this template satisfies the current uWaterloo thesis requirements.
+% However, it is your responsibility to assure that you have met all requirements of the University and your particular department.
+
+% Many thanks for the feedback from many graduates who assisted the development of this template.
+% Also note that there are explanatory comments and tips throughout this template.
+%======================================================================
+% Some important notes on using this template and making it your own...
+
+% The University of Waterloo has required electronic thesis submission since October 2006. 
+% See the uWaterloo thesis regulations at
+% https://uwaterloo.ca/graduate-studies/thesis.
+% This thesis template is geared towards generating a PDF version optimized for viewing on an electronic display, including hyperlinks within the PDF.
+
+% DON'T FORGET TO ADD YOUR OWN NAME AND TITLE in the "hyperref" package configuration below. 
+% THIS INFORMATION GETS EMBEDDED IN THE FINAL PDF DOCUMENT.
+% You can view the information if you view properties of the PDF document.
+
+% Many faculties/departments also require one or more printed copies. 
+% This template attempts to satisfy both types of output. 
+% See additional notes below.
+% It is based on the standard "book" document class which provides all necessary sectioning structures and allows multi-part theses.
+
+% If you are using this template in Overleaf (cloud-based collaboration service), then it is automatically processed and previewed for you as you edit.
+
+% For people who prefer to install their own LaTeX distributions on their own computers, and process the source files manually, the following notes provide the sequence of tasks:
+ 
+% E.g. to process a thesis called "mythesis.tex" based on this template, run:
+
+% pdflatex mythesis	-- first pass of the pdflatex processor
+% bibtex mythesis	-- generates bibliography from .bib data file(s)
+% makeindex         -- should be run only if an index is used 
+% pdflatex mythesis	-- fixes numbering in cross-references, bibliographic references, glossaries, index, etc.
+% pdflatex mythesis	-- it takes a couple of passes to completely process all cross-references
+
+% If you use the recommended LaTeX editor, Texmaker, you would open the mythesis.tex file, then click the PDFLaTeX button. Then run BibTeX (under the Tools menu).
+% Then click the PDFLaTeX button two more times. 
+% If you have an index as well, you'll need to run MakeIndex from the Tools menu as well, before running pdflatex
+% the last two times.
+
+% N.B. The "pdftex" program allows graphics in the following formats to be included with the "\includegraphics" command: PNG, PDF, JPEG, TIFF
+% Tip: Generate your figures and photos in the size you want them to appear in your thesis, rather than scaling them with \includegraphics options.
+% Tip: Any drawings you do should be in scalable vector graphic formats: SVG, PNG, WMF, EPS and then converted to PNG or PDF, so they are scalable in the final PDF as well.
+% Tip: Photographs should be cropped and compressed so as not to be too large.
+
+% To create a PDF output that is optimized for double-sided printing: 
+% 1) comment-out the \documentclass statement in the preamble below, and un-comment the second \documentclass line.
+% 2) change the value assigned below to the boolean variable "PrintVersion" from "false" to "true".
+
+%======================================================================
+%   D O C U M E N T   P R E A M B L E
+% Specify the document class, default style attributes, and page dimensions, etc.
+% For hyperlinked PDF, suitable for viewing on a computer, use this:
+\documentclass[letterpaper,12pt,titlepage,oneside,final]{book}
+
+% For PDF, suitable for double-sided printing, change the PrintVersion variable below to "true" and use this \documentclass line instead of the one above:
+%\documentclass[letterpaper,12pt,titlepage,openright,twoside,final]{book}
+
+% Some LaTeX commands I define for my own nomenclature.
+% If you have to, it's easier to make changes to nomenclature once here than in a million places throughout your thesis!
+\newcommand{\package}[1]{\textbf{#1}} % package names in bold text
+\newcommand{\cmmd}[1]{\textbackslash\texttt{#1}} % command name in tt font 
+\newcommand{\href}[1]{#1} % does nothing, but defines the command so the print-optimized version will ignore \href tags (redefined by hyperref pkg).
+%\newcommand{\texorpdfstring}[2]{#1} % does nothing, but defines the command
+% Anything defined here may be redefined by packages added below...
+
+% This package allows if-then-else control structures.
+\usepackage{ifthen}
+\newboolean{PrintVersion}
+\setboolean{PrintVersion}{false}
+% CHANGE THIS VALUE TO "true" as necessary, to improve printed results for hard copies by overriding some options of the hyperref package, called below.
+
+%\usepackage{nomencl} % For a nomenclature (optional; available from ctan.org)
+\usepackage{amsmath,amssymb,amstext} % Lots of math symbols and environments
+\usepackage[pdftex]{graphicx} % For including graphics N.B. pdftex graphics driver 
+
+% Hyperlinks make it very easy to navigate an electronic document.
+% In addition, this is where you should specify the thesis title and author as they appear in the properties of the PDF document.
+% Use the "hyperref" package
+% N.B. HYPERREF MUST BE THE LAST PACKAGE LOADED; ADD ADDITIONAL PKGS ABOVE
+\usepackage[pdftex,pagebackref=true]{hyperref} % with basic options
+%\usepackage[pdftex,pagebackref=true]{hyperref}
+		% N.B. pagebackref=true provides links back from the References to the body text. This can cause trouble for printing.
+\hypersetup{
+    plainpages=false,       % needed if Roman numbers in frontpages
+    unicode=false,          % non-Latin characters in Acrobat’s bookmarks
+    pdftoolbar=true,        % show Acrobat’s toolbar?
+    pdfmenubar=true,        % show Acrobat’s menu?
+    pdffitwindow=false,     % window fit to page when opened
+    pdfstartview={FitH},    % fits the width of the page to the window
+%    pdftitle={uWaterloo\ LaTeX\ Thesis\ Template},    % title: CHANGE THIS TEXT!
+%    pdfauthor={Author},    % author: CHANGE THIS TEXT! and uncomment this line
+%    pdfsubject={Subject},  % subject: CHANGE THIS TEXT! and uncomment this line
+%    pdfkeywords={keyword1} {key2} {key3}, % list of keywords, and uncomment this line if desired
+    pdfnewwindow=true,      % links in new window
+    colorlinks=true,        % false: boxed links; true: colored links
+    linkcolor=blue,         % color of internal links
+    citecolor=green,        % color of links to bibliography
+    filecolor=magenta,      % color of file links
+    urlcolor=cyan           % color of external links
+}
+\ifthenelse{\boolean{PrintVersion}}{   % for improved print quality, change some hyperref options
+\hypersetup{	% override some previously defined hyperref options
+%    colorlinks,%
+    citecolor=black,%
+    filecolor=black,%
+    linkcolor=black,%
+    urlcolor=black}
+}{} % end of ifthenelse (no else)
+
+\usepackage[automake,toc,abbreviations]{glossaries-extra} % Exception to the rule of hyperref being the last add-on package
+% If glossaries-extra is not in your LaTeX distribution, get it from CTAN (http://ctan.org/pkg/glossaries-extra),
+% although it's supposed to be in both the TeX Live and MikTeX distributions. There are also documentation and 
+% installation instructions there.
+
+% Setting up the page margins...
+\setlength{\textheight}{9in}\setlength{\topmargin}{-0.45in}\setlength{\headsep}{0.25in}
+% uWaterloo thesis requirements specify a minimum of 1 inch (72pt) margin at the
+% top, bottom, and outside page edges and a 1.125 in. (81pt) gutter margin (on binding side). 
+% While this is not an issue for electronic viewing, a PDF may be printed, and so we have the same page layout for both printed and electronic versions, we leave the gutter margin in.
+% Set margins to minimum permitted by uWaterloo thesis regulations:
+\setlength{\marginparwidth}{0pt} % width of margin notes
+% N.B. If margin notes are used, you must adjust \textwidth, \marginparwidth
+% and \marginparsep so that the space left between the margin notes and page
+% edge is less than 15 mm (0.6 in.)
+\setlength{\marginparsep}{0pt} % width of space between body text and margin notes
+\setlength{\evensidemargin}{0.125in} % Adds 1/8 in. to binding side of all
+% even-numbered pages when the "twoside" printing option is selected
+\setlength{\oddsidemargin}{0.125in} % Adds 1/8 in. to the left of all pages when "oneside" printing is selected, and to the left of all odd-numbered pages when "twoside" printing is selected
+\setlength{\textwidth}{6.375in} % assuming US letter paper (8.5 in. x 11 in.) and side margins as above
+\raggedbottom
+
+% The following statement specifies the amount of space between paragraphs. Other reasonable specifications are \bigskipamount and \smallskipamount.
+\setlength{\parskip}{\medskipamount}
+
+% The following statement controls the line spacing.  
+% The default spacing corresponds to good typographic conventions and only slight changes (e.g., perhaps "1.2"), if any, should be made.
+\renewcommand{\baselinestretch}{1} % this is the default line space setting
+
+% By default, each chapter will start on a recto (right-hand side) page.
+% We also force each section of the front pages to start on a recto page by inserting \cleardoublepage commands.
+% In many cases, this will require that the verso (left-hand) page be blank, and while it should be counted, a page number should not be printed.
+% The following statements ensure a page number is not printed on an otherwise blank verso page.
+\let\origdoublepage\cleardoublepage
+\newcommand{\clearemptydoublepage}{%
+  \clearpage{\pagestyle{empty}\origdoublepage}}
+\let\cleardoublepage\clearemptydoublepage
+
+% Define Glossary terms (This is properly done here, in the preamble and could also be \input{} from a separate file...)
+\input{glossaries}
+\makeglossaries
+
+\usepackage{comment}
+% cfa macros used in the document
+%\usepackage{cfalab}
+\input{common}
+\CFAStyle						% CFA code-style for all languages
+\lstset{basicstyle=\linespread{0.9}\tt}
+
+%======================================================================
+%   L O G I C A L    D O C U M E N T
+% The logical document contains the main content of your thesis.
+% Being a large document, it is a good idea to divide your thesis into several files, each one containing one chapter or other significant chunk of content, so you can easily shuffle things around later if desired.
+%======================================================================
+\begin{document}
+
+%----------------------------------------------------------------------
+% FRONT MATERIAL
+% title page,declaration, borrowers' page, abstract, acknowledgements,
+% dedication, table of contents, list of tables, list of figures, nomenclature, etc.
+%----------------------------------------------------------------------
+\input{uw-ethesis-frontpgs} 
+
+%----------------------------------------------------------------------
+% MAIN BODY
+% We suggest using a separate file for each chapter of your thesis.
+% Start each chapter file with the \chapter command.
+% Only use \documentclass or \begin{document} and \end{document} commands in this master document.
+% Tip: Putting each sentence on a new line is a way to simplify later editing.
+%----------------------------------------------------------------------
+\input{existing}
+\input{features}
+\input{unwinding}
+\input{future}
+
+%----------------------------------------------------------------------
+% END MATERIAL
+% Bibliography, Appendices, Index, etc.
+%----------------------------------------------------------------------
+
+% Bibliography
+
+% The following statement selects the style to use for references.  
+% It controls the sort order of the entries in the bibliography and also the formatting for the in-text labels.
+\bibliographystyle{plain}
+% This specifies the location of the file containing the bibliographic information.  
+% It assumes you're using BibTeX to manage your references (if not, why not?).
+\cleardoublepage % This is needed if the "book" document class is used, to place the anchor in the correct page, because the bibliography will start on its own page.
+% Use \clearpage instead if the document class uses the "oneside" argument
+\phantomsection  % With hyperref package, enables hyperlinking from the table of contents to bibliography             
+% The following statement causes the title "References" to be used for the bibliography section:
+\renewcommand*{\bibname}{References}
+
+% Add the References to the Table of Contents
+\addcontentsline{toc}{chapter}{\textbf{References}}
+
+\bibliography{uw-ethesis,pl}
+% Tip: You can create multiple .bib files to organize your references. 
+% Just list them all in the \bibliography command, separated by commas (no spaces).
+
+% The following statement causes the specified references to be added to the bibliography even if they were not cited in the text. 
+% The asterisk is a wildcard that causes all entries in the bibliographic database to be included (optional).
+% \nocite{*}
+%----------------------------------------------------------------------
+
+% Appendices
+
+% The \appendix statement indicates the beginning of the appendices.
+\appendix
+% Add an un-numbered title page before the appendices and a line in the Table of Contents
+% \chapter*{APPENDICES}
+% \addcontentsline{toc}{chapter}{APPENDICES}
+% Appendices are just more chapters, with different labeling (letters instead of numbers).
+% \input{appendix-matlab_plots.tex}
+
+% GLOSSARIES (Lists of definitions, abbreviations, symbols, etc. provided by the glossaries-extra package)
+% -----------------------------
+\printglossaries
+\cleardoublepage
+\phantomsection		% allows hyperref to link to the correct page
+
+%----------------------------------------------------------------------
+\end{document} % end of logical document
Index: doc/theses/fangren_yu_COOP_F20/Report.tex
===================================================================
--- doc/theses/fangren_yu_COOP_F20/Report.tex	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ doc/theses/fangren_yu_COOP_F20/Report.tex	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -17,5 +17,5 @@
 \usepackage[usenames]{color}
 \input{common}                                          % common CFA document macros
-\usepackage[dvips,plainpages=false,pdfpagelabels,pdfpagemode=UseNone,colorlinks=true,pagebackref=true,linkcolor=blue,citecolor=blue,urlcolor=blue,pagebackref=true,breaklinks=true]{hyperref}
+\usepackage[dvips,plainpages=false,pdfpagelabels,pdfpagemode=UseNone,colorlinks=true,linkcolor=blue,citecolor=blue,urlcolor=blue,pagebackref=true,breaklinks=true]{hyperref}
 \usepackage{breakurl}
 \urlstyle{sf}
@@ -76,134 +76,143 @@
 \renewcommand{\subsectionmark}[1]{\markboth{\thesubsection\quad #1}{\thesubsection\quad #1}}
 \pagenumbering{roman}
-\linenumbers                                            % comment out to turn off line numbering
+%\linenumbers                                            % comment out to turn off line numbering
 
 \maketitle
 \pdfbookmark[1]{Contents}{section}
-\tableofcontents
-
-\clearpage
+
 \thispagestyle{plain}
 \pagenumbering{arabic}
 
 \begin{abstract}
-
-\CFA is an evolutionary extension to the C programming language, featuring a parametric type system, and is currently under active development. The reference compiler for \CFA language, @cfa-cc@, has some of its major components dated back to early 2000s, and is based on inefficient data structures and algorithms. Some improvements targeting the expression resolution algorithm, suggested by a recent prototype experiment on a simplified model, are implemented in @cfa-cc@ to support the full \CFA language. These optimizations speed up the compiler significantly by a factor of 20 across the existing \CFA codebase, bringing the compilation time of a mid-sized \CFA source file down to 10-second level. A few cases derived from realistic code examples that causes trouble to the compiler are analyzed in detail, with proposed solutions. This step of \CFA project development is critical to its eventual goal to be used alongside C for large software systems.
-
+\CFA is an evolutionary, non-object-oriented extension of the C programming language, featuring a parametric type-system, and is currently under active development. The reference compiler for the \CFA language, @cfa-cc@, has some of its major components dated back to the early 2000s, which are based on inefficient data structures and algorithms. This report introduces improvements targeting the expression resolution algorithm, suggested by a recent prototype experiment on a simplified model, which are implemented in @cfa-cc@ to support the full \CFA language. These optimizations speed up the compiler by a factor of 20 across the existing \CFA codebase, bringing the compilation time of a mid-sized \CFA source file down to the 10-second level. A few problem cases derived from realistic code examples are analyzed in detail, with proposed solutions. This work is a critical step in the \CFA project development to achieve its eventual goal of being used alongside C for large software systems.
 \end{abstract}
 
+\clearpage
+\section*{Acknowledgements}
+\begin{sloppypar}
+I would like to thank everyone in the \CFA team for their contribution towards this project. Programming language design and development is a tough subject and requires a lot of teamwork. Without the collaborative efforts from the team, this project could not have been a success. Specifically, I would like to thank Andrew Beach for introducing me to the \CFA codebase, Thierry Delisle for maintaining the test and build automation framework, Michael Brooks for providing example programs of various experimental language and type system features, and most importantly, Professor Martin Karsten for recommending me to the \CFA team, and my supervisor, Professor Peter Buhr for encouraging me to explore deeply into intricate compiler algorithms. Finally, I gratefully acknowledge the help from Aaron Moss, a former graduate of the team and the author of the preceding thesis work, who participated in the \CFA team's virtual conferences and email correspondence, and provided many critical arguments and suggestions. 2020 was an unusually challenging year for everyone, yet we managed to keep a steady pace.
+\end{sloppypar}
+
+\clearpage
+\tableofcontents
+
+\clearpage
 \section{Introduction}
 
-\CFA language, developed by the Programming Language Group at University of Waterloo, has a long history, with the first proof-of-concept compiler built in 2003 by Richard Bilson~\cite{Bilson03}. Many new features are added to the language over time, but the core of \CFA, parametric functions introduced by the @forall@ clause (hence the name of the language), with the type system supporting parametric overloading, remains mostly unchanged. 
-
-The current \CFA reference compiler @cfa-cc@ still includes many parts taken directly from the original Bilson's implementation, and serves as a starting point for the enhancement work to the type system. Unfortunately, it does not provide the efficiency required for the language to be used practically: a \CFA source file of approximately 1000 lines of code can take a few minutes to compile. The cause of the problem is that the old compiler used inefficient data structures and algorithms for expression resolution, which involved a lot of copying and redundant work. 
-
-This paper presents a series of optimizations to the performance-critical parts of the resolver, with a major rework of the data structure used by the compiler, using a functional programming approach to reduce memory complexity. Subsequent improvements are mostly suggested by running the compiler builds with a performance profiler against the \CFA standard library source code and a test suite to find the most underperforming components in the compiler algorithm.
-
-The \CFA team endorses a pragmatic philosophy in work that mostly focuses on practical implications of language design and implementation, rather than the theoretical limits. In particular, the compiler is designed to work on production \CFA code efficiently and keep type safety, while sometimes making compromises to expressiveness in extreme corner cases. However, when these corner cases do appear in actual usage, they need to be thoroughly investigated. Analysis presented in this paper, therefore, are conducted on a case-by-case basis. Some of them eventually point to certain weaknesses in the language design and solutions are proposed based on experimental results.
-
-\section{Completed work}
+The \CFA language, developed by the Programming Language Group at the University of Waterloo, has a long history, with the initial language design in 1992 by Glen Ditchfield~\cite{Ditchfield92} and the first proof-of-concept compiler built in 2003 by Richard Bilson~\cite{Bilson03}. Many new features have been added to the language over time, but the core of \CFA's type-system --- parametric functions introduced by the @forall@ clause (hence the name of the language) providing parametric overloading --- remains mostly unchanged.
+
+The current \CFA reference compiler, @cfa-cc@, is designed using the visitor pattern~\cite{vistorpattern} over an abstract syntax tree (AST), where multiple passes over the AST modify it for subsequent passes. @cfa-cc@ still includes many parts taken directly from the original Bilson implementation, which served as the starting point for this enhancement work to the type system. Unfortunately, the prior implementation did not provide the efficiency required for the language to be practical: a \CFA source file of approximately 1000 lines of code can take multiple minutes to compile. The cause of the problem is that the old compiler used inefficient data structures and algorithms for expression resolution, which involved significant copying and redundant work.
+
+This report presents a series of optimizations to the performance-critical parts of the resolver, with a major rework of the compiler data-structures using a functional-programming approach to reduce memory complexity. The improvements were suggested by running the compiler builds with a performance profiler against the \CFA standard-library source-code and a test suite to find the most underperforming components in the compiler algorithm.
+
+The \CFA team endorses a pragmatic philosophy that focuses on practical implications of language design and implementation rather than theoretical limits. In particular, the compiler is designed to be expressive with respect to code reuse while maintaining type safety, but compromises theoretical soundness in extreme corner cases. However, when these corner cases do appear in actual usage, they need to be thoroughly investigated. A case-by-case analysis is presented for several of these corner cases, some of which point to certain weaknesses in the language design with solutions proposed based on experimental results.
+
+\section{AST restructuring}
 
 \subsection{Memory model with sharing}
 
-A major rework of the abstract syntax tree (AST) data structure in the compiler is completed as the first step of the project. The majority of work were documented in the reference manual of the compiler~\cite{cfa-cc}. To summarize:
-\begin{itemize}
-\item
-AST nodes (and therefore subtrees) can be shared without copying when reused.
-\item
-Modifications apply the functional programming principle, making copies for local changes without affecting the original data shared by other owners. In-place mutations are permitted as a special case when sharing does not happen. The logic is implemented by reference counting.
-\item
-Memory allocation and freeing are performed automatically using smart pointers.
-\end{itemize}
-The resolver algorithm designed for overload resolution naturally introduces a significant amount of reused intermediate representations, especially in the following two places:
-\begin{itemize}
-\item
-Function overload candidates are computed by combining the argument candidates bottom-up, with many of them being a common term. For example, if $n$ overloads of a function @f@ all take an integer for the first parameter but different types for the second (@f( int, int )@, @f( int, double )@, etc.) the first term is reused $n$ times for each of the generated candidate expressions. This effect is particularly bad for deep expression trees. 
-\item
-In the unification algorithm and candidate elimination step, actual types are obtained by substituting the type parameters by their bindings. Let $n$ be the complexity (\ie number of nodes in representation) of the original type, $m$ be the complexity of bound type for parameters, and $k$ be the number of occurrences of type parameters in the original type. If everything needs to be deep-copied, the substitution step takes $O(n+mk)$ time and memory, while using shared nodes it is reduced to $O(n)$ time and $O(k)$ memory.
-\end{itemize}
-One of the worst examples for the old compiler is a long chain of I/O operations
-\begin{cfa}
-sout | 1 | 2 | 3 | 4 | ...
-\end{cfa}
-The pipe operator is overloaded by \CFA I/O library for every primitive type in C language, as well as I/O manipulators defined by the library. In total there are around 50 overloads for the output stream operation. On resolving the $n$-th pipe operator in the sequence, the first term, which is the result of sub-expression containing $n-1$ pipe operators, is reused to resolve every overload. Therefore at least $O(n^2)$ copies of expression nodes are made during resolution, not even counting type unification cost; combined with two large factors from number of overloads of pipe operators, and that the ``output stream type'' in \CFA is a trait with 27 assertions (which adds to complexity of the pipe operator's type) this makes compiling a long output sequence extremely slow. In new AST representation only $O(n)$ copies are required and type of pipe operator is not copied at all.
-
-Reduction in space complexity is especially important, as preliminary profiling result on the old compiler build shows that over half of time spent in expression resolution are on memory allocations. 
- 
+A major rework of the AST data-structure in the compiler was completed as the first step of the project. The majority of this work is documented in my prior report documenting the compiler reference-manual~\cite{cfa-cc}. To summarize:
+\begin{itemize}
+\item
+AST nodes (and therefore subtrees) can be shared without copying.
+\item
+Modifications are performed using functional-programming principles, making copies for local changes without affecting the original data shared by other owners. In-place mutations are permitted as a special case when there is no sharing. The logic is implemented by reference counting.
+\item
+Memory allocation and freeing are performed automatically using smart pointers~\cite{smartpointers}.
+\end{itemize}
+
+The resolver algorithm, designed for overload resolution, uses a significant amount of reuse, and hence copying, of the intermediate representations, especially in the following two places:
+\begin{itemize}
+\item
+Function overload candidates are computed by combining the argument candidates bottom-up, with many being a common term. For example, if $n$ overloads of a function @f@ all take an integer for the first parameter but different types for the second, \eg @f( int, int )@, @f( int, double )@, etc., the first term is copied $n$ times for each of the generated candidate expressions. This copying is particularly bad for deep expression trees.
+\item
+In the unification algorithm and candidate elimination step, actual types are obtained by substituting the type parameters by their bindings. Let $n$ be the complexity (\ie number of nodes in representation) of the original type, $m$ be the complexity of the bound type for parameters, and $k$ be the number of occurrences of type parameters in the original type. If every substitution needs to be deep-copied, this copy step takes $O(n+mk)$ time and memory, while using shared nodes it is reduced to $O(n)$ time and $O(k)$ memory.
+\end{itemize}
+One of the worst examples for the old compiler is a long chain of I/O operations:
+\begin{cfa}
+sout | 1 | 2 | 3 | 4 | ...;   // print integer constants
+\end{cfa}
+The pipe operator is overloaded by the \CFA I/O library for every primitive type in the C language, as well as I/O manipulators defined by the library. In total, there are around 50 overloads for the output stream operation. On resolving the $n$-th pipe operator in the sequence, the first term, which is the result of the sub-expression containing $n-1$ pipe operators, is reused to resolve every overload. Therefore at least $O(n^2)$ copies of expression nodes are made during resolution, not even counting type unification cost; combined with the two large factors from the number of overloads of pipe operators, and that the ``output stream type'' in \CFA is a trait with 27 assertions (which adds to the complexity of the pipe operator's type) this makes compiling a long output sequence extremely slow. In the new AST representation, only $O(n)$ copies are required and the type of the pipe operator is not copied at all.
+Reduction in space complexity is especially important, as preliminary profiling results on the old compiler build showed over half of the time spent in expression resolution is on memory allocations.
+
+Since the compiler codebase is large and the new memory model mostly benefits expression resolution, some of the old data structures are still kept, and a conversion pass happens before and after the general resolve phase. Rewriting every compiler module will take longer, and whether the new model is correct was unknown when this project started, therefore only the resolver is currently implemented with the new data structure.
+
 
 \subsection{Merged resolver calls}
 
-The pre-resolve phase of compilation, inadequately called ``validate'' in the compiler source code, does more than just simple syntax validation, as it also normalizes input program. Some of them, however, requires type information on expressions and therefore needs to call the resolver before the general resolve phase. There are three notable places where the resolver is invoked:
-\begin{itemize}
-\item
-Attempt to generate default constructor, copy constructor and destructor for user-defined @struct@ types
-\item
-Resolve @with@ statements (the same as in Python, which introduces fields of a structure directly in scope)
+The pre-resolve phase of compilation, inappropriately called ``validate'' in the compiler source code, has a number of passes that do more than simple syntax and semantic validation; some passes also normalize the input program. A few of these passes require type information for expressions, and therefore, need to call the resolver before the general resolve phase. There are three notable places where the resolver is invoked:
+\begin{itemize}
+\item
+Generate default constructor, copy constructor and destructor for user-defined @struct@ types.
+\item
+Resolve @with@ statements (the same as in Pascal~\cite{pascal}), which introduce fields of a structure directly into a scope.
 \item
 Resolve @typeof@ expressions (cf. @decltype@ in \CC); note that this step may depend on symbols introduced by @with@ statements.
 \end{itemize}
-Since the compiler codebase is large and the new memory model mostly only benefits expression resolution, the old data structure is still kept, and a conversion pass happens before and after resolve phase. Rewriting every compiler module will take a long time, and whether the new model is correct is still unknown when started, therefore only the resolver is implemented with the new data structure. 
-
-Since the constructor calls were one of the most expensive to resolve (reason will be shown in the next section), pre-resolve phase were taking more time after resolver moves to the more efficient new implementation. To better facilitate the new resolver, every step that requires type information are reintegrated as part of resolver.
-
-A by-product of this work is that the reversed dependence of @with@ statement and @typeof@ can now be handled. Previously, the compiler is unable to handle cases such as
+
+Since the constructor calls are one of the most expensive to resolve (reason given in~\VRef{s:SpecialFunctionLookup}), this pre-resolve phase was taking a large amount of time even after the resolver was changed to the more efficient new implementation. The problem is that multiple resolutions repeat a significant amount of work. Therefore, to better facilitate the new resolver, every step that requires type information should be integrated as part of the general resolver phase.
+
+A by-product of this work is that reversed dependence between @with@ statement and @typeof@ can now be handled. Previously, the compiler was unable to handle cases such as:
 \begin{cfa}
 struct S { int x; };
 S foo();
 typeof( foo() ) s; // type is S
-with (s) { 
+with (s) {
 	x; // refers to s.x
 }
 \end{cfa}
-since type of @s@ is still unresolved when handling @with@ expressions. Instead, the new (and correct) approach is to evaluate @typeof@ expressions when the declaration is first seen, and it suffices because of the declaration-before-use rule.
+since the type of @s@ is unresolved when handling @with@ expressions because the @with@ pass follows the @typeof@ pass (interchanging passes only interchanges the problem). Instead, the new (and correct) approach is to evaluate @typeof@ expressions when the declaration is first seen during resolution, and it suffices because of the declaration-before-use rule.
 
 
 \subsection{Special function lookup}
-
-Reducing the number of functions looked up for overload resolution is an effective way to gain performance when there are many overloads but most of them are trivially wrong. In practice, most functions have few (if any) overloads but there are notable exceptions. Most importantly, constructor @?{}@, destructor @^?{}@, and assignment @?=?@ are generated for every user-defined type, and in a large source file there can be hundreds of them. Furthermore, many calls to them are generated for initializing variables and passing arguments. This fact makes them the most overloaded and most called functions.
-
-In an object-oriented programming language, object has methods declared with their types, so a call such as @obj.f()@ only needs to perform lookup in the method table corresponding to type of @obj@. \CFA on the other hand, does not have methods, and all types are open (\ie new operations can be defined on them), so a similar approach will not work in general. However, the ``big 3'' operators have a unique property enforced by the language rules, such that the first parameter must have a reference type. Since \CFA does not have class inheritance, reference type must always match exactly. Therefore, argument-dependent lookup can be implemented for these operators, by using a dedicated symbol table.
-
-The lookup key used for the special functions is the mangled type name of the first parameter, which acts as the @this@ parameter in an object-oriented language. To handle generic types, the type parameters are stripped off, and only the base type is matched. Note that a constructor (destructor, assignment operator) taking arbitrary @this@ argument, for example @forall( dtype T ) void ?{}( T & );@ is not allowed, and it guarantees that if the @this@ type is known, all possible overloads can be found by searching with the given type. In case that the @this@ argument itself is overloaded, it is resolved first and all possible result types are used for lookup.
-
-Note that for the generated expressions, the particular variable for @this@ argument is fully known, without overloads, so the majority of constructor call resolutions only need to check for one given object type. Explicit constructor calls and assignment statements sometimes may require lookup for multiple types. In the extremely rare case that type of @this@ argument is yet unbound, everything will have to be checked, just like without the argument-dependent lookup algorithm; fortunately, this case almost never happens in practice. An example is found in the library function @new@:
+\label{s:SpecialFunctionLookup}
+
+Reducing the number of function lookups for overload resolution is an effective way to gain performance when there are many overloads but most of them are trivially wrong. In practice, most functions have few (if any) overloads but there are notable exceptions. Most importantly, constructor @?{}@, destructor @^?{}@, and assignment @?=?@ are generated for every user-defined type (@struct@ and @union@ in C), and in a large source file there can be hundreds of them. Furthermore, many calls are generated for initializing variables, passing arguments and copying values. This fact makes them the most overloaded and most called functions.
+
+In an object-oriented programming language, the object-method types are scoped within a class, so a call such as @obj.f()@ only needs to perform lookup in the method table corresponding to the type of @obj@. \CFA, on the other hand, does not have methods, and all types are open, \ie new operations can be defined on them without inheritance; at best a \CFA type can be constrained by a translation unit. However, the ``big 3'' operators have a unique property enforced by the language rules: the first parameter must be a reference to its associated type, which acts as the @this@ parameter in an object-oriented language. Since \CFA does not have class inheritance, the reference type must always match exactly. Therefore, argument-dependent lookup can be implemented for these operators by using a dedicated, fast symbol-table.
+
+The lookup key for the special functions is the mangled type name of the first parameter. To handle generic types, the type parameters are stripped off, and only the base type is matched. Note a constructor (destructor, assignment operator) may not take an arbitrary @this@ argument, \eg @forall( dtype T ) void ?{}( T & )@, thus guaranteeing that if the @this@ type is known, all possible overloads can be found by searching with this given type. In the case where the @this@ argument itself is overloaded, it is resolved first and all possible result types are used for lookup.
+
+Note that for a generated expression, the particular variable for the @this@ argument is fully known, without overloads, so the majority of constructor-call resolutions only need to check for one given object type. Explicit constructor calls and assignment statements sometimes require lookup for multiple types. In the extremely rare case that the @this@-argument type is unbound, all necessary types are guaranteed to be checked, as for the previous lookup without the argument-dependent lookup; fortunately, this complex case almost never happens in practice. An example is found in the library function @new@:
 \begin{cfa}
 forall( dtype T | sized( T ), ttype TT | { void ?{}( T &, TT ); } )
 T * new( TT p ) { return &(*malloc()){ p }; }
 \end{cfa}
-as @malloc@ may return a pointer to any type, depending on context. 
-
-Interestingly, this particular line of code actually caused another complicated issue, where the unusually massive work of checking every constructor in presence makes the case even worse. Section~\ref{s:TtypeResolutionInfiniteRecursion} presents a detailed analysis for the problem.
-
-The ``callable'' operator @?()@ (cf. @operator()@ in \CC) could also be included in the special operator list, as it is usually only on user-defined types, and the restriction that first argument must be a reference seems reasonable in this case. 
+as @malloc@ may return a pointer to any type, depending on context.
+
+Interestingly, this particular declaration actually causes another complicated issue, making the complex checking of every constructor even worse. \VRef[Section]{s:TtypeResolutionInfiniteRecursion} presents a detailed analysis of this problem.
+
+The ``callable'' operator @?()@ (cf. @operator()@ in \CC) can also be included in this special operator list, as it is usually only on user-defined types, and the restriction that the first argument must be a reference seems reasonable in this case.
 
 
 \subsection{Improvement of function type representation}
 
-Since substituting type parameters with their bound types is one fundamental operation in many parts of resolver algorithm (particularly unification and environment binding), making as few copies of type nodes as possible helps reducing memory complexity. Even with the new memory management model, allocation is still a significant factor of resolver performance. Conceptually, operations on type nodes of AST should be performed in functional programming style, treating the data structure as immutable and only copy when necessary. The in-place mutation is a mere optimization that does not change logic of operations.
-The model was broken on function types by an inappropriate design. Function types require some special treatment due to the existence of assertions. In particular, it must be able to distinguish two different kinds of type parameter usage:
+Since substituting type parameters with their bound types is one fundamental operation in many parts of the resolver algorithm (particularly unification and environment binding), making as few copies of type nodes as possible helps reduce memory complexity. Even with the new memory management model, allocation is still a significant factor of resolver performance. Conceptually, operations on type nodes of the AST should be performed in functional-programming style, treating the data structure as immutable and only copying when necessary. The in-place mutation is a mere optimization that does not change the logic for operations.
+
+However, the model was broken for function types by an inappropriate design. Function types require special treatment due to the existence of assertions that constrain the types it supports. Specifically, it must be possible to distinguish two different kinds of type parameter usage:
 \begin{cfa}
 forall( dtype T ) void foo( T * t ) {
-	forall( dtype U ) void bar( T * t, U * u ) { ... }
-}
-\end{cfa}
-Here, only @U@ is a free parameter in declaration of @bar@, as it appears in the function's own forall clause; while @T@ is not free. 
-
-Moreover, the resolution algorithm also has to distinguish type bindings of multiple calls to the same function, for example with
+	forall( dtype U ) void bar( @T@ * t, @U@ * u ) { ... }
+}
+\end{cfa}
+Here, only @U@ is a free parameter in the nested declaration of function @bar@, as @T@ must be bound at the call site when resolving @bar@.
+
+Moreover, the resolution algorithm also has to distinguish type bindings of multiple calls to the same function, \eg:
 \begin{cfa}
 forall( dtype T ) int foo( T x );
-foo( foo( 1.0 ) );
-\end{cfa}
-The inner call has binding (T: double) while the outer call has binding (T: int). Therefore a unique representation of free parameters in each expression is required. This was previously done by creating a copy of the parameter declarations inside function type, and fixing references afterwards. However, fixing references is an inherently deep operation that does not work well with functional programming model, as it must be evaluated eagerly on the entire syntax tree representing the function type.
-
-The revised approach generates a unique ID value for each function call expression instance and represents an occurrence of free parameter type with a pair of generated ID and the original parameter declaration, so that references do not need to be fixed, and a shallow copy of function type is possible. 
-
-Note that after the change, all declaration nodes in syntax tree representation maps one-to-one with the actual declarations in the program, and therefore are guaranteed to be unique. Such property can potentially enable more optimizations, and some related ideas are presented after Section~\ref{s:SharedSub-ExpressionCaseUniqueExpressions}.
+int i = foo( foo( 1.0 ) );
+\end{cfa}
+The inner call has binding (T: double) while the outer call has binding (T: int). Therefore a unique representation for the free parameters is required in each expression. This type binding was previously done by creating a copy of the parameter declarations inside the function type and fixing references afterwards. However, fixing references is an inherently deep operation that does not work well with the functional-programming style, as it forces eager evaluation on the entire syntax tree representing the function type.
+
+The revised approach generates a unique ID value for each function call expression instance and represents an occurrence of a free-parameter type with a pair of generated ID and original parameter declaration, so references are unique and a shallow copy of the function type is possible.
+
+Note that after the change, all declaration nodes in the syntax-tree representation now map one-to-one with the actual declarations in the program, and therefore are guaranteed to be unique. This property can potentially enable more optimizations, and some related ideas are presented at the end of \VRef{s:SharedSub-ExpressionCaseUniqueExpressions}.
 
 
 \subsection{Improvement of pruning steps}
 
-A minor improvement for candidate elimination is to skip the step on the function overloads themselves and only perform on results of function application. As function calls are usually by name, the name resolution rule dictates that every function candidate necessarily has a different type; indirect function calls are rare, and when they do appear, they usually will not have many possible interpretations, and those rarely matches exactly in argument type. Since function types have a much more complex representation than data types (with multiple parameters and assertions), checking equality on them also takes longer. 
-
-A brief test of this approach shows that the number of function overloads considered in expression resolution increases by a negligible amount of less than 1 percent, while type comparisons in candidate elimination are cut by more than half. Improvement is consistent over all \CFA source files in the test suite.
+A minor improvement for candidate elimination is to skip the step on the function overloads and only check the results of function application. As function calls are usually by name (versus pointers to functions), the name resolution rule dictates that every function candidate necessarily has a different type; indirect function calls are rare, and when they do appear, there are even fewer cases with multiple interpretations, and these rarely match exactly in argument type. Since function types have a much more complex representation (with multiple parameters and assertions) than data types, checking equality on them also takes longer.
+
+A brief test of this approach shows that the number of function overloads considered in expression resolution increases by an amount of less than 1 percent, while type comparisons in candidate elimination are reduced by more than half. This improvement is consistent over all \CFA source files in the test suite.
 
 
@@ -211,5 +220,5 @@
 \label{s:SharedSub-ExpressionCaseUniqueExpressions}
 
-Unique expression denotes an expression that must be evaluated only once, to prevent unwanted side effects. It is currently only a compiler artifact, generated on tuple member expression of the form
+Unique expression denotes an expression evaluated only once to prevent unwanted side effects. It is currently only a compiler artifact, generated for a tuple-member expression of the form:
 \begin{cfa}
 struct S { int a; int b; };
@@ -217,8 +226,8 @@
 s.[a, b]; // tuple member expression, type is [int, int]
 \end{cfa}
-If the aggregate expression contains function calls, it cannot be evaluated multiple times:
+If the aggregate expression is a function call, it cannot be evaluated multiple times:
 \begin{cfa}
 S makeS();
-makeS().[a, b]; // this should only make one S
+makeS().[a, b]; // this should only generate a unique S
 \end{cfa}
 Before code generation, the above expression is internally represented as
@@ -237,20 +246,25 @@
 \end{cfa}
 at code generation, where @_unique_var@ and @_unique_var_evaluated@ are generated variables whose scope covers all appearances of the same expression.
-
-Note that although the unique expression is only used for tuple expansion now, it is a generally useful construction, and can be seen in other languages, such as Scala's @lazy val@~\cite{Scala}; therefore it could be worthwhile to introduce the unique expression to a broader context in \CFA and even make it directly available to programmers.
-
-In the compiler's visitor pattern, however, this creates a problem where multiple paths to a logically unique expression exist, so it may be modified more than once and become ill-formed; some specific intervention is required to ensure that unique expressions are only visited once. Furthermore, a unique expression appearing in more than one places will be copied on mutation so its representation is no longer unique. Some hacks are required to keep it in sync, and the methods are different when mutating the unique expression instance itself or its underlying expression.
-
-Example when mutating the underlying expression (visit-once guard)
+The conditional check ensures a single call to @makeS()@ even though there are logically multiple calls because of the tuple field expansion.
+
+Note that although the unique expression is only used for tuple expansion now, it is a generally useful construction, and is seen in other programming languages, such as Scala's @lazy val@~\cite{Scala}; therefore it may be worthwhile to introduce the unique expression to a broader context in \CFA and even make it directly available to programmers.
+
+In the compiler's visitor pattern, however, this creates a problem where multiple paths to a logically unique expression exist, so it may be modified more than once and become ill-formed; some specific intervention is required to ensure unique expressions are only visited once. Furthermore, a unique expression appearing in more than one place is copied on mutation so its representation is no longer unique.
+
+Currently, special cases are required to keep everything synchronized, and the methods are different when mutating the unique expression instance itself or its underlying expression:
+\begin{itemize}
+\item
+When mutating the underlying expression (visit-once guard)
 \begin{cfa}
 void InsertImplicitCalls::previsit( const ast::UniqueExpr * unqExpr ) {
-	if ( visitedIds.count( unqExpr->id ) ) visit_children = false;
+	@if ( visitedIds.count( unqExpr->id ) ) visit_children = false;@
 	else visitedIds.insert( unqExpr->id );
 }
 \end{cfa}
-Example when mutating the unique instance itself, which actually creates copies
+\item
+When mutating the unique instance itself, which actually creates copies
 \begin{cfa}
 auto mutExpr = mutate( unqExpr ); // internally calls copy when shared
-if ( ! unqMap.count( unqExpr->id ) ) {
+@if ( ! unqMap.count( unqExpr->id ) ) {@
 	...
 } else {
@@ -259,14 +273,15 @@
 }
 \end{cfa}
-Such workaround seems difficult to be fit into a common visitor template. This suggests the memory model may need different kinds of nodes to accurately represent the syntax tree.
-
-Together with the fact that declaration nodes are always unique, it is possible that AST nodes can be classified by three different types:
-\begin{itemize}
-\item
-\textbf{Strictly unique} with only one owner (declarations);
-\item
-\textbf{Logically unique} with (possibly) many owners but should not be copied (unique expression example presented here);
-\item
-\textbf{Shared} by functional programming model, which assume immutable data structure and are copied on mutation.
+\end{itemize}
+Such workarounds are difficult to fit into the common visitor pattern, which suggests the memory model may need different kinds of nodes to accurately represent this feature in the AST.
+
+Given that declaration nodes are unique, it is possible for AST nodes to be divided into three different types:
+\begin{itemize}
+\item
+\textbf{Singleton} with only one owner (declarations);
+\item
+\textbf{No-copy} with multiple owners but cannot be copied (unique expression example presented here);
+\item
+\textbf{Copy} by functional-programming style, which assumes immutable data structures that are copied on mutation.
 \end{itemize}
 The boilerplate code can potentially handle these three cases differently.
@@ -275,5 +290,5 @@
 \section{Analysis of resolver algorithm complexity}
 
-The focus of this chapter is to identify and analyze some realistic cases that cause resolver algorithm to have an exponential run time. As previous work has shown [3], the overload resolution problem in \CFA has worst-case exponential complexity; however, only few specific patterns can trigger the exponential complexity in practice. Implementing heuristic-based optimization for those selected cases is helpful to alleviate the problem.
+The focus of this section is to identify and analyze some realistic cases that cause the resolver algorithm to have an exponential runtime. As previous work has shown~\cite[\S~4.2.1]{Moss19}, the overload resolution problem in \CFA has worst-case exponential complexity; however, only a few specific patterns can trigger the exponential complexity in practice. Implementing heuristic-based optimization for those selected cases is helpful to alleviate the problem.
 
 
@@ -281,19 +296,20 @@
 \label{s:UnboundReturnType}
 
-The interaction of return type overloading and polymorphic functions creates this problem of function calls with unbound return type, and is further complicated by the presence of assertions.
+The interaction of return-type overloading and polymorphic functions creates function calls with an unbound return-type, and is further complicated by the presence of assertions.
 The prime example of a function with unbound return type is the type-safe version of C @malloc@:
 \begin{cfa}
-// size deduced from type, so no need to provide the size argument
-forall( dtype T | sized( T ) ) T * malloc( void ); 
-\end{cfa}
-Unbound return type can be problematic in resolver algorithm complexity because a single match of function call with unbound return type may create multiple candidates. In the worst case, consider a function declared to return any @otype@:
+forall( dtype T | sized( T ) )
+T * malloc( void ) { return (T *)malloc( sizeof(T) ); } // call C malloc
+int * i = malloc();  // type deduced from left-hand side $\Rightarrow$ no size argument or return cast
+\end{cfa}
+An unbound return-type is problematic in resolver complexity because a single match of a function call with an unbound return type may create multiple candidates. In the worst case, consider a function declared to return any @otype@ (defined \VPageref{otype}):
 \begin{cfa}
 forall( otype T ) T anyObj( void );
 \end{cfa}
-As the resolver attempts to satisfy the otype constraint on @T@, a single call to @anyObj()@ without the result type known creates at least as many candidates as the number of complete types currently in scope; with generic types it becomes even worse, for example, assuming a declaration of generic pair is available at that point:
+As the resolver attempts to satisfy the otype constraint on @T@, a call to @anyObj()@ in an expression, without the result type known, creates at least as many candidates as the number of complete types currently in scope; with generic types it becomes even worse, \eg assuming a declaration of a generic @pair@ is available at that point:
 \begin{cfa}
 forall( otype T, otype U ) struct pair { T first; U second; };
 \end{cfa}
-Then an @anyObj()@ call can result in arbitrarily complex types, such as @pair( pair( int,int ), pair( int,int ) )@, and the depth can grow indefinitely until the specified parameter depth limit, thus creating exponentially many candidates. However, the expected types allowed by parent expressions are practically very few, so most of those interpretations are invalid; if the result type is never bound up to top level, by the semantic rules it is ambiguous if there are more than one valid bindings, and resolution can fail fast. It is therefore reasonable to delay resolving assertions on an unbound parameter in return type; however, with the current cost model, such behavior may further cause irregularities in candidate selection, such that the presence of assertions can change the preferred candidate, even when order of expression costs are supposed to stay the same. Detailed analysis of this issue will be presented later, in the correctness part.
+Then an @anyObj()@ call can result in arbitrarily complex types, such as @pair( pair( int, int ), pair( int, int ) )@, and the depth can grow indefinitely until a specified parameter-depth limit, thus creating exponentially many candidates. However, the expected types allowed by parent expressions are practically very few, so most of those interpretations are invalid; if the result type is never bound up to the top level, by the semantic rules it is ambiguous if there is more than one valid binding and resolution fails quickly. It is therefore reasonable to delay resolving assertions on an unbound parameter in a return type; however, with the current cost model, such behavior may further cause irregularities in candidate selection, such that the presence of assertions can change the preferred candidate, even when the order of expression costs is supposed to stay the same. A detailed analysis of this issue is presented in \VRef{s:AnalysisTypeSystemCorrectness}.
 
 
@@ -301,5 +317,5 @@
 \label{s:TtypeResolutionInfiniteRecursion}
 
-@ttype@ (``tuple type'') is a relatively new addition to the language that attempts to provide type-safe variadic argument semantics. Unlike regular @dtype@ parameters, @ttype@ is only valid in function parameter list, and may only appear once as the type of last parameter. At the call site, a @ttype@ parameter is bound to the tuple type of all remaining function call arguments.
+@ttype@ (``tuple type'') is a relatively new addition to the language that attempts to provide type-safe variadic argument semantics. Unlike regular @dtype@ parameters, @ttype@ is only valid in a function parameter-list, and may only appear once as the last parameter type. At the call site, a @ttype@ parameter is bound to the tuple type of all remaining function-call arguments.
 
 There are two kinds of idiomatic @ttype@ usage: one is to provide flexible argument forwarding, similar to the variadic template in \CC (\lstinline[language=C++]|template<typename... args>|), as shown below in the implementation of @unique_ptr@
@@ -309,54 +325,46 @@
 	T * data;
 };
-forall( dtype T | sized( T ), ttype Args | { void ?{}( T &, Args ); })
-void ?{}( unique_ptr( T ) & this, Args args ) {
-	this.data = new( args );
-}
-\end{cfa}
-the other is to implement structural recursion in the first-rest manner:
-\begin{cfa}
-forall( otype T, ttype Params | { void process( T ); void func( Params ); })
+forall( dtype T | sized( T ), @ttype Args@ | { void ?{}( T &, Args ); })
+void ?{}( unique_ptr( T ) & this, Args @args@ ) {
+	this.data = new( @args@ );  // forward constructor arguments to dynamic allocator
+}
+\end{cfa}
+The other usage is to implement structural recursion in the first-rest pattern:
+\begin{cfa}
+forall( otype T, @ttype Params@ | { void process( T ); void func( Params ); })
 void func( T arg1, Params p ) {
 	process( arg1 );
-	func( p );
-}
-\end{cfa}
-For the second use case, it is important that the number of parameters in the recursive call go down, since the call site must deduce all assertion candidates, and that is only possible if by just looking at argument types (and not their values), the recursion is known to be completed in a finite number of steps.
-
-In recent experiments, however, some flaw in the type binding rules can lead to the first kind of @ttype@ use case produce an invalid candidate that the resolver enters an infinite loop.
-
-This bug was discovered in an attempt to raise assertion recursive depth limit and one of the library program takes exponentially longer time to compile. The cause of the problem is identified to be the following set of functions.
-File @memory.cfa@ contains
-\begin{cfa}
-#include "memory.hfa"
-#include "stdlib.hfa"
-\end{cfa}
-where file @memory.hfa@ contains the @unique_ptr@ declaration above, and two other similar functions with @ttype@ parameter:
-\begin{cfa}
-forall( dtype T | sized( T ), ttype Args | { void ?{}( T &, Args ); }) {
+	func( @p@ );  // recursive call until base case of one argument
+}
+\end{cfa}
+For the second use case, it is imperative the number of parameters in the recursive call goes down, since the call site must deduce all assertion candidates, and that is only possible if by observation of the argument types (and not their values), the recursion is known to be completed in a finite number of steps.
+
+In recent experiments, however, a flaw in the type-binding rules can lead to the first kind of @ttype@ use case producing an invalid candidate, causing the resolver to enter an infinite loop.
+This bug was discovered in an attempt to raise the assertion recursive-depth limit and one of the library programs took exponentially longer to compile. The cause of the problem is the following set of functions:
+\begin{cfa}
+// unique_ptr  declaration from above
+
+forall( dtype T | sized( T ), ttype Args | { void ?{}( T &, Args ); } ) { // distribute forall clause
 	void ?{}( counter_data( T ) & this, Args args );
 	void ?{}( counter_ptr( T ) & this, Args args );
 	void ?{}( unique_ptr( T ) & this, Args args );
 }
-\end{cfa}
-File @stdlib.hfa@ contains
-\begin{cfa}
+
 forall( dtype T | sized( T ), ttype TT | { void ?{}( T &, TT ); } )
-T * new( TT p ) { return &(*malloc()){ p }; }
-\end{cfa}
-
-In the expression @(*malloc()){p}@, the type of object being constructed is yet unknown, since the return type information is not immediately provided. That caused every constructor to be searched, and while normally a bound @ttype@ cannot be unified with any free parameter, it is possible with another free @ttype@. Therefore in addition to the correct option provided by assertion, 3 wrong options are examined, each of which again requires the same assertion, for an unknown base type T and @ttype@ arguments, and that becomes an infinite loop, until the specified recursion limit and resolution is forced to fail. Moreover, during the recursion steps, number of candidates grows exponentially, since there are always 3 options at each step.
-
-Unfortunately, @ttype@ to @ttype@ binding is necessary, to allow calling the function provided by assertion indirectly.
-\begin{cfa}
-forall( dtype T | sized( T ), ttype Args | { void ?{}( T &, Args ); })
-void ?{}( unique_ptr( T ) & this, Args args ) { this.data = (T * )new( args ); }
-\end{cfa}
-Here the constructor assertion is used for the @new( args )@ call.
+T * new( TT p ) { return @&(*malloc()){ p };@ }
+\end{cfa}
+In the expression @(*malloc()){p}@, the type of the object being constructed is unknown, since the return-type information is not immediately available. That causes every constructor to be searched, and while normally a bound @ttype@ cannot be unified with any free parameter, it is possible with another free @ttype@. Therefore, in addition to the correct option provided by the assertion, 3 wrong options are examined, each of which again requires the same assertion, for an unknown base-type @T@ and @ttype@ argument, which becomes an infinite loop until the specified recursion limit is reached and resolution fails. Moreover, during the recursion steps, the number of candidates grows exponentially, since there are always 3 options at each step.
+
+Unfortunately, @ttype@ to @ttype@ binding is necessary, to allow indirectly calling a function provided in an assertion.
+\begin{cfa}
+forall( dtype T | sized( T ), ttype Args | { @void ?{}( T &, Args );@ })
+void ?{}( unique_ptr( T ) & this, Args args ) { this.data = (T *)@new( args )@; } // constructor call
+\end{cfa}
+Here the constructor assertion is used by the @new( args )@ call to indirectly call the constructor on the allocated storage.
 Therefore, it is hard, perhaps impossible, to solve this problem by tweaking the type binding rules. An assertion caching algorithm can help improve this case by detecting cycles in recursion.
 
-Meanwhile, without the caching algorithm implemented, some changes in the \CFA source code are enough to eliminate this problem, at least in the current codebase. Note that the issue only happens with an overloaded variadic function, which rarely appears in practice, since the idiomatic use cases are for argument forwarding and self-recursion. The only overloaded @ttype@ function so far discovered in all of \CFA standard library code is the constructor, and by utilizing the argument-dependent lookup process described in Section~\ref{s:UnboundReturnType}, adding a cast before constructor call gets rid of the issue. 
-\begin{cfa}
-T * new( TT p ) { return &(*(T * )malloc()){ p }; }
+Meanwhile, without a caching algorithm implemented, some changes in the \CFA source code are enough to eliminate this problem, at least in the current codebase. Note that the issue only happens with an overloaded variadic function, which rarely appears in practice, since the idiomatic use cases are for argument forwarding and self-recursion. The only overloaded @ttype@ function so far discovered in all of the \CFA standard library is the constructor, and by utilizing the argument-dependent lookup process described in \VRef{s:UnboundReturnType}, adding a cast before the constructor call removes the issue.
+\begin{cfa}
+T * new( TT p ) { return &(*@(T * )@malloc()){ p }; }
 \end{cfa}
 
@@ -364,5 +372,5 @@
 \subsection{Reused assertions in nested generic type}
 
-The following test of deeply nested dynamic generic type reveals that locally caching reused assertions is necessary, rather than just a resolver optimization, because recomputing assertions can result in bloated generated code size:
+The following test of deeply nested, dynamic generic type reveals that locally caching reused assertions is necessary, rather than just a resolver optimization, because recomputing assertions can result in bloated generated code size:
 \begin{cfa}
 struct nil {};
@@ -372,20 +380,20 @@
 int main() {
 	#if   N==0
-	nil x;   
+	nil @x@;
 	#elif N==1
-	cons( size_t, nil ) x; 
+	cons( size_t, nil ) @x@;
 	#elif N==2
-	cons( size_t, cons( size_t, nil ) ) x;
+	cons( size_t, cons( size_t, nil ) ) @x@;
 	#elif N==3
-	cons( size_t, cons( size_t, cons( size_t, nil ) ) ) x;
+	cons( size_t, cons( size_t, cons( size_t, nil ) ) ) @x@;
 	// similarly for N=4,5,6
 	#endif
 }
 \end{cfa}
-At the declaration of @x@, it is implicitly initialized by generated constructor call, whose signature is given by
+At the declaration of @x@, it is implicitly initialized by a generated constructor call, with signature:
 \begin{cfa}
 forall( otype L, otype R ) void ?{}( cons( L, R ) & );
 \end{cfa}
-Note that the @otype@ constraint contains 4 assertions:
+where the @otype@ constraint contains the 4 assertions:\label{otype}
 \begin{cfa}
 void ?{}( L & ); // default constructor
@@ -394,8 +402,9 @@
 L & ?=?( L &, L & ); // assignment
 \end{cfa}
-Now since the right hand side of outermost cons is again a cons, recursive assertions are required. When the compiler cannot cache and reuse already resolved assertions, it becomes a problem, as each of those 4 pending assertions again asks for 4 more assertions one level below. Without any caching, number of resolved assertions grows exponentially, while that is obviously unnecessary since there are only $n+1$ different types involved. Even worse, this causes exponentially many wrapper functions generated later at the codegen step, and results in huge compiled binary.
-
-\begin{table}[h]
+
+\begin{table}[htb]
+\centering
 \caption{Compilation results of nested cons test}
+\label{t:NestedConsTest}
 \begin{tabular}{|r|r|r|}
 \hline
@@ -413,8 +422,8 @@
 \end{table}
 
-As the local functions are implemented by emitting executable code on the stack~\cite{gcc-nested-func}, it eventually means that compiled code also has exponential run time. This problem has evident practical implications, as nested collection types are frequently used in real production code.
-
+Now since the right-hand side of the outermost cons is again a cons, recursive assertions are required. \VRef[Table]{t:NestedConsTest} shows that when the compiler does not cache and reuse already resolved assertions, it becomes a problem, as each of these 4 pending assertions again asks for 4 more assertions one level below. Without caching, the number of resolved assertions grows exponentially, which is unnecessary since there are only $n+1$ different types involved. Even worse, this problem causes exponentially many wrapper functions to be generated at the backend, resulting in a huge binary. As the local functions are implemented by emitting executable code on the stack~\cite{gcc-nested-func}, it means that compiled code also has exponential run time. This problem has practical implications, as nested collection types are frequently used in real production code.
 
 \section{Analysis of type system correctness}
+\label{s:AnalysisTypeSystemCorrectness}
 
 In Moss' thesis~\cite[\S~4.1.2,~p.~45]{Moss19}, the author presents the following example:
@@ -433,10 +442,9 @@
 From the set of candidates whose parameter and argument types have been unified and whose assertions have been satisfied, those whose sub-expression interpretations have the smallest total cost of conversion are selected ... The total cost of conversion for each of these candidates is then calculated based on the implicit conversions and polymorphism involved in adapting the types of the sub-expression interpretations to the formal parameter types.
 \end{quote}
-With this model, the algorithm picks @g1@ in resolving the @f( g( 42 ) )@ call, which seems to be undesirable. 
-
-There are further evidence that shows the Bilson model is fundamentally incorrect, following the discussion of unbound return type in Section~\ref{s:UnboundReturnType}. By the conversion cost specification, a binding from a polymorphic type parameter to a concrete type incurs a polymorphic cost of 1. It remains unspecified \emph{when} the type parameters should become bound. When the parameterized types appear in the function parameters, they can be deduced from the argument type, and there is no ambiguity. In the unbound return case, however, the binding may happen at any stage in expression resolution, therefore it is impossible to define a unique local conversion cost. Note that type binding happens exactly once per parameter in resolving the entire expression, so the global binding cost is unambiguously 1. 
-
-As per the current compiler implementation, it does have a notable inconsistency in handling such case. For any unbound parameter that does \emph{not} come with an associated assertion, it remains unbound to the parent expression; for those that does however, they are immediately bound in the assertion resolution step, and concrete result types are used in the parent expressions. 
-
+With this model, the algorithm picks @g1@ in resolving the @f( g( 42 ) )@ call, which is undesirable.
+
+There is further evidence that shows the Bilson model is fundamentally incorrect, following the discussion of unbound return type in \VRef{s:UnboundReturnType}. By the conversion-cost specification, a binding from a polymorphic type-parameter to a concrete type incurs a polymorphic cost of 1. It remains unspecified \emph{when} the type parameters should become bound. When the parameterized types appear in function parameters, they can be deduced from the argument type, and there is no ambiguity. In the unbound return case, however, the binding may happen at any stage in expression resolution, therefore it is impossible to define a unique local conversion cost. Note that type binding happens exactly once per parameter in resolving the entire expression, so the global binding cost is unambiguously 1.
+
+In the current compiler implementation, there is a notable inconsistency in handling this case. For any unbound parameter that does \emph{not} come with an associated assertion, it remains unbound to the parent expression; for those that do, however, they are immediately bound in the assertion resolution step, and concrete result types are used in the parent expressions.
 Consider the following example:
 \begin{cfa}
@@ -444,23 +452,28 @@
 void h( int * );
 \end{cfa}
-The expression @h( f() )@ eventually has a total cost of 1 from binding (T: int), but in the eager resolution model, the cost of 1 may occur either at call to @f@ or at call to @h@, and with the assertion resolution triggering a binding, the local cost of @f()@ is (0 poly, 0 spec) with no assertions, but (1 poly, -1 spec) with an assertion:
-\begin{cfa}
-forall( dtype T | { void g( T * ); } ) T * f( void );
+The expression @h( f() )@ eventually has a total cost of 1 from binding (T: int), but in the eager-resolution model, the cost of 1 may occur either at the call to @f@ or at the call to @h@, and with the assertion resolution triggering a binding, the local cost of @f()@ is (0 poly, 0 spec) with no assertions, but (1 poly, -1 spec) with an assertion:
+\begin{cfa}
+forall( dtype T | @{ void g( T * ); }@ ) T * f( void );
 void g( int * );
 void h( int * );
 \end{cfa}
-and that contradicts the principle that adding assertions should make expression cost lower. Furthermore, the time at which type binding and assertion resolution happens is an implementation detail of the compiler, but not a part of language definition. That means two compliant \CFA compilers, one performing immediate assertion resolution at each step, and one delaying assertion resolution on unbound types, can produce different expression costs and therefore different candidate selection, making the language rule itself partially undefined and therefore unsound. By the above reasoning, the updated cost model using global sum of costs should be accepted as the standard. It also allows the compiler to freely choose when to resolve assertions, as the sum of total costs is independent of that choice; more optimizations regarding assertion resolution can also be implemented.
+and that contradicts the principle that adding assertions should make expression cost lower. Furthermore, the time at which type binding and assertion resolution happens is an implementation detail of the compiler, not part of the language definition. That means two compliant \CFA compilers, one performing immediate assertion resolution at each step, and one delaying assertion resolution on unbound types, can produce different expression costs and therefore different candidate selection, making the language rule itself partially undefined, and therefore, unsound. By the above reasoning, the updated cost model using global sum of costs should be accepted as the standard. It also allows the compiler to freely choose when to resolve assertions, as the sum of total costs is independent of that choice; more optimizations regarding assertion resolution can also be implemented.
 
 
 \section{Timing results}
 
-For the timing results presented here, the \CFA compiler is built with gcc 9.3.0, and tested on a server machine running Ubuntu 20.04, 64GB RAM and 32-core 2.2 GHz CPU, results reported by the time command, and using only 8 cores in parallel such that the time is close to the case with 100\% CPU utilization on a single thread.
-
-On the most recent build, the \CFA standard library (~1.3 MB of source code) compiles in 4 minutes 47 seconds total processor time (single thread equivalent), with the slowest file taking 13 seconds. The test suite (178 test cases, ~2.2MB of source code) completes within 25 minutes total processor time,\footnote{Including a few runtime tests; total time spent in compilation is approximately 21 minutes.} with the slowest file taking 23 seconds. In contrast, the library build on old compiler takes 85 minutes total, 5 minutes for the slowest file. Full test suite takes too long with old compiler build and is therefore not run, but the slowest test cases take approximately 5 minutes. Overall, the most recent build compared to old build in April 2020, before the project started, is consistently faster by a factor of 20.
-
-Additionally, 6 selected \CFA source files with distinct features from library and test suite are used to test compiler performance after each of the optimizations are implemented. Test files are from the most recent build and run through C preprocessor to eliminate the factor of header file changes. The selected tests are:
-\begin{itemize}
-\item
-@lib/fstream@ (112 KB)\footnote{File sizes are after preprocessing, with no line information (\lstinline|gcc -E -P|).}: implementation of I/O library
+For the timing results presented here, the \CFA compiler is built with gcc 9.3.0, and tested on a server machine running Ubuntu 20.04, 64GB RAM and 32-core 2.2 GHz CPU.
+Timing is reported by the @time@ command and an experiment is run using 8 cores, where each core is at 100\% CPU utilization.
+
+On the most recent build, the \CFA standard library ($\approx$1.3 MB of source code) compiles in 4 minutes 47 seconds total processor time (single thread equivalent), with the slowest file taking 13 seconds. The test suite (178 test cases, $\approx$2.2 MB of source code) completes within 25 minutes total processor time,
+% PAB: I do not understand this footnote.
+%\footnote{Including a few runtime tests; total time spent in compilation is approximately 21 minutes.}
+with the slowest file taking 23 seconds. In contrast, the library build with the old compiler takes 85 minutes total, 5 minutes for the slowest file. The full test-suite takes too long with the old compiler build and is therefore not run, but the slowest test cases take approximately 5 minutes. Overall, the most recent build compared to an old build is consistently faster by a factor of 20.
+
+Additionally, 6 selected \CFA source files with distinct features from the library and test suite are used to illustrate the compiler performance change after each of the implemented optimizations. Test files are from the most recent build and run through the C preprocessor to expand header files and perform macro expansions, but without line-number information (@gcc -E -P@).
+\VRef[Table]{t:SelectedFileByCompilerBuild} shows the selected tests:
+\begin{itemize}
+\item
+@lib/fstream@ (112 KB): implementation of I/O library
 \item
 @lib/mutex@ (166 KB): implementation of concurrency primitive
@@ -470,25 +483,29 @@
 @lib/stdlib@ (64 KB): type-safe wrapper to @void *@-based C standard library functions
 \item
-@test/ISO2@ (55 KB): application of I/O library
+@test/io2@ (55 KB): application of I/O library
 \item
 @test/thread@ (188 KB): application of threading library
 \end{itemize}
-
-The \CFA compiler builds are picked from git commit history that passed the test suite, and implement the optimizations incrementally:
-\begin{itemize}
-\item
-\#0 is the first working build of new AST data structure
+versus \CFA compiler builds picked from the git commit history that implement the optimizations incrementally:
+\begin{itemize}
+\item
+old resolver
+\item
+\#0 is the first working build of the new AST data structure
 \item
 \#1 implements special symbol table and argument-dependent lookup
 \item
-\#2 implements late assertion satisfaction
-\item
-\#3 implements revised function type representation
-\item
-\#4 skips pruning on expressions with function type (most recent build)
-\end{itemize}
-The old resolver with no memory sharing and none of the optimizations above is also tested.
-\begin{table}
+\#2 implements late assertion-satisfaction
+\item
+\#3 implements revised function-type representation
+\item
+\#4 skips pruning on expressions for function types (most recent build)
+\end{itemize}
+Reading left to right for a test shows the benefit of each optimization on the cost of compilation.
+
+\begin{table}[htb]
+\centering
 \caption{Compile time of selected files by compiler build, in seconds}
+\label{t:SelectedFileByCompilerBuild}
 \begin{tabular}{|l|r|r|r|r|r|r|}
 \hline
@@ -513,14 +530,13 @@
 \end{table}
 
-
 \section{Conclusion}
 
-Over the course of 8 months of active research and development in \CFA type system and compiler algorithm, performance of the reference \CFA compiler, cfa-cc, has been greatly improved, allowing mid-sized \CFA programs to be compiled and built reasonably fast. As there are also ongoing efforts in the team on building a standard library, evaluating the runtime performance, and attempting to incorporate \CFA with existing software written in C, this project is especially meaningful for practical purposes.
-
-Analysis conducted in the project were based significantly on heuristics and practical evidence, as the theoretical bounds and average cases for the expression resolution problem differ. This approach was difficult at start to follow, with an unacceptably slow compiler, since running the program through debugger and validation tools (\eg @gdb@, @valgrind@) adds another order of magnitude to run time, which was already in minutes. However, near the end of the project, many significant improvements have already been made and new optimizations can be tested immediately. The positive feedback in development cycle benefits the \CFA team as a whole, more than just for the compiler optimizations.
-
-Some potential issues of the language that may happen frequently in practice have been identified. Due to the time constraint and complex nature of these problems, a handful of them remain unsolved, but some constructive proposals are made. Notably, introducing a local assertion cache in the resolver is a common solution for a few remaining problems, so that should be the focus of work soon.
-
-The \CFA team are planning on a public alpha release of the language as the compiler performance becomes promising, and other parts of the system, such as a standard library, are also being enhanced. Ideally, the remaining problems should be resolved before release, and the solutions will also be integral to drafting a formal specification. 
+Over the course of 8 months of active research and development of the \CFA type system and compiler algorithms, performance of the reference \CFA compiler, cfa-cc, has been greatly improved. Now, mid-sized \CFA programs are compiled reasonably fast. Currently, there are ongoing efforts by the \CFA team to augment the standard library and evaluate its runtime performance, and incorporate \CFA with existing software written in C; therefore this project is especially meaningful for these practical purposes.
+
+Accomplishing this work was difficult. Analysis conducted in the project is based significantly on heuristics and practical evidence, as the theoretical bounds and average cases for the expression resolution problem differ. As well, the slowness of the initial compiler made attempts to understand why and where problems exist extremely difficult because both debugging and validation tools (\eg @gdb@, @valgrind@, @perf@) further slowed down compilation time. However, by the end of the project, I had found and fixed several significant problems and new optimizations are easier to introduce and test. The reduction in the development cycle benefits the \CFA team as a whole.
+
+Some potential issues of the language, which happen frequently in practice, have been identified. Due to the time constraint and complex nature of these problems, a handful of them remain unsolved, but some constructive proposals are made. Notably, introducing a local assertion cache in the resolver is a reasonable solution for a few remaining problems, so that should be the focus of future work.
+
+The \CFA team are planning on a public alpha release of the language as the compiler performance, given my recent improvements, is now useable. Other parts of the system, such as the standard library, have made significant gains due to the speed up in the development cycle. Ideally, the remaining problems should be resolved before release, and the solutions will also be integral to drafting a formal specification.
 
 \addcontentsline{toc}{section}{\refname}
Index: doc/theses/fangren_yu_COOP_S20/Report.tex
===================================================================
--- doc/theses/fangren_yu_COOP_S20/Report.tex	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ doc/theses/fangren_yu_COOP_S20/Report.tex	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -17,5 +17,5 @@
 \usepackage[usenames]{color}
 \input{common}                                          % common CFA document macros
-\usepackage[dvips,plainpages=false,pdfpagelabels,pdfpagemode=UseNone,colorlinks=true,pagebackref=true,linkcolor=blue,citecolor=blue,urlcolor=blue,pagebackref=true,breaklinks=true]{hyperref}
+\usepackage[dvips,plainpages=false,pdfpagelabels,pdfpagemode=UseNone,colorlinks=true,linkcolor=blue,citecolor=blue,urlcolor=blue,pagebackref=true,breaklinks=true]{hyperref}
 \usepackage{breakurl}
 \urlstyle{sf}
Index: driver/cfa.cc
===================================================================
--- driver/cfa.cc	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ driver/cfa.cc	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -10,6 +10,6 @@
 // Created On       : Tue Aug 20 13:44:49 2002
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Tue Nov 17 14:27:28 2020
-// Update Count     : 440
+// Last Modified On : Sat Jan 16 07:30:19 2021
+// Update Count     : 442
 //
 
@@ -499,4 +499,5 @@
 		args[nargs++] = "-no-integrated-cpp";
 		args[nargs++] = "-Wno-deprecated";
+		args[nargs++] = "-Wno-strict-aliasing";			// casting from one type to another
 		#ifdef HAVE_CAST_FUNCTION_TYPE
 		args[nargs++] = "-Wno-cast-function-type";
Index: libcfa/prelude/builtins.c
===================================================================
--- libcfa/prelude/builtins.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/prelude/builtins.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -18,5 +18,5 @@
 // type that wraps a pointer and a destructor-like function - used in generating implicit destructor calls for struct members in user-defined functions
 // Note: needs to occur early, because it is used to generate destructor calls during code generation
-forall(dtype T)
+forall(T &)
 struct __Destructor {
 	T * object;
@@ -25,5 +25,5 @@
 
 // defined destructor in the case that non-generated code wants to use __Destructor
-forall(dtype T)
+forall(T &)
 static inline void ^?{}(__Destructor(T) & x) {
 	if (x.object && x.dtor) {
@@ -34,5 +34,5 @@
 // easy interface into __Destructor's destructor for easy codegen purposes
 extern "C" {
-	forall(dtype T)
+	forall(T &)
 	static inline void __destroy_Destructor(__Destructor(T) * dtor) {
 		^(*dtor){};
@@ -51,5 +51,5 @@
 void abort( const char fmt[], ... ) __attribute__ (( format(printf, 1, 2), __nothrow__, __leaf__, __noreturn__ ));
 
-forall(dtype T)
+forall(T &)
 static inline T & identity(T & i) {
 	return i;
@@ -64,10 +64,10 @@
 static inline void ^?{}($generator &) {}
 
-trait is_generator(dtype T) {
+trait is_generator(T &) {
       void main(T & this);
       $generator * get_generator(T & this);
 };
 
-forall(dtype T | is_generator(T))
+forall(T & | is_generator(T))
 static inline T & resume(T & gen) {
 	main(gen);
@@ -78,22 +78,22 @@
 
 static inline {
-	forall( dtype DT | { DT & ?+=?( DT &, one_t ); } )
+	forall( DT & | { DT & ?+=?( DT &, one_t ); } )
 	DT & ++?( DT & x ) { return x += 1; }
 
-	forall( dtype DT | sized(DT) | { void ?{}( DT &, DT ); void ^?{}( DT & ); DT & ?+=?( DT &, one_t ); } )
+	forall( DT & | sized(DT) | { void ?{}( DT &, DT ); void ^?{}( DT & ); DT & ?+=?( DT &, one_t ); } )
 	DT & ?++( DT & x ) { DT tmp = x; x += 1; return tmp; }
 
-	forall( dtype DT | { DT & ?-=?( DT &, one_t ); } )
+	forall( DT & | { DT & ?-=?( DT &, one_t ); } )
 	DT & --?( DT & x ) { return x -= 1; }
 
-	forall( dtype DT | sized(DT) | { void ?{}( DT &, DT ); void ^?{}( DT & ); DT & ?-=?( DT &, one_t ); } )
+	forall( DT & | sized(DT) | { void ?{}( DT &, DT ); void ^?{}( DT & ); DT & ?-=?( DT &, one_t ); } )
 	DT & ?--( DT & x ) { DT tmp = x; x -= 1; return tmp; }
 
-	forall( dtype DT | { int ?!=?( const DT &, zero_t ); } )
+	forall( DT & | { int ?!=?( const DT &, zero_t ); } )
 	int !?( const DT & x ) { return !( x != 0 ); }
 } // distribution
 
 // universal typed pointer constant
-static inline forall( dtype DT ) DT * intptr( uintptr_t addr ) { return (DT *)addr; }
+static inline forall( DT & ) DT * intptr( uintptr_t addr ) { return (DT *)addr; }
 static inline forall( ftype FT ) FT * intptr( uintptr_t addr ) { return (FT *)addr; }
 
@@ -156,5 +156,5 @@
 #define __CFA_EXP_OVERFLOW__()
 
-static inline forall( otype OT | { void ?{}( OT & this, one_t ); OT ?*?( OT, OT ); } ) {
+static inline forall( OT | { void ?{}( OT & this, one_t ); OT ?*?( OT, OT ); } ) {
 	OT ?\?( OT ep, unsigned int y ) { __CFA_EXP__(); }
 	OT ?\?( OT ep, unsigned long int y ) { __CFA_EXP__(); }
Index: libcfa/prelude/prelude-gen.cc
===================================================================
--- libcfa/prelude/prelude-gen.cc	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/prelude/prelude-gen.cc	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -159,5 +159,5 @@
 int main() {
 	cout << "# 2 \"prelude.cfa\"  // needed for error messages from this file" << endl;
-	cout << "trait sized(dtype T) {};" << endl;
+	cout << "trait sized(T &) {};" << endl;
 
 	cout << "//////////////////////////" << endl;
@@ -264,5 +264,5 @@
 		for (auto cvq : qualifiersPair) {
 			for (auto is_vol : { "        ", "volatile" }) {
-				cout << "forall(dtype DT) void  ?{}(" << cvq.first << type << " * " << is_vol << " &, " << cvq.second << "DT *);" << endl;
+				cout << "forall(DT &) void  ?{}(" << cvq.first << type << " * " << is_vol << " &, " << cvq.second << "DT *);" << endl;
 			}
 		}
@@ -279,8 +279,8 @@
 	for (auto cvq : qualifiersSingle) {
 		for (auto is_vol : { "        ", "volatile" }) {
-			cout << "forall(dtype DT) void  ?{}(" << cvq << "  DT" << " * " << is_vol << " &);" << endl;
+			cout << "forall(DT &) void  ?{}(" << cvq << "  DT" << " * " << is_vol << " &);" << endl;
 		}
 		for (auto is_vol : { "        ", "volatile" }) {
-			cout << "forall(dtype DT) void ^?{}(" << cvq << "  DT" << " * " << is_vol << " &);" << endl;
+			cout << "forall(DT &) void ^?{}(" << cvq << "  DT" << " * " << is_vol << " &);" << endl;
 		}
 	}
@@ -290,5 +290,5 @@
 		for (auto is_vol : { "        ", "volatile" }) {
 			for (auto cvq : qualifiersSingle) {
-				cout << "forall(dtype DT) void ?{}( " << cvq << type << " * " << is_vol << " &, zero_t);" << endl;
+				cout << "forall(DT &) void ?{}( " << cvq << type << " * " << is_vol << " &, zero_t);" << endl;
 			}
 		}
@@ -317,5 +317,5 @@
 	for (auto op : pointerOperators) {
 		auto forall = [&op]() {
-			cout << "forall(dtype DT" << op.sized << ") ";
+			cout << "forall(DT &" << op.sized << ") ";
 		};
 		for (auto type : { "DT"/*, "void"*/ } ) {
@@ -408,8 +408,8 @@
 	for (auto is_vol : { "        ", "volatile" }) {
 		for (auto cvq : qualifiersPair) {
-				cout << "forall(dtype DT) " << cvq.first << "void * ?=?( " << cvq.first << "void * " << is_vol << " &, " << cvq.second << "DT *);" << endl;
+				cout << "forall(DT &) " << cvq.first << "void * ?=?( " << cvq.first << "void * " << is_vol << " &, " << cvq.second << "DT *);" << endl;
 		}
 		for (auto cvq : qualifiersSingle) {
-			cout << "forall(dtype DT) " << cvq <<   "  DT * ?=?( " << cvq << "  DT * " << is_vol << " &, zero_t);" << endl;
+			cout << "forall(DT &) " << cvq <<   "  DT * ?=?( " << cvq << "  DT * " << is_vol << " &, zero_t);" << endl;
 		}
 	}
Index: libcfa/prelude/prelude.old.cf
===================================================================
--- libcfa/prelude/prelude.old.cf	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/prelude/prelude.old.cf	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -23,5 +23,5 @@
 // ------------------------------------------------------------
 
-trait sized(dtype T) {};
+trait sized(T &) {};
 
 // ------------------------------------------------------------
@@ -68,21 +68,21 @@
 long double _Complex	?--( long double _Complex & ),		?--( volatile long double _Complex & );
 
-forall( dtype T | sized(T) ) T *			 ?++(		     T *& );
-forall( dtype T | sized(T) ) const T *		 ?++( const	     T *& );
-forall( dtype T | sized(T) ) volatile T *		 ?++(	    volatile T *& );
-forall( dtype T | sized(T) ) const volatile T *	 ?++( const volatile T *& );
-forall( dtype T | sized(T) ) T *			 ?--(		     T *& );
-forall( dtype T | sized(T) ) const T *		 ?--( const	     T *& );
-forall( dtype T | sized(T) ) volatile T *		 ?--(	    volatile T *& );
-forall( dtype T | sized(T) ) const volatile T *	 ?--( const volatile T *& );
-
-forall( dtype T | sized(T) ) T &		 ?[?](		      T *,	    ptrdiff_t );
-forall( dtype T | sized(T) ) const T &	 ?[?]( const	      T *,	    ptrdiff_t );
-forall( dtype T | sized(T) ) volatile T &	 ?[?](       volatile T *,	    ptrdiff_t );
-forall( dtype T | sized(T) ) const volatile T & ?[?]( const volatile T *,	    ptrdiff_t );
-forall( dtype T | sized(T) ) T &		 ?[?](		ptrdiff_t,		  T * );
-forall( dtype T | sized(T) ) const T &	 ?[?](		ptrdiff_t, const	  T * );
-forall( dtype T | sized(T) ) volatile T &	 ?[?](		ptrdiff_t,	 volatile T * );
-forall( dtype T | sized(T) ) const volatile T & ?[?](		ptrdiff_t, const volatile T * );
+forall( T & | sized(T) ) T *			 ?++(		     T *& );
+forall( T & | sized(T) ) const T *		 ?++( const	     T *& );
+forall( T & | sized(T) ) volatile T *		 ?++(	    volatile T *& );
+forall( T & | sized(T) ) const volatile T *	 ?++( const volatile T *& );
+forall( T & | sized(T) ) T *			 ?--(		     T *& );
+forall( T & | sized(T) ) const T *		 ?--( const	     T *& );
+forall( T & | sized(T) ) volatile T *		 ?--(	    volatile T *& );
+forall( T & | sized(T) ) const volatile T *	 ?--( const volatile T *& );
+
+forall( T & | sized(T) ) T &		 ?[?](		      T *,	    ptrdiff_t );
+forall( T & | sized(T) ) const T &	 ?[?]( const	      T *,	    ptrdiff_t );
+forall( T & | sized(T) ) volatile T &	 ?[?](       volatile T *,	    ptrdiff_t );
+forall( T & | sized(T) ) const volatile T & ?[?]( const volatile T *,	    ptrdiff_t );
+forall( T & | sized(T) ) T &		 ?[?](		ptrdiff_t,		  T * );
+forall( T & | sized(T) ) const T &	 ?[?](		ptrdiff_t, const	  T * );
+forall( T & | sized(T) ) volatile T &	 ?[?](		ptrdiff_t,	 volatile T * );
+forall( T & | sized(T) ) const volatile T & ?[?](		ptrdiff_t, const volatile T * );
 
 // ------------------------------------------------------------
@@ -107,17 +107,17 @@
 long double _Complex	++?( long double _Complex & ),		--?( long double _Complex & );
 
-forall( dtype T | sized(T) ) T *			 ++?(		     T *& );
-forall( dtype T | sized(T) ) const T *		 ++?( const	     T *& );
-forall( dtype T | sized(T) ) volatile T *		 ++?(	    volatile T *& );
-forall( dtype T | sized(T) ) const volatile T *	 ++?( const volatile T *& );
-forall( dtype T | sized(T) ) T *			 --?(		     T *& );
-forall( dtype T | sized(T) ) const T *		 --?( const	     T *& );
-forall( dtype T | sized(T) ) volatile T *		 --?(	    volatile T *& );
-forall( dtype T | sized(T) ) const volatile T *	 --?( const volatile T *& );
-
-forall( dtype T | sized(T) ) T &		 *?(		     T * );
-forall( dtype T | sized(T) ) const T &		 *?( const	     T * );
-forall( dtype T | sized(T) ) volatile T &	 *?(       volatile  T * );
-forall( dtype T | sized(T) ) const volatile T & *?( const volatile  T * );
+forall( T & | sized(T) ) T *			 ++?(		     T *& );
+forall( T & | sized(T) ) const T *		 ++?( const	     T *& );
+forall( T & | sized(T) ) volatile T *		 ++?(	    volatile T *& );
+forall( T & | sized(T) ) const volatile T *	 ++?( const volatile T *& );
+forall( T & | sized(T) ) T *			 --?(		     T *& );
+forall( T & | sized(T) ) const T *		 --?( const	     T *& );
+forall( T & | sized(T) ) volatile T *		 --?(	    volatile T *& );
+forall( T & | sized(T) ) const volatile T *	 --?( const volatile T *& );
+
+forall( T & | sized(T) ) T &		 *?(		     T * );
+forall( T & | sized(T) ) const T &		 *?( const	     T * );
+forall( T & | sized(T) ) volatile T &	 *?(       volatile  T * );
+forall( T & | sized(T) ) const volatile T & *?( const volatile  T * );
 forall( ftype FT ) FT &		 *?( FT * );
 
@@ -142,8 +142,8 @@
 		!?( float _Complex ),		!?( double _Complex ),		!?( long double _Complex );
 
-forall( dtype DT ) int !?(                DT * );
-forall( dtype DT ) int !?( const          DT * );
-forall( dtype DT ) int !?(       volatile DT * );
-forall( dtype DT ) int !?( const volatile DT * );
+forall( DT & ) int !?(                DT * );
+forall( DT & ) int !?( const          DT * );
+forall( DT & ) int !?(       volatile DT * );
+forall( DT & ) int !?( const volatile DT * );
 forall( ftype FT ) int !?( FT * );
 
@@ -191,17 +191,17 @@
 long double _Complex	?+?( long double _Complex, long double _Complex ),	?-?( long double _Complex, long double _Complex );
 
-forall( dtype T | sized(T) ) T *		?+?(		    T *,	  ptrdiff_t );
-forall( dtype T | sized(T) ) T *		?+?(	      ptrdiff_t,		T * );
-forall( dtype T | sized(T) ) const T *		?+?( const	    T *,	  ptrdiff_t );
-forall( dtype T | sized(T) ) const T *		?+?(	      ptrdiff_t, const		T * );
-forall( dtype T | sized(T) ) volatile T *	?+?(	   volatile T *,	  ptrdiff_t );
-forall( dtype T | sized(T) ) volatile T *	?+?(	      ptrdiff_t,       volatile T * );
-forall( dtype T | sized(T) ) const volatile T *	?+?( const volatile T *,	  ptrdiff_t );
-forall( dtype T | sized(T) ) const volatile T *	?+?(	      ptrdiff_t, const volatile T * );
-forall( dtype T | sized(T) ) T *		?-?(		    T *,	  ptrdiff_t );
-forall( dtype T | sized(T) ) const T *		?-?( const	    T *,	  ptrdiff_t );
-forall( dtype T | sized(T) ) volatile T *	?-?(	   volatile T *,	  ptrdiff_t );
-forall( dtype T | sized(T) ) const volatile T *	?-?( const volatile T *,	  ptrdiff_t );
-forall( dtype T | sized(T) ) ptrdiff_t		?-?( const volatile T *, const volatile T * );
+forall( T & | sized(T) ) T *		?+?(		    T *,	  ptrdiff_t );
+forall( T & | sized(T) ) T *		?+?(	      ptrdiff_t,		T * );
+forall( T & | sized(T) ) const T *		?+?( const	    T *,	  ptrdiff_t );
+forall( T & | sized(T) ) const T *		?+?(	      ptrdiff_t, const		T * );
+forall( T & | sized(T) ) volatile T *	?+?(	   volatile T *,	  ptrdiff_t );
+forall( T & | sized(T) ) volatile T *	?+?(	      ptrdiff_t,       volatile T * );
+forall( T & | sized(T) ) const volatile T *	?+?( const volatile T *,	  ptrdiff_t );
+forall( T & | sized(T) ) const volatile T *	?+?(	      ptrdiff_t, const volatile T * );
+forall( T & | sized(T) ) T *		?-?(		    T *,	  ptrdiff_t );
+forall( T & | sized(T) ) const T *		?-?( const	    T *,	  ptrdiff_t );
+forall( T & | sized(T) ) volatile T *	?-?(	   volatile T *,	  ptrdiff_t );
+forall( T & | sized(T) ) const volatile T *	?-?( const volatile T *,	  ptrdiff_t );
+forall( T & | sized(T) ) ptrdiff_t		?-?( const volatile T *, const volatile T * );
 
 // ------------------------------------------------------------
@@ -255,23 +255,23 @@
 	   ?>?( long double, long double ),				?>=?( long double, long double );
 
-forall( dtype DT ) signed int ?<?(                 DT *,                DT * );
-forall( dtype DT ) signed int ?<?(  const          DT *, const          DT * );
-forall( dtype DT ) signed int ?<?(        volatile DT *,       volatile DT * );
-forall( dtype DT ) signed int ?<?(  const volatile DT *, const volatile DT * );
-
-forall( dtype DT ) signed int ?>?(                 DT *,                DT * );
-forall( dtype DT ) signed int ?>?(  const          DT *, const          DT * );
-forall( dtype DT ) signed int ?>?(        volatile DT *,       volatile DT * );
-forall( dtype DT ) signed int ?>?(  const volatile DT *, const volatile DT * );
-
-forall( dtype DT ) signed int ?<=?(                 DT *,                DT * );
-forall( dtype DT ) signed int ?<=?(  const          DT *, const          DT * );
-forall( dtype DT ) signed int ?<=?(        volatile DT *,       volatile DT * );
-forall( dtype DT ) signed int ?<=?( const volatile DT *, const volatile DT * );
-
-forall( dtype DT ) signed int ?>=?(                 DT *,                DT * );
-forall( dtype DT ) signed int ?>=?(  const          DT *, const          DT * );
-forall( dtype DT ) signed int ?>=?(        volatile DT *,       volatile DT * );
-forall( dtype DT ) signed int ?>=?( const volatile DT *, const volatile DT * );
+forall( DT & ) signed int ?<?(                 DT *,                DT * );
+forall( DT & ) signed int ?<?(  const          DT *, const          DT * );
+forall( DT & ) signed int ?<?(        volatile DT *,       volatile DT * );
+forall( DT & ) signed int ?<?(  const volatile DT *, const volatile DT * );
+
+forall( DT & ) signed int ?>?(                 DT *,                DT * );
+forall( DT & ) signed int ?>?(  const          DT *, const          DT * );
+forall( DT & ) signed int ?>?(        volatile DT *,       volatile DT * );
+forall( DT & ) signed int ?>?(  const volatile DT *, const volatile DT * );
+
+forall( DT & ) signed int ?<=?(                 DT *,                DT * );
+forall( DT & ) signed int ?<=?(  const          DT *, const          DT * );
+forall( DT & ) signed int ?<=?(        volatile DT *,       volatile DT * );
+forall( DT & ) signed int ?<=?( const volatile DT *, const volatile DT * );
+
+forall( DT & ) signed int ?>=?(                 DT *,                DT * );
+forall( DT & ) signed int ?>=?(  const          DT *, const          DT * );
+forall( DT & ) signed int ?>=?(        volatile DT *,       volatile DT * );
+forall( DT & ) signed int ?>=?( const volatile DT *, const volatile DT * );
 
 // ------------------------------------------------------------
@@ -302,13 +302,13 @@
 signed int ?==?( one_t, one_t ),							?!=?( one_t, one_t );
 
-forall( dtype DT ) signed int ?==?(		   DT *,		DT * );
-forall( dtype DT ) signed int ?==?( const	   DT *, const		DT * );
-forall( dtype DT ) signed int ?==?(       volatile DT *,       volatile DT * );
-forall( dtype DT ) signed int ?==?( const volatile DT *, const volatile DT * );
+forall( DT & ) signed int ?==?(		   DT *,		DT * );
+forall( DT & ) signed int ?==?( const	   DT *, const		DT * );
+forall( DT & ) signed int ?==?(       volatile DT *,       volatile DT * );
+forall( DT & ) signed int ?==?( const volatile DT *, const volatile DT * );
 forall( ftype FT ) signed int ?==?( FT *, FT * );
-forall( dtype DT ) signed int ?!=?(		   DT *,		DT * );
-forall( dtype DT ) signed int ?!=?( const	   DT *, const		DT * );
-forall( dtype DT ) signed int ?!=?(       volatile DT *,       volatile DT * );
-forall( dtype DT ) signed int ?!=?( const volatile DT *, const volatile DT * );
+forall( DT & ) signed int ?!=?(		   DT *,		DT * );
+forall( DT & ) signed int ?!=?( const	   DT *, const		DT * );
+forall( DT & ) signed int ?!=?(       volatile DT *,       volatile DT * );
+forall( DT & ) signed int ?!=?( const volatile DT *, const volatile DT * );
 forall( ftype FT ) signed int ?!=?( FT *, FT * );
 
@@ -376,73 +376,73 @@
 
 forall( ftype FT ) FT *			?=?( FT *&, FT * );
-forall( ftype FT ) FT *			?=?( FT * volatile &, FT * );
-
-forall( dtype DT ) DT *			?=?(		     DT *	   &,			DT * );
-forall( dtype DT ) DT *			?=?(		     DT * volatile &,			DT * );
-forall( dtype DT ) const DT *		?=?( const	     DT *	   &,			DT * );
-forall( dtype DT ) const DT *		?=?( const	     DT * volatile &,			DT * );
-forall( dtype DT ) const DT *		?=?( const	     DT *	   &, const		DT * );
-forall( dtype DT ) const DT *		?=?( const	     DT * volatile &, const		DT * );
-forall( dtype DT ) volatile DT *	?=?(	   volatile  DT *	   &,			DT * );
-forall( dtype DT ) volatile DT *	?=?(	   volatile  DT * volatile &,			DT * );
-forall( dtype DT ) volatile DT *	?=?(	   volatile  DT *	   &,	    volatile	DT * );
-forall( dtype DT ) volatile DT *	?=?(	   volatile  DT * volatile &,	    volatile	DT * );
-
-forall( dtype DT ) const volatile DT *	?=?( const volatile  DT *	   &,			DT * );
-forall( dtype DT ) const volatile DT *  ?=?( const volatile  DT * volatile &,			DT * );
-forall( dtype DT ) const volatile DT *  ?=?( const volatile  DT *	   &, const		DT * );
-forall( dtype DT ) const volatile DT *  ?=?( const volatile  DT * volatile &, const		DT * );
-forall( dtype DT ) const volatile DT *  ?=?( const volatile  DT *	   &,	    volatile	DT * );
-forall( dtype DT ) const volatile DT *  ?=?( const volatile  DT * volatile &,	    volatile	DT * );
-forall( dtype DT ) const volatile DT *  ?=?( const volatile  DT *	   &, const volatile	DT * );
-forall( dtype DT ) const volatile DT *  ?=?( const volatile  DT * volatile &, const volatile	DT * );
-
-forall( dtype DT ) void *		 ?=?(		     void *	     &,			DT * );
-forall( dtype DT ) void *		 ?=?(		     void * volatile &,			DT * );
-forall( dtype DT ) const void *		 ?=?( const	     void *	     &,			DT * );
-forall( dtype DT ) const void *		 ?=?( const	     void * volatile &,			DT * );
-forall( dtype DT ) const void *		 ?=?( const	     void *	     &, const		DT * );
-forall( dtype DT ) const void *		 ?=?( const	     void * volatile &, const		DT * );
-forall( dtype DT ) volatile void *	 ?=?(	    volatile void *	     &,			DT * );
-forall( dtype DT ) volatile void *	 ?=?(	    volatile void * volatile &,			DT * );
-forall( dtype DT ) volatile void *	 ?=?(	    volatile void *	     &,	      volatile	DT * );
-forall( dtype DT ) volatile void *	 ?=?(	    volatile void * volatile &,	      volatile	DT * );
-forall( dtype DT ) const volatile void * ?=?( const volatile void *	     &,			DT * );
-forall( dtype DT ) const volatile void * ?=?( const volatile void * volatile &,			DT * );
-forall( dtype DT ) const volatile void * ?=?( const volatile void *	     &, const		DT * );
-forall( dtype DT ) const volatile void * ?=?( const volatile void * volatile &, const		DT * );
-forall( dtype DT ) const volatile void * ?=?( const volatile void *	     &,	      volatile	DT * );
-forall( dtype DT ) const volatile void * ?=?( const volatile void * volatile &,	      volatile	DT * );
-forall( dtype DT ) const volatile void * ?=?( const volatile void *	     &, const volatile	DT * );
-forall( dtype DT ) const volatile void * ?=?( const volatile void * volatile &, const volatile	DT * );
+forall( ftype FT ) FT *			?=?( FT * volatile &, FT * );
+
+forall( DT & ) DT *			?=?(		     DT *	   &,			DT * );
+forall( DT & ) DT *			?=?(		     DT * volatile &,			DT * );
+forall( DT & ) const DT *		?=?( const	     DT *	   &,			DT * );
+forall( DT & ) const DT *		?=?( const	     DT * volatile &,			DT * );
+forall( DT & ) const DT *		?=?( const	     DT *	   &, const		DT * );
+forall( DT & ) const DT *		?=?( const	     DT * volatile &, const		DT * );
+forall( DT & ) volatile DT *	?=?(	   volatile  DT *	   &,			DT * );
+forall( DT & ) volatile DT *	?=?(	   volatile  DT * volatile &,			DT * );
+forall( DT & ) volatile DT *	?=?(	   volatile  DT *	   &,	    volatile	DT * );
+forall( DT & ) volatile DT *	?=?(	   volatile  DT * volatile &,	    volatile	DT * );
+
+forall( DT & ) const volatile DT *	?=?( const volatile  DT *	   &,			DT * );
+forall( DT & ) const volatile DT *  ?=?( const volatile  DT * volatile &,			DT * );
+forall( DT & ) const volatile DT *  ?=?( const volatile  DT *	   &, const		DT * );
+forall( DT & ) const volatile DT *  ?=?( const volatile  DT * volatile &, const		DT * );
+forall( DT & ) const volatile DT *  ?=?( const volatile  DT *	   &,	    volatile	DT * );
+forall( DT & ) const volatile DT *  ?=?( const volatile  DT * volatile &,	    volatile	DT * );
+forall( DT & ) const volatile DT *  ?=?( const volatile  DT *	   &, const volatile	DT * );
+forall( DT & ) const volatile DT *  ?=?( const volatile  DT * volatile &, const volatile	DT * );
+
+forall( DT & ) void *		 ?=?(		     void *	     &,			DT * );
+forall( DT & ) void *		 ?=?(		     void * volatile &,			DT * );
+forall( DT & ) const void *		 ?=?( const	     void *	     &,			DT * );
+forall( DT & ) const void *		 ?=?( const	     void * volatile &,			DT * );
+forall( DT & ) const void *		 ?=?( const	     void *	     &, const		DT * );
+forall( DT & ) const void *		 ?=?( const	     void * volatile &, const		DT * );
+forall( DT & ) volatile void *	 ?=?(	    volatile void *	     &,			DT * );
+forall( DT & ) volatile void *	 ?=?(	    volatile void * volatile &,			DT * );
+forall( DT & ) volatile void *	 ?=?(	    volatile void *	     &,	      volatile	DT * );
+forall( DT & ) volatile void *	 ?=?(	    volatile void * volatile &,	      volatile	DT * );
+forall( DT & ) const volatile void * ?=?( const volatile void *	     &,			DT * );
+forall( DT & ) const volatile void * ?=?( const volatile void * volatile &,			DT * );
+forall( DT & ) const volatile void * ?=?( const volatile void *	     &, const		DT * );
+forall( DT & ) const volatile void * ?=?( const volatile void * volatile &, const		DT * );
+forall( DT & ) const volatile void * ?=?( const volatile void *	     &,	      volatile	DT * );
+forall( DT & ) const volatile void * ?=?( const volatile void * volatile &,	      volatile	DT * );
+forall( DT & ) const volatile void * ?=?( const volatile void *	     &, const volatile	DT * );
+forall( DT & ) const volatile void * ?=?( const volatile void * volatile &, const volatile	DT * );
 
 //forall( dtype DT ) DT *			?=?(		    DT *	  &, zero_t );
 //forall( dtype DT ) DT *			?=?(		    DT * volatile &, zero_t );
-forall( dtype DT ) const DT *		?=?( const	    DT *	  &, zero_t );
-forall( dtype DT ) const DT *		?=?( const	    DT * volatile &, zero_t );
+forall( DT & ) const DT *		?=?( const	    DT *	  &, zero_t );
+forall( DT & ) const DT *		?=?( const	    DT * volatile &, zero_t );
 //forall( dtype DT ) volatile DT *	?=?( volatile	    DT *	  &, zero_t );
 //forall( dtype DT ) volatile DT *	?=?( volatile	    DT * volatile &, zero_t );
-forall( dtype DT ) const volatile DT *	?=?( const volatile DT *	  &, zero_t );
-forall( dtype DT ) const volatile DT *	?=?( const volatile DT * volatile &, zero_t );
+forall( DT & ) const volatile DT *	?=?( const volatile DT *	  &, zero_t );
+forall( DT & ) const volatile DT *	?=?( const volatile DT * volatile &, zero_t );
 
 forall( ftype FT ) FT *			?=?( FT *	   &, zero_t );
 forall( ftype FT ) FT *			?=?( FT * volatile &, zero_t );
 
-forall( dtype T | sized(T) ) T *		?+=?(		     T *	  &, ptrdiff_t );
-forall( dtype T | sized(T) ) T *		?+=?(		     T * volatile &, ptrdiff_t );
-forall( dtype T | sized(T) ) const T *		?+=?( const	     T *	  &, ptrdiff_t );
-forall( dtype T | sized(T) ) const T *		?+=?( const	     T * volatile &, ptrdiff_t );
-forall( dtype T | sized(T) ) volatile T *	?+=?(	    volatile T *	  &, ptrdiff_t );
-forall( dtype T | sized(T) ) volatile T *	?+=?(	    volatile T * volatile &, ptrdiff_t );
-forall( dtype T | sized(T) ) const volatile T *	?+=?( const volatile T *	  &, ptrdiff_t );
-forall( dtype T | sized(T) ) const volatile T *	?+=?( const volatile T * volatile &, ptrdiff_t );
-forall( dtype T | sized(T) ) T *		?-=?(		     T *	  &, ptrdiff_t );
-forall( dtype T | sized(T) ) T *		?-=?(		     T * volatile &, ptrdiff_t );
-forall( dtype T | sized(T) ) const T *		?-=?( const	     T *	  &, ptrdiff_t );
-forall( dtype T | sized(T) ) const T *		?-=?( const	     T * volatile &, ptrdiff_t );
-forall( dtype T | sized(T) ) volatile T *	?-=?(	    volatile T *	  &, ptrdiff_t );
-forall( dtype T | sized(T) ) volatile T *	?-=?(	    volatile T * volatile &, ptrdiff_t );
-forall( dtype T | sized(T) ) const volatile T *	?-=?( const volatile T *	  &, ptrdiff_t );
-forall( dtype T | sized(T) ) const volatile T *	?-=?( const volatile T * volatile &, ptrdiff_t );
+forall( T & | sized(T) ) T *		?+=?(		     T *	  &, ptrdiff_t );
+forall( T & | sized(T) ) T *		?+=?(		     T * volatile &, ptrdiff_t );
+forall( T & | sized(T) ) const T *		?+=?( const	     T *	  &, ptrdiff_t );
+forall( T & | sized(T) ) const T *		?+=?( const	     T * volatile &, ptrdiff_t );
+forall( T & | sized(T) ) volatile T *	?+=?(	    volatile T *	  &, ptrdiff_t );
+forall( T & | sized(T) ) volatile T *	?+=?(	    volatile T * volatile &, ptrdiff_t );
+forall( T & | sized(T) ) const volatile T *	?+=?( const volatile T *	  &, ptrdiff_t );
+forall( T & | sized(T) ) const volatile T *	?+=?( const volatile T * volatile &, ptrdiff_t );
+forall( T & | sized(T) ) T *		?-=?(		     T *	  &, ptrdiff_t );
+forall( T & | sized(T) ) T *		?-=?(		     T * volatile &, ptrdiff_t );
+forall( T & | sized(T) ) const T *		?-=?( const	     T *	  &, ptrdiff_t );
+forall( T & | sized(T) ) const T *		?-=?( const	     T * volatile &, ptrdiff_t );
+forall( T & | sized(T) ) volatile T *	?-=?(	    volatile T *	  &, ptrdiff_t );
+forall( T & | sized(T) ) volatile T *	?-=?(	    volatile T * volatile &, ptrdiff_t );
+forall( T & | sized(T) ) const volatile T *	?-=?( const volatile T *	  &, ptrdiff_t );
+forall( T & | sized(T) ) const volatile T *	?-=?( const volatile T * volatile &, ptrdiff_t );
 
 _Bool			?=?( _Bool &, _Bool ),					?=?( volatile _Bool &, _Bool );
@@ -723,30 +723,30 @@
 forall( ftype FT ) void ?{}( FT * volatile &, FT * );
 
-forall( dtype DT ) void ?{}(		     DT *	   &,			DT * );
-forall( dtype DT ) void ?{}( const	     DT *	   &,			DT * );
-forall( dtype DT ) void ?{}( const	     DT *	   &, const		DT * );
-forall( dtype DT ) void ?{}(	   volatile  DT *	   &,			DT * );
-forall( dtype DT ) void ?{}(	   volatile  DT *	   &,	    volatile	DT * );
-forall( dtype DT ) void ?{}( const volatile  DT *	   &,			DT * );
-forall( dtype DT ) void ?{}( const volatile  DT *	   &, const		DT * );
-forall( dtype DT ) void ?{}( const volatile  DT *	   &,	    volatile	DT * );
-forall( dtype DT ) void ?{}( const volatile  DT *	   &, const volatile	DT * );
-
-forall( dtype DT ) void ?{}(		     void *	     &,			DT * );
-forall( dtype DT ) void ?{}( const	     void *	     &,			DT * );
-forall( dtype DT ) void ?{}( const	     void *	     &, const		DT * );
-forall( dtype DT ) void ?{}(	    volatile void *	     &,			DT * );
-forall( dtype DT ) void ?{}(	    volatile void *	     &,	      volatile	DT * );
-forall( dtype DT ) void ?{}( const volatile void *	     &,			DT * );
-forall( dtype DT ) void ?{}( const volatile void *	     &, const		DT * );
-forall( dtype DT ) void ?{}( const volatile void *	     &,	      volatile	DT * );
-forall( dtype DT ) void ?{}( const volatile void *	     &, const volatile	DT * );
+forall( DT & ) void ?{}(		     DT *	   &,			DT * );
+forall( DT & ) void ?{}( const	     DT *	   &,			DT * );
+forall( DT & ) void ?{}( const	     DT *	   &, const		DT * );
+forall( DT & ) void ?{}(	   volatile  DT *	   &,			DT * );
+forall( DT & ) void ?{}(	   volatile  DT *	   &,	    volatile	DT * );
+forall( DT & ) void ?{}( const volatile  DT *	   &,			DT * );
+forall( DT & ) void ?{}( const volatile  DT *	   &, const		DT * );
+forall( DT & ) void ?{}( const volatile  DT *	   &,	    volatile	DT * );
+forall( DT & ) void ?{}( const volatile  DT *	   &, const volatile	DT * );
+
+forall( DT & ) void ?{}(		     void *	     &,			DT * );
+forall( DT & ) void ?{}( const	     void *	     &,			DT * );
+forall( DT & ) void ?{}( const	     void *	     &, const		DT * );
+forall( DT & ) void ?{}(	    volatile void *	     &,			DT * );
+forall( DT & ) void ?{}(	    volatile void *	     &,	      volatile	DT * );
+forall( DT & ) void ?{}( const volatile void *	     &,			DT * );
+forall( DT & ) void ?{}( const volatile void *	     &, const		DT * );
+forall( DT & ) void ?{}( const volatile void *	     &,	      volatile	DT * );
+forall( DT & ) void ?{}( const volatile void *	     &, const volatile	DT * );
 
 //forall( dtype DT ) void ?{}(		    DT *	  &, zero_t );
 //forall( dtype DT ) void ?{}(		    DT * volatile &, zero_t );
-forall( dtype DT ) void ?{}( const	    DT *	  &, zero_t );
+forall( DT & ) void ?{}( const	    DT *	  &, zero_t );
 //forall( dtype DT ) void ?{}( volatile	    DT *	  &, zero_t );
 //forall( dtype DT ) void ?{}( volatile	    DT * volatile &, zero_t );
-forall( dtype DT ) void ?{}( const volatile DT *	  &, zero_t );
+forall( DT & ) void ?{}( const volatile DT *	  &, zero_t );
 
 forall( ftype FT ) void	?{}( FT *	   &, zero_t );
@@ -755,8 +755,8 @@
 forall( ftype FT ) void	?{}( FT *	   & );
 
-forall( dtype DT ) void	?{}(		     DT *	   &);
-forall( dtype DT ) void	?{}( const	     DT *	   &);
-forall( dtype DT ) void	?{}(	   volatile  DT *	   &);
-forall( dtype DT ) void ?{}( const volatile  DT *	   &);
+forall( DT & ) void	?{}(		     DT *	   &);
+forall( DT & ) void	?{}( const	     DT *	   &);
+forall( DT & ) void	?{}(	   volatile  DT *	   &);
+forall( DT & ) void ?{}( const volatile  DT *	   &);
 
 void 	?{}(		    void *	    &);
@@ -768,8 +768,8 @@
 forall( ftype FT ) void	^?{}( FT *	   & );
 
-forall( dtype DT ) void	^?{}(		     DT *	   &);
-forall( dtype DT ) void	^?{}( const	     DT *	   &);
-forall( dtype DT ) void	^?{}(	   volatile  DT *	   &);
-forall( dtype DT ) void ^?{}( const volatile  DT *	   &);
+forall( DT & ) void	^?{}(		     DT *	   &);
+forall( DT & ) void	^?{}( const	     DT *	   &);
+forall( DT & ) void	^?{}(	   volatile  DT *	   &);
+forall( DT & ) void ^?{}( const volatile  DT *	   &);
 
 void ^?{}(		    void *	    &);
Index: libcfa/prelude/sync-builtins.cf
===================================================================
--- libcfa/prelude/sync-builtins.cf	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/prelude/sync-builtins.cf	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -206,5 +206,5 @@
 _Bool __sync_bool_compare_and_swap(volatile unsigned __int128 *, unsigned __int128, unsigned __int128,...);
 #endif
-forall(dtype T) _Bool __sync_bool_compare_and_swap(T * volatile *, T *, T*, ...);
+forall(T &) _Bool __sync_bool_compare_and_swap(T * volatile *, T *, T*, ...);
 
 char __sync_val_compare_and_swap(volatile char *, char, char,...);
@@ -223,5 +223,5 @@
 unsigned __int128 __sync_val_compare_and_swap(volatile unsigned __int128 *, unsigned __int128, unsigned __int128,...);
 #endif
-forall(dtype T) T * __sync_val_compare_and_swap(T * volatile *, T *, T*,...);
+forall(T &) T * __sync_val_compare_and_swap(T * volatile *, T *, T*,...);
 
 char __sync_lock_test_and_set(volatile char *, char,...);
@@ -326,6 +326,6 @@
 void __atomic_exchange(volatile unsigned __int128 *, volatile unsigned __int128 *, volatile unsigned __int128 *, int);
 #endif
-forall(dtype T) T * __atomic_exchange_n(T * volatile *, T *, int);
-forall(dtype T) void __atomic_exchange(T * volatile *, T * volatile *, T * volatile *, int);
+forall(T &) T * __atomic_exchange_n(T * volatile *, T *, int);
+forall(T &) void __atomic_exchange(T * volatile *, T * volatile *, T * volatile *, int);
 
 _Bool __atomic_load_n(const volatile _Bool *, int);
@@ -359,6 +359,6 @@
 void __atomic_load(const volatile unsigned __int128 *, volatile unsigned __int128 *, int);
 #endif
-forall(dtype T) T * __atomic_load_n(T * const volatile *, int);
-forall(dtype T) void __atomic_load(T * const volatile *, T **, int);
+forall(T &) T * __atomic_load_n(T * const volatile *, int);
+forall(T &) void __atomic_load(T * const volatile *, T **, int);
 
 _Bool __atomic_compare_exchange_n(volatile char *, char *, char, _Bool, int, int);
@@ -390,6 +390,6 @@
 _Bool __atomic_compare_exchange   (volatile unsigned __int128 *, unsigned __int128 *, unsigned __int128 *, _Bool, int, int);
 #endif
-forall(dtype T) _Bool __atomic_compare_exchange_n (T * volatile *, T **, T*, _Bool, int, int);
-forall(dtype T) _Bool __atomic_compare_exchange   (T * volatile *, T **, T**, _Bool, int, int);
+forall(T &) _Bool __atomic_compare_exchange_n (T * volatile *, T **, T*, _Bool, int, int);
+forall(T &) _Bool __atomic_compare_exchange   (T * volatile *, T **, T**, _Bool, int, int);
 
 void __atomic_store_n(volatile _Bool *, _Bool, int);
@@ -423,6 +423,6 @@
 void __atomic_store(volatile unsigned __int128 *, unsigned __int128 *, int);
 #endif
-forall(dtype T) void __atomic_store_n(T * volatile *, T *, int);
-forall(dtype T) void __atomic_store(T * volatile *, T **, int);
+forall(T &) void __atomic_store_n(T * volatile *, T *, int);
+forall(T &) void __atomic_store(T * volatile *, T **, int);
 
 char __atomic_add_fetch  (volatile char *, char, int);
Index: libcfa/src/bitmanip.hfa
===================================================================
--- libcfa/src/bitmanip.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/bitmanip.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -100,5 +100,5 @@
 	unsigned long long int floor2( unsigned long long int n, unsigned long long int align ) { verify( is_pow2( align ) ); return n & -align; }
 
-	// forall( otype T | { T ?&?( T, T ); T -?( T ); } )
+	// forall( T | { T ?&?( T, T ); T -?( T ); } )
 	// T floor2( T n, T align ) { verify( is_pow2( align ) ); return n & -align; }
 
@@ -115,5 +115,5 @@
 	unsigned long long int ceiling2( unsigned long long int n, unsigned long long int align ) { verify( is_pow2( align ) ); return -floor2( -n, align ); }
 
-	// forall( otype T | { T floor2( T, T ); T -?( T ); } )
+	// forall( T | { T floor2( T, T ); T -?( T ); } )
 	// T ceiling2( T n, T align ) { verify( is_pow2( align ) ); return -floor2( -n, align ); }
 } // distribution
Index: libcfa/src/bits/algorithm.hfa
===================================================================
--- libcfa/src/bits/algorithm.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/bits/algorithm.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -17,12 +17,12 @@
 
 #ifdef SAFE_SORT
-forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort2( T * arr );
-forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort3( T * arr );
-forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort4( T * arr );
-forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort5( T * arr );
-forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort6( T * arr );
-forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sortN( T * arr, size_t dim );
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort2( T * arr );
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort3( T * arr );
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort4( T * arr );
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort5( T * arr );
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sort6( T * arr );
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } ) static inline void __libcfa_small_sortN( T * arr, size_t dim );
 
-forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } )
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } )
 static inline void __libcfa_small_sort( T * arr, size_t dim ) {
 	switch( dim ) {
@@ -41,10 +41,10 @@
 #define SWAP(x,y) { T a = min(arr[x], arr[y]); T b = max(arr[x], arr[y]); arr[x] = a; arr[y] = b;}
 
-forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } )
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } )
 static inline void __libcfa_small_sort2( T * arr ) {
 	SWAP(0, 1);
 }
 
-forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } )
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } )
 static inline void __libcfa_small_sort3( T * arr ) {
 	SWAP(1, 2);
@@ -53,5 +53,5 @@
 }
 
-forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } )
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } )
 static inline void __libcfa_small_sort4( T * arr ) {
 	SWAP(0, 1);
@@ -62,5 +62,5 @@
 }
 
-forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } )
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } )
 static inline void __libcfa_small_sort5( T * arr ) {
 	SWAP(0, 1);
@@ -75,5 +75,5 @@
 }
 
-forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } )
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } )
 static inline void __libcfa_small_sort6( T * arr ) {
 	SWAP(1, 2);
@@ -91,5 +91,5 @@
 }
 
-forall( otype T | {  int ?<?( T, T ); int ?>?( T, T ); } )
+forall( T | {  int ?<?( T, T ); int ?>?( T, T ); } )
 static inline void __libcfa_small_sortN( T * arr, size_t dim ) {
 	int i, j;
@@ -112,5 +112,5 @@
 static inline void __libcfa_small_sortN( void* * arr, size_t dim );
 
-forall( dtype T )
+forall( T & )
 static inline void __libcfa_small_sort( T* * arr, size_t dim ) {
 	switch( dim ) {
Index: libcfa/src/bits/collection.hfa
===================================================================
--- libcfa/src/bits/collection.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/bits/collection.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -31,5 +31,5 @@
 
 	// // wrappers to make Collection have T
-	// forall( dtype T ) {
+	// forall( T & ) {
 	// 	T *& Next( T * n ) {
 	// 		return (T *)Next( (Colable *)n );
@@ -38,5 +38,5 @@
 } // distribution
 
-forall( dtype T | { T *& Next ( T * ); } ) {
+static inline forall( T & | { T *& Next ( T * ); } ) {
 	bool listed( T * n ) {
 		return Next( n ) != 0p;
@@ -76,5 +76,5 @@
 	} // post: elts = null
 
-	forall( dtype T ) {
+	forall( T & ) {
 		T * Curr( ColIter & ci ) with( ci ) {
 			return (T *)curr;
Index: libcfa/src/bits/containers.hfa
===================================================================
--- libcfa/src/bits/containers.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/bits/containers.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -23,5 +23,5 @@
 
 #ifdef __cforall
-	forall(dtype T)
+	forall(T &)
 #else
 	#define T void
@@ -40,23 +40,23 @@
 
 #ifdef __cforall
-	// forall(otype T | sized(T))
+	// forall(T | sized(T))
 	// static inline void ?{}(__small_array(T) & this) {}
 
-	forall(dtype T | sized(T))
+	forall(T & | sized(T))
 	static inline T & ?[?]( __small_array(T) & this, __lock_size_t idx ) {
 		return ((typeof(this.data))this.data)[idx];
 	}
 
-	forall(dtype T | sized(T))
+	forall(T & | sized(T))
 	static inline T & ?[?]( const __small_array(T) & this, __lock_size_t idx ) {
 		return ((typeof(this.data))this.data)[idx];
 	}
 
-	forall(dtype T)
+	forall(T &)
 	static inline T * begin( const __small_array(T) & this ) {
 		return ((typeof(this.data))this.data);
 	}
 
-	forall(dtype T | sized(T))
+	forall(T & | sized(T))
 	static inline T * end( const __small_array(T) & this ) {
 		return ((typeof(this.data))this.data) + this.size;
@@ -69,5 +69,5 @@
 
 #ifdef __cforall
-	trait is_node(dtype T) {
+	trait is_node(T &) {
 		T *& get_next( T & );
 	};
@@ -78,5 +78,5 @@
 //-----------------------------------------------------------------------------
 #ifdef __cforall
-	forall(dtype TYPE)
+	forall(TYPE &)
 	#define T TYPE
 #else
@@ -95,10 +95,10 @@
 
 #ifdef __cforall
-	forall(dtype T)
+	forall(T &)
 	static inline void ?{}( __stack(T) & this ) {
 		(this.top){ 0p };
 	}
 
-	static inline forall( dtype T | is_node(T) ) {
+	static inline forall( T & | is_node(T) ) {
 		void push( __stack(T) & this, T * val ) {
 			verify( !get_next( *val ) );
@@ -126,5 +126,5 @@
 //-----------------------------------------------------------------------------
 #ifdef __cforall
-	forall(dtype TYPE)
+	forall(TYPE &)
 	#define T TYPE
 #else
@@ -144,5 +144,5 @@
 
 #ifdef __cforall
-	static inline forall( dtype T | is_node(T) ) {
+	static inline forall( T & | is_node(T) ) {
 		void ?{}( __queue(T) & this ) with( this ) {
 			(this.head){ 1p };
@@ -215,5 +215,5 @@
 //-----------------------------------------------------------------------------
 #ifdef __cforall
-	forall(dtype TYPE)
+	forall(TYPE &)
 	#define T TYPE
 	#define __getter_t * [T * & next, T * & prev] ( T & )
@@ -237,5 +237,5 @@
 
 #ifdef __cforall
-	forall(dtype T )
+	forall(T & )
 	static inline [void] ?{}( __dllist(T) & this, * [T * & next, T * & prev] ( T & ) __get ) {
 		(this.head){ 0p };
@@ -245,5 +245,5 @@
 	#define next 0
 	#define prev 1
-	static inline forall(dtype T) {
+	static inline forall(T &) {
 		void push_front( __dllist(T) & this, T & node ) with( this ) {
 			verify(__get);
Index: libcfa/src/bits/defs.hfa
===================================================================
--- libcfa/src/bits/defs.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/bits/defs.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -5,5 +5,8 @@
 // file "LICENCE" distributed with Cforall.
 //
-// defs.hfa --
+// defs.hfa -- Common macros, functions and typedefs
+// Most files depend on them and they are always useful to have.
+//
+//  *** Must not contain code specific to libcfathread ***
 //
 // Author           : Thierry Delisle
@@ -62,2 +65,11 @@
 	#endif
 }
+
+// Pause(): spin-wait hint ("pause" on x86, "YIELD" on ARM) to prevent excess processor bus usage while busy-waiting
+#if defined( __i386 ) || defined( __x86_64 )
+	#define Pause() __asm__ __volatile__ ( "pause" : : : )
+#elif defined( __ARM_ARCH )
+	#define Pause() __asm__ __volatile__ ( "YIELD" : : : )
+#else
+	#error unsupported architecture
+#endif
Index: libcfa/src/bits/locks.hfa
===================================================================
--- libcfa/src/bits/locks.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/bits/locks.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -5,5 +5,8 @@
 // file "LICENCE" distributed with Cforall.
 //
-// bits/locks.hfa -- Fast internal locks.
+// bits/locks.hfa -- Basic spinlocks that are reused in the system.
+// Used for locks that aren't specific to Cforall threads and can be used anywhere.
+//
+//  *** Must not contain code specific to libcfathread ***
 //
 // Author           : Thierry Delisle
@@ -19,19 +22,4 @@
 #include "bits/defs.hfa"
 #include <assert.h>
-
-#ifdef __cforall
-	extern "C" {
-		#include <pthread.h>
-	}
-#endif
-
-// pause to prevent excess processor bus usage
-#if defined( __i386 ) || defined( __x86_64 )
-	#define Pause() __asm__ __volatile__ ( "pause" : : : )
-#elif defined( __ARM_ARCH )
-	#define Pause() __asm__ __volatile__ ( "YIELD" : : : )
-#else
-	#error unsupported architecture
-#endif
 
 struct __spinlock_t {
@@ -104,320 +92,3 @@
 		enable_interrupts_noPoll();
 	}
-
-
-	#ifdef __CFA_WITH_VERIFY__
-		extern bool __cfaabi_dbg_in_kernel();
-	#endif
-
-	extern "C" {
-		char * strerror(int);
-	}
-	#define CHECKED(x) { int err = x; if( err != 0 ) abort("KERNEL ERROR: Operation \"" #x "\" return error %d - %s\n", err, strerror(err)); }
-
-	struct __bin_sem_t {
-		pthread_mutex_t 	lock;
-		pthread_cond_t  	cond;
-		int     		val;
-	};
-
-	static inline void ?{}(__bin_sem_t & this) with( this ) {
-		// Create the mutex with error checking
-		pthread_mutexattr_t mattr;
-		pthread_mutexattr_init( &mattr );
-		pthread_mutexattr_settype( &mattr, PTHREAD_MUTEX_ERRORCHECK_NP);
-		pthread_mutex_init(&lock, &mattr);
-
-		pthread_cond_init (&cond, (const pthread_condattr_t *)0p);  // workaround trac#208: cast should not be required
-		val = 0;
-	}
-
-	static inline void ^?{}(__bin_sem_t & this) with( this ) {
-		CHECKED( pthread_mutex_destroy(&lock) );
-		CHECKED( pthread_cond_destroy (&cond) );
-	}
-
-	static inline void wait(__bin_sem_t & this) with( this ) {
-		verify(__cfaabi_dbg_in_kernel());
-		CHECKED( pthread_mutex_lock(&lock) );
-			while(val < 1) {
-				pthread_cond_wait(&cond, &lock);
-			}
-			val -= 1;
-		CHECKED( pthread_mutex_unlock(&lock) );
-	}
-
-	static inline bool post(__bin_sem_t & this) with( this ) {
-		bool needs_signal = false;
-
-		CHECKED( pthread_mutex_lock(&lock) );
-			if(val < 1) {
-				val += 1;
-				pthread_cond_signal(&cond);
-				needs_signal = true;
-			}
-		CHECKED( pthread_mutex_unlock(&lock) );
-
-		return needs_signal;
-	}
-
-	#undef CHECKED
-
-	struct $thread;
-	extern void park( void );
-	extern void unpark( struct $thread * this );
-	static inline struct $thread * active_thread ();
-
-	// Semaphore which only supports a single thread
-	struct single_sem {
-		struct $thread * volatile ptr;
-	};
-
-	static inline {
-		void  ?{}(single_sem & this) {
-			this.ptr = 0p;
-		}
-
-		void ^?{}(single_sem &) {}
-
-		bool wait(single_sem & this) {
-			for() {
-				struct $thread * expected = this.ptr;
-				if(expected == 1p) {
-					if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-						return false;
-					}
-				}
-				else {
-					/* paranoid */ verify( expected == 0p );
-					if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-						park();
-						return true;
-					}
-				}
-
-			}
-		}
-
-		bool post(single_sem & this) {
-			for() {
-				struct $thread * expected = this.ptr;
-				if(expected == 1p) return false;
-				if(expected == 0p) {
-					if(__atomic_compare_exchange_n(&this.ptr, &expected, 1p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-						return false;
-					}
-				}
-				else {
-					if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-						unpark( expected );
-						return true;
-					}
-				}
-			}
-		}
-	}
-
-	// Synchronozation primitive which only supports a single thread and one post
-	// Similar to a binary semaphore with a 'one shot' semantic
-	// is expected to be discarded after each party call their side
-	struct oneshot {
-		// Internal state :
-		//     0p     : is initial state (wait will block)
-		//     1p     : fulfilled (wait won't block)
-		// any thread : a thread is currently waiting
-		struct $thread * volatile ptr;
-	};
-
-	static inline {
-		void  ?{}(oneshot & this) {
-			this.ptr = 0p;
-		}
-
-		void ^?{}(oneshot &) {}
-
-		// Wait for the post, return immidiately if it already happened.
-		// return true if the thread was parked
-		bool wait(oneshot & this) {
-			for() {
-				struct $thread * expected = this.ptr;
-				if(expected == 1p) return false;
-				/* paranoid */ verify( expected == 0p );
-				if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-					park();
-					/* paranoid */ verify( this.ptr == 1p );
-					return true;
-				}
-			}
-		}
-
-		// Mark as fulfilled, wake thread if needed
-		// return true if a thread was unparked
-		bool post(oneshot & this) {
-			struct $thread * got = __atomic_exchange_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
-			if( got == 0p ) return false;
-			unpark( got );
-			return true;
-		}
-	}
-
-	// base types for future to build upon
-	// It is based on the 'oneshot' type to allow multiple futures
-	// to block on the same instance, permitting users to block a single
-	// thread on "any of" [a given set of] futures.
-	// does not support multiple threads waiting on the same future
-	struct future_t {
-		// Internal state :
-		//     0p      : is initial state (wait will block)
-		//     1p      : fulfilled (wait won't block)
-		//     2p      : in progress ()
-		//     3p      : abandoned, server should delete
-		// any oneshot : a context has been setup to wait, a thread could wait on it
-		struct oneshot * volatile ptr;
-	};
-
-	static inline {
-		void  ?{}(future_t & this) {
-			this.ptr = 0p;
-		}
-
-		void ^?{}(future_t &) {}
-
-		void reset(future_t & this) {
-			// needs to be in 0p or 1p
-			__atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);
-		}
-
-		// check if the future is available
-		bool available( future_t & this ) {
-			return this.ptr == 1p;
-		}
-
-		// Prepare the future to be waited on
-		// intented to be use by wait, wait_any, waitfor, etc. rather than used directly
-		bool setup( future_t & this, oneshot & wait_ctx ) {
-			/* paranoid */ verify( wait_ctx.ptr == 0p );
-			// The future needs to set the wait context
-			for() {
-				struct oneshot * expected = this.ptr;
-				// Is the future already fulfilled?
-				if(expected == 1p) return false; // Yes, just return false (didn't block)
-
-				// The future is not fulfilled, try to setup the wait context
-				/* paranoid */ verify( expected == 0p );
-				if(__atomic_compare_exchange_n(&this.ptr, &expected, &wait_ctx, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-					return true;
-				}
-			}
-		}
-
-		// Stop waiting on a future
-		// When multiple futures are waited for together in "any of" pattern
-		// futures that weren't fulfilled before the thread woke up
-		// should retract the wait ctx
-		// intented to be use by wait, wait_any, waitfor, etc. rather than used directly
-		void retract( future_t & this, oneshot & wait_ctx ) {
-			// Remove the wait context
-			struct oneshot * got = __atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);
-
-			// got == 0p: future was never actually setup, just return
-			if( got == 0p ) return;
-
-			// got == wait_ctx: since fulfil does an atomic_swap,
-			// if we got back the original then no one else saw context
-			// It is safe to delete (which could happen after the return)
-			if( got == &wait_ctx ) return;
-
-			// got == 1p: the future is ready and the context was fully consumed
-			// the server won't use the pointer again
-			// It is safe to delete (which could happen after the return)
-			if( got == 1p ) return;
-
-			// got == 2p: the future is ready but the context hasn't fully been consumed
-			// spin until it is safe to move on
-			if( got == 2p ) {
-				while( this.ptr != 1p ) Pause();
-				return;
-			}
-
-			// got == any thing else, something wen't wrong here, abort
-			abort("Future in unexpected state");
-		}
-
-		// Mark the future as abandoned, meaning it will be deleted by the server
-		bool abandon( future_t & this ) {
-			/* paranoid */ verify( this.ptr != 3p );
-
-			// Mark the future as abandonned
-			struct oneshot * got = __atomic_exchange_n( &this.ptr, 3p, __ATOMIC_SEQ_CST);
-
-			// If the future isn't already fulfilled, let the server delete it
-			if( got == 0p ) return false;
-
-			// got == 2p: the future is ready but the context hasn't fully been consumed
-			// spin until it is safe to move on
-			if( got == 2p ) {
-				while( this.ptr != 1p ) Pause();
-				got = 1p;
-			}
-
-			// The future is completed delete it now
-			/* paranoid */ verify( this.ptr != 1p );
-			free( &this );
-			return true;
-		}
-
-		// from the server side, mark the future as fulfilled
-		// delete it if needed
-		bool fulfil( future_t & this ) {
-			for() {
-				struct oneshot * expected = this.ptr;
-				// was this abandoned?
-				#if defined(__GNUC__) && __GNUC__ >= 7
-					#pragma GCC diagnostic push
-					#pragma GCC diagnostic ignored "-Wfree-nonheap-object"
-				#endif
-					if( expected == 3p ) { free( &this ); return false; }
-				#if defined(__GNUC__) && __GNUC__ >= 7
-					#pragma GCC diagnostic pop
-				#endif
-
-				/* paranoid */ verify( expected != 1p ); // Future is already fulfilled, should not happen
-				/* paranoid */ verify( expected != 2p ); // Future is bein fulfilled by someone else, this is even less supported then the previous case.
-
-				// If there is a wait context, we need to consume it and mark it as consumed after
-				// If there is no context then we can skip the in progress phase
-				struct oneshot * want = expected == 0p ? 1p : 2p;
-				if(__atomic_compare_exchange_n(&this.ptr, &expected, want, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
-					if( expected == 0p ) { /* paranoid */ verify( this.ptr == 1p); return false; }
-					bool ret = post( *expected );
-					__atomic_store_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
-					return ret;
-				}
-			}
-
-		}
-
-		// Wait for the future to be fulfilled
-		bool wait( future_t & this ) {
-			oneshot temp;
-			if( !setup(this, temp) ) return false;
-
-			// Wait context is setup, just wait on it
-			bool ret = wait( temp );
-
-			// Wait for the future to tru
-			while( this.ptr == 2p ) Pause();
-			// Make sure the state makes sense
-			// Should be fulfilled, could be in progress but it's out of date if so
-			// since if that is the case, the oneshot was fulfilled (unparking this thread)
-			// and the oneshot should not be needed any more
-			__attribute__((unused)) struct oneshot * was = this.ptr;
-			/* paranoid */ verifyf( was == 1p, "Expected this.ptr to be 1p, was %p\n", was );
-
-			// Mark the future as fulfilled, to be consistent
-			// with potential calls to avail
-			// this.ptr = 1p;
-			return ret;
-		}
-	}
 #endif
Index: libcfa/src/bits/queue.hfa
===================================================================
--- libcfa/src/bits/queue.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/bits/queue.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -9,5 +9,5 @@
 // instead of being null.
 
-forall( dtype T | { T *& Next ( T * ); } ) {
+forall( T & | { T *& Next ( T * ); } ) {
 	struct Queue {
 		inline Collection;								// Plan 9 inheritance
@@ -151,5 +151,5 @@
 } // distribution
 
-forall( dtype T | { T *& Next ( T * ); } ) {
+forall( T & | { T *& Next ( T * ); } ) {
 	struct QueueIter {
 		inline ColIter;									// Plan 9 inheritance
Index: libcfa/src/bits/sequence.hfa
===================================================================
--- libcfa/src/bits/sequence.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/bits/sequence.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -29,5 +29,5 @@
 
 	// // wrappers to make Collection have T
-	// forall( dtype T ) {
+	// forall( T & ) {
 	// 	T *& Back( T * n ) {
 	// 		return (T *)Back( (Seqable *)n );
@@ -43,5 +43,5 @@
 // and the back field of the last node points at the first node (circular).
 
-forall( dtype T | { T *& Back ( T * ); T *& Next ( T * ); } ) {
+forall( T & | { T *& Back ( T * ); T *& Next ( T * ); } ) {
 	struct Sequence {
 		inline Collection;								// Plan 9 inheritance
@@ -231,5 +231,5 @@
 } // distribution
 
-forall( dtype T | { T *& Back ( T * ); T *& Next ( T * ); } ) {
+forall( T & | { T *& Back ( T * ); T *& Next ( T * ); } ) {
 	// SeqIter(T) is used to iterate over a Sequence(T) in head-to-tail order.
 	struct SeqIter {
Index: libcfa/src/bits/stack.hfa
===================================================================
--- libcfa/src/bits/stack.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/bits/stack.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -9,5 +9,5 @@
 // instead of being null.
 
-forall( dtype T | { T *& Next ( T * ); } ) {
+forall( T & | { T *& Next ( T * ); } ) {
 	struct Stack {
 		inline Collection;								// Plan 9 inheritance
@@ -67,5 +67,5 @@
 // order returned by drop().
 
-forall( dtype T | { T *& Next ( T * ); } ) {
+forall( T & | { T *& Next ( T * ); } ) {
 	struct StackIter {
 		inline ColIter;									// Plan 9 inheritance
Index: libcfa/src/common.cfa
===================================================================
--- libcfa/src/common.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/common.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -23,5 +23,5 @@
 [ long int, long int ] div( long int num, long int denom ) { ldiv_t qr = ldiv( num, denom ); return [ qr.quot, qr.rem ]; }
 [ long long int, long long int ] div( long long int num, long long int denom ) { lldiv_t qr = lldiv( num, denom ); return [ qr.quot, qr.rem ]; }
-forall( otype T | { T ?/?( T, T ); T ?%?( T, T ); } )
+forall( T | { T ?/?( T, T ); T ?%?( T, T ); } )
 [ T, T ] div( T num, T denom ) { return [ num / denom, num % denom ]; }
 
Index: libcfa/src/common.hfa
===================================================================
--- libcfa/src/common.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/common.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -21,5 +21,5 @@
 [ long int, long int ] div( long int num, long int denom );
 [ long long int, long long int ] div( long long int num, long long int denom );
-forall( otype T | { T ?/?( T, T ); T ?%?( T, T ); } )
+forall( T | { T ?/?( T, T ); T ?%?( T, T ); } )
 [ T, T ] div( T num, T demon );
 
@@ -61,5 +61,5 @@
 } // distribution
 
-forall( otype T | { void ?{}( T &, zero_t ); int ?<?( T, T ); T -?( T ); } )
+forall( T | { void ?{}( T &, zero_t ); int ?<?( T, T ); T -?( T ); } )
 T abs( T );
 
@@ -70,5 +70,5 @@
 	intptr_t min( intptr_t t1, intptr_t t2 ) { return t1 < t2 ? t1 : t2; } // optimization
 	uintptr_t min( uintptr_t t1, uintptr_t t2 ) { return t1 < t2 ? t1 : t2; } // optimization
-	forall( otype T | { int ?<?( T, T ); } )
+	forall( T | { int ?<?( T, T ); } )
 	T min( T t1, T t2 ) { return t1 < t2 ? t1 : t2; }
 
@@ -76,11 +76,11 @@
 	intptr_t max( intptr_t t1, intptr_t t2 ) { return t1 > t2 ? t1 : t2; } // optimization
 	uintptr_t max( uintptr_t t1, uintptr_t t2 ) { return t1 > t2 ? t1 : t2; } // optimization
-	forall( otype T | { int ?>?( T, T ); } )
+	forall( T | { int ?>?( T, T ); } )
 	T max( T t1, T t2 ) { return t1 > t2 ? t1 : t2; }
 
-	forall( otype T | { T min( T, T ); T max( T, T ); } )
+	forall( T | { T min( T, T ); T max( T, T ); } )
 	T clamp( T value, T min_val, T max_val ) { return max( min_val, min( value, max_val ) ); }
 
-	forall( otype T )
+	forall( T )
 	void swap( T & v1, T & v2 ) { T temp = v1; v1 = v2; v2 = temp; }
 } // distribution
Index: libcfa/src/concurrency/coroutine.cfa
===================================================================
--- libcfa/src/concurrency/coroutine.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/concurrency/coroutine.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -46,10 +46,10 @@
 
 //-----------------------------------------------------------------------------
-FORALL_DATA_INSTANCE(CoroutineCancelled, (dtype coroutine_t), (coroutine_t))
-
-forall(dtype T)
+FORALL_DATA_INSTANCE(CoroutineCancelled, (coroutine_t &), (coroutine_t))
+
+forall(T &)
 void mark_exception(CoroutineCancelled(T) *) {}
 
-forall(dtype T)
+forall(T &)
 void copy(CoroutineCancelled(T) * dst, CoroutineCancelled(T) * src) {
 	dst->virtual_table = src->virtual_table;
@@ -58,5 +58,5 @@
 }
 
-forall(dtype T)
+forall(T &)
 const char * msg(CoroutineCancelled(T) *) {
 	return "CoroutineCancelled(...)";
@@ -64,5 +64,5 @@
 
 // This code should not be inlined. It is the error path on resume.
-forall(dtype T | is_coroutine(T))
+forall(T & | is_coroutine(T))
 void __cfaehm_cancelled_coroutine( T & cor, $coroutine * desc ) {
 	verify( desc->cancellation );
@@ -148,5 +148,5 @@
 // Part of the Public API
 // Not inline since only ever called once per coroutine
-forall(dtype T | is_coroutine(T))
+forall(T & | is_coroutine(T))
 void prime(T& cor) {
 	$coroutine* this = get_coroutine(cor);
Index: libcfa/src/concurrency/coroutine.hfa
===================================================================
--- libcfa/src/concurrency/coroutine.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/concurrency/coroutine.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -22,13 +22,13 @@
 //-----------------------------------------------------------------------------
 // Exception thrown from resume when a coroutine stack is cancelled.
-FORALL_DATA_EXCEPTION(CoroutineCancelled, (dtype coroutine_t), (coroutine_t)) (
+FORALL_DATA_EXCEPTION(CoroutineCancelled, (coroutine_t &), (coroutine_t)) (
 	coroutine_t * the_coroutine;
 	exception_t * the_exception;
 );
 
-forall(dtype T)
+forall(T &)
 void copy(CoroutineCancelled(T) * dst, CoroutineCancelled(T) * src);
 
-forall(dtype T)
+forall(T &)
 const char * msg(CoroutineCancelled(T) *);
 
@@ -37,5 +37,5 @@
 // Anything that implements this trait can be resumed.
 // Anything that is resumed is a coroutine.
-trait is_coroutine(dtype T | IS_RESUMPTION_EXCEPTION(CoroutineCancelled, (T))) {
+trait is_coroutine(T & | IS_RESUMPTION_EXCEPTION(CoroutineCancelled, (T))) {
 	void main(T & this);
 	$coroutine * get_coroutine(T & this);
@@ -60,5 +60,5 @@
 //-----------------------------------------------------------------------------
 // Public coroutine API
-forall(dtype T | is_coroutine(T))
+forall(T & | is_coroutine(T))
 void prime(T & cor);
 
@@ -72,5 +72,5 @@
 	void __cfactx_invoke_coroutine(void (*main)(void *), void * this);
 
-	forall(dtype T)
+	forall(T &)
 	void __cfactx_start(void (*main)(T &), struct $coroutine * cor, T & this, void (*invoke)(void (*main)(void *), void *));
 
@@ -129,9 +129,9 @@
 }
 
-forall(dtype T | is_coroutine(T))
+forall(T & | is_coroutine(T))
 void __cfaehm_cancelled_coroutine( T & cor, $coroutine * desc );
 
 // Resume implementation inlined for performance
-forall(dtype T | is_coroutine(T))
+forall(T & | is_coroutine(T))
 static inline T & resume(T & cor) {
 	// optimization : read TLS once and reuse it
Index: libcfa/src/concurrency/future.hfa
===================================================================
--- libcfa/src/concurrency/future.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/concurrency/future.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -19,5 +19,5 @@
 #include "monitor.hfa"
 
-forall( otype T ) {
+forall( T ) {
 	struct future {
 		inline future_t;
@@ -58,5 +58,5 @@
 }
 
-forall( otype T ) {
+forall( T ) {
 	monitor multi_future {
 		inline future_t;
Index: libcfa/src/concurrency/io/types.hfa
===================================================================
--- libcfa/src/concurrency/io/types.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/concurrency/io/types.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -5,5 +5,6 @@
 // file "LICENCE" distributed with Cforall.
 //
-// io/types.hfa --
+// io/types.hfa -- PRIVATE
+// Types used by the I/O subsystem
 //
 // Author           : Thierry Delisle
@@ -21,4 +22,5 @@
 
 #include "bits/locks.hfa"
+#include "kernel/fwd.hfa"
 
 #if defined(CFA_HAVE_LINUX_IO_URING_H)
Index: libcfa/src/concurrency/kernel.cfa
===================================================================
--- libcfa/src/concurrency/kernel.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/concurrency/kernel.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -224,5 +224,5 @@
 	}
 
-	V( this->terminated );
+	post( this->terminated );
 
 	if(this == mainProcessor) {
@@ -624,30 +624,6 @@
 // Unexpected Terminating logic
 //=============================================================================================
-
-extern "C" {
-	extern void __cfaabi_real_abort(void);
-}
-static volatile bool kernel_abort_called = false;
-
-void * kernel_abort(void) __attribute__ ((__nothrow__)) {
-	// abort cannot be recursively entered by the same or different processors because all signal handlers return when
-	// the globalAbort flag is true.
-	bool first = !__atomic_test_and_set( &kernel_abort_called, __ATOMIC_SEQ_CST);
-
-	// first task to abort ?
-	if ( !first ) {
-		// We aren't the first to abort.
-		// I give up, just let C handle it
-		__cfaabi_real_abort();
-	}
-
-	// disable interrupts, it no longer makes sense to try to interrupt this processor
-	disable_interrupts();
-
-	return __cfaabi_tls.this_thread;
-}
-
-void kernel_abort_msg( void * kernel_data, char * abort_text, int abort_text_size ) {
-	$thread * thrd = ( $thread * ) kernel_data;
+void __kernel_abort_msg( char * abort_text, int abort_text_size ) {
+	$thread * thrd = __cfaabi_tls.this_thread;
 
 	if(thrd) {
@@ -669,6 +645,6 @@
 }
 
-int kernel_abort_lastframe( void ) __attribute__ ((__nothrow__)) {
-	return get_coroutine(kernelTLS().this_thread) == get_coroutine(mainThread) ? 4 : 2;
+int __kernel_abort_lastframe( void ) __attribute__ ((__nothrow__)) {
+	return get_coroutine(__cfaabi_tls.this_thread) == get_coroutine(mainThread) ? 4 : 2;
 }
 
@@ -688,62 +664,4 @@
 // Kernel Utilities
 //=============================================================================================
-//-----------------------------------------------------------------------------
-// Locks
-void  ?{}( semaphore & this, int count = 1 ) {
-	(this.lock){};
-	this.count = count;
-	(this.waiting){};
-}
-void ^?{}(semaphore & this) {}
-
-bool P(semaphore & this) with( this ){
-	lock( lock __cfaabi_dbg_ctx2 );
-	count -= 1;
-	if ( count < 0 ) {
-		// queue current task
-		append( waiting, active_thread() );
-
-		// atomically release spin lock and block
-		unlock( lock );
-		park();
-		return true;
-	}
-	else {
-	    unlock( lock );
-	    return false;
-	}
-}
-
-bool V(semaphore & this) with( this ) {
-	$thread * thrd = 0p;
-	lock( lock __cfaabi_dbg_ctx2 );
-	count += 1;
-	if ( count <= 0 ) {
-		// remove task at head of waiting list
-		thrd = pop_head( waiting );
-	}
-
-	unlock( lock );
-
-	// make new owner
-	unpark( thrd );
-
-	return thrd != 0p;
-}
-
-bool V(semaphore & this, unsigned diff) with( this ) {
-	$thread * thrd = 0p;
-	lock( lock __cfaabi_dbg_ctx2 );
-	int release = max(-count, (int)diff);
-	count += diff;
-	for(release) {
-		unpark( pop_head( waiting ) );
-	}
-
-	unlock( lock );
-
-	return thrd != 0p;
-}
-
 //-----------------------------------------------------------------------------
 // Debug
Index: libcfa/src/concurrency/kernel.hfa
===================================================================
--- libcfa/src/concurrency/kernel.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/concurrency/kernel.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -5,5 +5,5 @@
 // file "LICENCE" distributed with Cforall.
 //
-// kernel --
+// kernel -- Header containing the core of the kernel API
 //
 // Author           : Thierry Delisle
@@ -24,20 +24,66 @@
 extern "C" {
 	#include <bits/pthreadtypes.h>
+	#include <pthread.h>
 	#include <linux/types.h>
 }
 
 //-----------------------------------------------------------------------------
-// Locks
-struct semaphore {
-	__spinlock_t lock;
-	int count;
-	__queue_t($thread) waiting;
-};
-
-void  ?{}(semaphore & this, int count = 1);
-void ^?{}(semaphore & this);
-bool   P (semaphore & this);
-bool   V (semaphore & this);
-bool   V (semaphore & this, unsigned count);
+// Underlying Locks
+#ifdef __CFA_WITH_VERIFY__
+	extern bool __cfaabi_dbg_in_kernel();
+#endif
+
+extern "C" {
+	char * strerror(int);
+}
+#define CHECKED(x) { int err = x; if( err != 0 ) abort("KERNEL ERROR: Operation \"" #x "\" return error %d - %s\n", err, strerror(err)); }
+
+struct __bin_sem_t {
+	pthread_mutex_t 	lock;
+	pthread_cond_t  	cond;
+	int     		val;
+};
+
+static inline void ?{}(__bin_sem_t & this) with( this ) {
+	// Create the mutex with error checking
+	pthread_mutexattr_t mattr;
+	pthread_mutexattr_init( &mattr );
+	pthread_mutexattr_settype( &mattr, PTHREAD_MUTEX_ERRORCHECK_NP);
+	pthread_mutex_init(&lock, &mattr);
+
+	pthread_cond_init (&cond, (const pthread_condattr_t *)0p);  // workaround trac#208: cast should not be required
+	val = 0;
+}
+
+static inline void ^?{}(__bin_sem_t & this) with( this ) {
+	CHECKED( pthread_mutex_destroy(&lock) );
+	CHECKED( pthread_cond_destroy (&cond) );
+}
+
+static inline void wait(__bin_sem_t & this) with( this ) {
+	verify(__cfaabi_dbg_in_kernel());
+	CHECKED( pthread_mutex_lock(&lock) );
+		while(val < 1) {
+			pthread_cond_wait(&cond, &lock);
+		}
+		val -= 1;
+	CHECKED( pthread_mutex_unlock(&lock) );
+}
+
+static inline bool post(__bin_sem_t & this) with( this ) {
+	bool needs_signal = false;
+
+	CHECKED( pthread_mutex_lock(&lock) );
+		if(val < 1) {
+			val += 1;
+			pthread_cond_signal(&cond);
+			needs_signal = true;
+		}
+	CHECKED( pthread_mutex_unlock(&lock) );
+
+	return needs_signal;
+}
+
+#undef CHECKED
 
 
@@ -91,5 +137,5 @@
 
 	// Termination synchronisation (user semaphore)
-	semaphore terminated;
+	oneshot terminated;
 
 	// pthread Stack
Index: libcfa/src/concurrency/kernel/fwd.hfa
===================================================================
--- libcfa/src/concurrency/kernel/fwd.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/concurrency/kernel/fwd.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -5,5 +5,6 @@
 // file "LICENCE" distributed with Cforall.
 //
-// kernel/fwd.hfa --
+// kernel/fwd.hfa -- PUBLIC
+// Fundamental code needed to implement threading M.E.S. algorithms.
 //
 // Author           : Thierry Delisle
@@ -134,4 +135,258 @@
 		extern uint64_t thread_rand();
 
+		// Semaphore which only supports a single thread
+		struct single_sem {
+			struct $thread * volatile ptr;
+		};
+
+		static inline {
+			void  ?{}(single_sem & this) {
+				this.ptr = 0p;
+			}
+
+			void ^?{}(single_sem &) {}
+
+			bool wait(single_sem & this) {
+				for() {
+					struct $thread * expected = this.ptr;
+					if(expected == 1p) {
+						if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+							return false;
+						}
+					}
+					else {
+						/* paranoid */ verify( expected == 0p );
+						if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+							park();
+							return true;
+						}
+					}
+
+				}
+			}
+
+			bool post(single_sem & this) {
+				for() {
+					struct $thread * expected = this.ptr;
+					if(expected == 1p) return false;
+					if(expected == 0p) {
+						if(__atomic_compare_exchange_n(&this.ptr, &expected, 1p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+							return false;
+						}
+					}
+					else {
+						if(__atomic_compare_exchange_n(&this.ptr, &expected, 0p, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+							unpark( expected );
+							return true;
+						}
+					}
+				}
+			}
+		}
+
+		// Synchronization primitive which only supports a single thread and one post
+		// Similar to a binary semaphore with a 'one shot' semantic
+		// It is expected to be discarded after each party calls its side
+		struct oneshot {
+			// Internal state :
+			//     0p     : is initial state (wait will block)
+			//     1p     : fulfilled (wait won't block)
+			// any thread : a thread is currently waiting
+			struct $thread * volatile ptr;
+		};
+
+		static inline {
+			void  ?{}(oneshot & this) {
+				this.ptr = 0p;
+			}
+
+			void ^?{}(oneshot &) {}
+
+			// Wait for the post, return immediately if it already happened.
+			// return true if the thread was parked
+			bool wait(oneshot & this) {
+				for() {
+					struct $thread * expected = this.ptr;
+					if(expected == 1p) return false;
+					/* paranoid */ verify( expected == 0p );
+					if(__atomic_compare_exchange_n(&this.ptr, &expected, active_thread(), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+						park();
+						/* paranoid */ verify( this.ptr == 1p );
+						return true;
+					}
+				}
+			}
+
+			// Mark as fulfilled, wake thread if needed
+			// return true if a thread was unparked
+			bool post(oneshot & this) {
+				struct $thread * got = __atomic_exchange_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
+				if( got == 0p ) return false;
+				unpark( got );
+				return true;
+			}
+		}
+
+		// base types for future to build upon
+		// It is based on the 'oneshot' type to allow multiple futures
+		// to block on the same instance, permitting users to block a single
+		// thread on "any of" [a given set of] futures.
+		// does not support multiple threads waiting on the same future
+		struct future_t {
+			// Internal state :
+			//     0p      : is initial state (wait will block)
+			//     1p      : fulfilled (wait won't block)
+			//     2p      : in progress (fulfil is consuming the wait context)
+			//     3p      : abandoned, server should delete
+			// any oneshot : a context has been setup to wait, a thread could wait on it
+			struct oneshot * volatile ptr;
+		};
+
+		static inline {
+			void  ?{}(future_t & this) {
+				this.ptr = 0p;
+			}
+
+			void ^?{}(future_t &) {}
+
+			void reset(future_t & this) {
+				// needs to be in 0p or 1p
+				__atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);
+			}
+
+			// check if the future is available
+			bool available( future_t & this ) {
+				return this.ptr == 1p;
+			}
+
+			// Prepare the future to be waited on
+			// intended to be used by wait, wait_any, waitfor, etc. rather than used directly
+			bool setup( future_t & this, oneshot & wait_ctx ) {
+				/* paranoid */ verify( wait_ctx.ptr == 0p );
+				// The future needs to set the wait context
+				for() {
+					struct oneshot * expected = this.ptr;
+					// Is the future already fulfilled?
+					if(expected == 1p) return false; // Yes, just return false (didn't block)
+
+					// The future is not fulfilled, try to setup the wait context
+					/* paranoid */ verify( expected == 0p );
+					if(__atomic_compare_exchange_n(&this.ptr, &expected, &wait_ctx, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+						return true;
+					}
+				}
+			}
+
+			// Stop waiting on a future
+			// When multiple futures are waited for together in "any of" pattern
+			// futures that weren't fulfilled before the thread woke up
+			// should retract the wait ctx
+			// intended to be used by wait, wait_any, waitfor, etc. rather than used directly
+			void retract( future_t & this, oneshot & wait_ctx ) {
+				// Remove the wait context
+				struct oneshot * got = __atomic_exchange_n( &this.ptr, 0p, __ATOMIC_SEQ_CST);
+
+				// got == 0p: future was never actually setup, just return
+				if( got == 0p ) return;
+
+				// got == wait_ctx: since fulfil does an atomic_swap,
+				// if we got back the original then no one else saw context
+				// It is safe to delete (which could happen after the return)
+				if( got == &wait_ctx ) return;
+
+				// got == 1p: the future is ready and the context was fully consumed
+				// the server won't use the pointer again
+				// It is safe to delete (which could happen after the return)
+				if( got == 1p ) return;
+
+				// got == 2p: the future is ready but the context hasn't fully been consumed
+				// spin until it is safe to move on
+				if( got == 2p ) {
+					while( this.ptr != 1p ) Pause();
+					return;
+				}
+
+				// got == anything else: something went wrong here, abort
+				abort("Future in unexpected state");
+			}
+
+			// Mark the future as abandoned, meaning it will be deleted by the server
+			bool abandon( future_t & this ) {
+				/* paranoid */ verify( this.ptr != 3p );
+
+				// Mark the future as abandoned
+				struct oneshot * got = __atomic_exchange_n( &this.ptr, 3p, __ATOMIC_SEQ_CST);
+
+				// If the future isn't already fulfilled, let the server delete it
+				if( got == 0p ) return false;
+
+				// got == 2p: the future is ready but the context hasn't fully been consumed
+				// spin until it is safe to move on
+				if( got == 2p ) {
+					while( this.ptr != 1p ) Pause();
+					got = 1p;
+				}
+
+				// The future is completed delete it now
+				/* paranoid */ verify( this.ptr != 1p );
+				free( &this );
+				return true;
+			}
+
+			// from the server side, mark the future as fulfilled
+			// delete it if needed
+			bool fulfil( future_t & this ) {
+				for() {
+					struct oneshot * expected = this.ptr;
+					// was this abandoned?
+					#if defined(__GNUC__) && __GNUC__ >= 7
+						#pragma GCC diagnostic push
+						#pragma GCC diagnostic ignored "-Wfree-nonheap-object"
+					#endif
+						if( expected == 3p ) { free( &this ); return false; }
+					#if defined(__GNUC__) && __GNUC__ >= 7
+						#pragma GCC diagnostic pop
+					#endif
+
+					/* paranoid */ verify( expected != 1p ); // Future is already fulfilled, should not happen
+					/* paranoid */ verify( expected != 2p ); // Future is being fulfilled by someone else, this is even less supported than the previous case.
+
+					// If there is a wait context, we need to consume it and mark it as consumed after
+					// If there is no context then we can skip the in progress phase
+					struct oneshot * want = expected == 0p ? 1p : 2p;
+					if(__atomic_compare_exchange_n(&this.ptr, &expected, want, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
+						if( expected == 0p ) { /* paranoid */ verify( this.ptr == 1p); return false; }
+						bool ret = post( *expected );
+						__atomic_store_n( &this.ptr, 1p, __ATOMIC_SEQ_CST);
+						return ret;
+					}
+				}
+
+			}
+
+			// Wait for the future to be fulfilled
+			bool wait( future_t & this ) {
+				oneshot temp;
+				if( !setup(this, temp) ) return false;
+
+				// Wait context is setup, just wait on it
+				bool ret = wait( temp );
+
+				// Wait for the future to leave the in-progress state (2p)
+				while( this.ptr == 2p ) Pause();
+				// Make sure the state makes sense
+				// Should be fulfilled, could be in progress but it's out of date if so
+				// since if that is the case, the oneshot was fulfilled (unparking this thread)
+				// and the oneshot should not be needed any more
+				__attribute__((unused)) struct oneshot * was = this.ptr;
+				/* paranoid */ verifyf( was == 1p, "Expected this.ptr to be 1p, was %p\n", was );
+
+				// Mark the future as fulfilled, to be consistent
+				// with potential calls to available
+				// this.ptr = 1p;
+				return ret;
+			}
+		}
+
 		//-----------------------------------------------------------------------
 		// Statics call at the end of each thread to register statistics
Index: libcfa/src/concurrency/kernel/startup.cfa
===================================================================
--- libcfa/src/concurrency/kernel/startup.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/concurrency/kernel/startup.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -199,5 +199,5 @@
 	void ?{}(processor & this) with( this ) {
 		( this.idle ){};
-		( this.terminated ){ 0 };
+		( this.terminated ){};
 		( this.runner ){};
 		init( this, "Main Processor", *mainCluster );
@@ -528,5 +528,5 @@
 void ?{}(processor & this, const char name[], cluster & _cltr) {
 	( this.idle ){};
-	( this.terminated ){ 0 };
+	( this.terminated ){};
 	( this.runner ){};
 
@@ -549,5 +549,5 @@
 		__wake_proc( &this );
 
-		P( terminated );
+		wait( terminated );
 		/* paranoid */ verify( active_processor() != &this);
 	}
Index: libcfa/src/concurrency/locks.cfa
===================================================================
--- libcfa/src/concurrency/locks.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/concurrency/locks.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -7,5 +7,5 @@
 //-----------------------------------------------------------------------------
 // info_thread
-forall(dtype L | is_blocking_lock(L)) {
+forall(L & | is_blocking_lock(L)) {
 	struct info_thread {
 		// used to put info_thread on a dl queue (aka sequence)
@@ -195,5 +195,5 @@
 //-----------------------------------------------------------------------------
 // alarm node wrapper
-forall(dtype L | is_blocking_lock(L)) {
+forall(L & | is_blocking_lock(L)) {
 	struct alarm_node_wrap {
 		alarm_node_t alarm_node;
@@ -239,5 +239,5 @@
 //-----------------------------------------------------------------------------
 // condition variable
-forall(dtype L | is_blocking_lock(L)) {
+forall(L & | is_blocking_lock(L)) {
 
 	void ?{}( condition_variable(L) & this ){
@@ -356,2 +356,60 @@
 	bool wait( condition_variable(L) & this, L & l, uintptr_t info, Time time         ) with(this) { WAIT_TIME( info, &l , time ) }
 }
+
+//-----------------------------------------------------------------------------
+// Semaphore
+void  ?{}( semaphore & this, int count = 1 ) {
+	(this.lock){};
+	this.count = count;
+	(this.waiting){};
+}
+void ^?{}(semaphore & this) {}
+
+bool P(semaphore & this) with( this ){
+	lock( lock __cfaabi_dbg_ctx2 );
+	count -= 1;
+	if ( count < 0 ) {
+		// queue current task
+		append( waiting, active_thread() );
+
+		// atomically release spin lock and block
+		unlock( lock );
+		park();
+		return true;
+	}
+	else {
+	    unlock( lock );
+	    return false;
+	}
+}
+
+bool V(semaphore & this) with( this ) {
+	$thread * thrd = 0p;
+	lock( lock __cfaabi_dbg_ctx2 );
+	count += 1;
+	if ( count <= 0 ) {
+		// remove task at head of waiting list
+		thrd = pop_head( waiting );
+	}
+
+	unlock( lock );
+
+	// make new owner
+	unpark( thrd );
+
+	return thrd != 0p;
+}
+
+bool V(semaphore & this, unsigned diff) with( this ) {
+	$thread * thrd = 0p;
+	lock( lock __cfaabi_dbg_ctx2 );
+	int release = max(-count, (int)diff);
+	count += diff;
+	for(release) {
+		unpark( pop_head( waiting ) );
+	}
+
+	unlock( lock );
+
+	return thrd != 0p;
+}
Index: libcfa/src/concurrency/locks.hfa
===================================================================
--- libcfa/src/concurrency/locks.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/concurrency/locks.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -13,5 +13,5 @@
 //-----------------------------------------------------------------------------
 // is_blocking_lock
-trait is_blocking_lock(dtype L | sized(L)) {
+trait is_blocking_lock(L & | sized(L)) {
 	// For synchronization locks to use when acquiring
 	void on_notify( L &, struct $thread * );
@@ -31,5 +31,5 @@
 // the info thread is a wrapper around a thread used
 // to store extra data for use in the condition variable
-forall(dtype L | is_blocking_lock(L)) {
+forall(L & | is_blocking_lock(L)) {
 	struct info_thread;
 
@@ -120,5 +120,5 @@
 //-----------------------------------------------------------------------------
 // Synchronization Locks
-forall(dtype L | is_blocking_lock(L)) {
+forall(L & | is_blocking_lock(L)) {
 	struct condition_variable {
 		// Spin lock used for mutual exclusion
@@ -157,2 +157,16 @@
 	bool wait( condition_variable(L) & this, L & l, uintptr_t info, Time time );
 }
+
+//-----------------------------------------------------------------------------
+// Semaphore
+struct semaphore {
+	__spinlock_t lock;
+	int count;
+	__queue_t($thread) waiting;
+};
+
+void  ?{}(semaphore & this, int count = 1);
+void ^?{}(semaphore & this);
+bool   P (semaphore & this);
+bool   V (semaphore & this);
+bool   V (semaphore & this, unsigned count);
Index: libcfa/src/concurrency/monitor.cfa
===================================================================
--- libcfa/src/concurrency/monitor.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/concurrency/monitor.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -50,5 +50,5 @@
 static inline [$thread *, int] search_entry_queue( const __waitfor_mask_t &, $monitor * monitors [], __lock_size_t count );
 
-forall(dtype T | sized( T ))
+forall(T & | sized( T ))
 static inline __lock_size_t insert_unique( T * array [], __lock_size_t & size, T * val );
 static inline __lock_size_t count_max    ( const __waitfor_mask_t & mask );
@@ -949,5 +949,5 @@
 }
 
-forall(dtype T | sized( T ))
+forall(T & | sized( T ))
 static inline __lock_size_t insert_unique( T * array [], __lock_size_t & size, T * val ) {
 	if( !val ) return size;
Index: libcfa/src/concurrency/monitor.hfa
===================================================================
--- libcfa/src/concurrency/monitor.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/concurrency/monitor.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -22,5 +22,5 @@
 #include "stdlib.hfa"
 
-trait is_monitor(dtype T) {
+trait is_monitor(T &) {
 	$monitor * get_monitor( T & );
 	void ^?{}( T & mutex );
@@ -59,5 +59,5 @@
 void ^?{}( monitor_dtor_guard_t & this );
 
-static inline forall( dtype T | sized(T) | { void ^?{}( T & mutex ); } )
+static inline forall( T & | sized(T) | { void ^?{}( T & mutex ); } )
 void delete( T * th ) {
 	^(*th){};
Index: libcfa/src/concurrency/mutex.cfa
===================================================================
--- libcfa/src/concurrency/mutex.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/concurrency/mutex.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -164,5 +164,5 @@
 }
 
-forall(dtype L | is_lock(L))
+forall(L & | is_lock(L))
 void wait(condition_variable & this, L & l) {
 	lock( this.lock __cfaabi_dbg_ctx2 );
@@ -176,5 +176,5 @@
 //-----------------------------------------------------------------------------
 // Scopes
-forall(dtype L | is_lock(L))
+forall(L & | is_lock(L))
 void lock_all  ( L * locks[], size_t count) {
 	// Sort locks based on addresses
@@ -188,5 +188,5 @@
 }
 
-forall(dtype L | is_lock(L))
+forall(L & | is_lock(L))
 void unlock_all( L * locks[], size_t count) {
 	// Lock all
Index: libcfa/src/concurrency/mutex.hfa
===================================================================
--- libcfa/src/concurrency/mutex.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/concurrency/mutex.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -70,5 +70,5 @@
 void unlock(recursive_mutex_lock & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
 
-trait is_lock(dtype L | sized(L)) {
+trait is_lock(L & | sized(L)) {
 	void lock  (L &);
 	void unlock(L &);
@@ -94,10 +94,10 @@
 void wait(condition_variable & this) __attribute__((deprecated("use concurrency/locks.hfa instead")));
 
-forall(dtype L | is_lock(L))
+forall(L & | is_lock(L))
 void wait(condition_variable & this, L & l) __attribute__((deprecated("use concurrency/locks.hfa instead")));
 
 //-----------------------------------------------------------------------------
 // Scopes
-forall(dtype L | is_lock(L)) {
+forall(L & | is_lock(L)) {
 	#if !defined( __TUPLE_ARRAYS_EXIST__ )
 	void lock  ( L * locks [], size_t count);
Index: libcfa/src/concurrency/preemption.cfa
===================================================================
--- libcfa/src/concurrency/preemption.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/concurrency/preemption.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -616,4 +616,9 @@
 }
 
+// Prevent preemption since we are about to start terminating things
+void __kernel_abort_lock(void) {
+	signal_block( SIGUSR1 );
+}
+
 // Raii ctor/dtor for the preemption_scope
 // Used by thread to control when they want to receive preemption signals
Index: libcfa/src/concurrency/thread.cfa
===================================================================
--- libcfa/src/concurrency/thread.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/concurrency/thread.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -62,7 +62,7 @@
 }
 
-FORALL_DATA_INSTANCE(ThreadCancelled, (dtype thread_t), (thread_t))
+FORALL_DATA_INSTANCE(ThreadCancelled, (thread_t &), (thread_t))
 
-forall(dtype T)
+forall(T &)
 void copy(ThreadCancelled(T) * dst, ThreadCancelled(T) * src) {
 	dst->virtual_table = src->virtual_table;
@@ -71,23 +71,23 @@
 }
 
-forall(dtype T)
+forall(T &)
 const char * msg(ThreadCancelled(T) *) {
 	return "ThreadCancelled";
 }
 
-forall(dtype T)
+forall(T &)
 static void default_thread_cancel_handler(ThreadCancelled(T) & ) {
 	abort( "Unhandled thread cancellation.\n" );
 }
 
-forall(dtype T | is_thread(T) | IS_EXCEPTION(ThreadCancelled, (T)))
+forall(T & | is_thread(T) | IS_EXCEPTION(ThreadCancelled, (T)))
 void ?{}( thread_dtor_guard_t & this,
-		T & thrd, void(*defaultResumptionHandler)(ThreadCancelled(T) &)) {
-	$monitor * m = get_monitor(thrd);
+		T & thrd, void(*cancelHandler)(ThreadCancelled(T) &)) {
+ 	$monitor * m = get_monitor(thrd);
 	$thread * desc = get_thread(thrd);
 
 	// Setup the monitor guard
 	void (*dtor)(T& mutex this) = ^?{};
-	bool join = defaultResumptionHandler != (void(*)(ThreadCancelled(T)&))0;
+	bool join = cancelHandler != (void(*)(ThreadCancelled(T)&))0;
 	(this.mg){&m, (void(*)())dtor, join};
 
@@ -103,7 +103,6 @@
 	}
 	desc->state = Cancelled;
-	if (!join) {
-		defaultResumptionHandler = default_thread_cancel_handler;
-	}
+	void(*defaultResumptionHandler)(ThreadCancelled(T) &) = 
+		join ? cancelHandler : default_thread_cancel_handler;
 
 	ThreadCancelled(T) except;
@@ -125,5 +124,5 @@
 //-----------------------------------------------------------------------------
 // Starting and stopping threads
-forall( dtype T | is_thread(T) )
+forall( T & | is_thread(T) )
 void __thrd_start( T & this, void (*main_p)(T &) ) {
 	$thread * this_thrd = get_thread(this);
@@ -141,5 +140,5 @@
 //-----------------------------------------------------------------------------
 // Support for threads that don't ues the thread keyword
-forall( dtype T | sized(T) | is_thread(T) | { void ?{}(T&); } )
+forall( T & | sized(T) | is_thread(T) | { void ?{}(T&); } )
 void ?{}( scoped(T)& this ) with( this ) {
 	handle{};
@@ -147,5 +146,5 @@
 }
 
-forall( dtype T, ttype P | sized(T) | is_thread(T) | { void ?{}(T&, P); } )
+forall( T &, P... | sized(T) | is_thread(T) | { void ?{}(T&, P); } )
 void ?{}( scoped(T)& this, P params ) with( this ) {
 	handle{ params };
@@ -153,5 +152,5 @@
 }
 
-forall( dtype T | sized(T) | is_thread(T) )
+forall( T & | sized(T) | is_thread(T) )
 void ^?{}( scoped(T)& this ) with( this ) {
 	^handle{};
@@ -159,5 +158,5 @@
 
 //-----------------------------------------------------------------------------
-forall(dtype T | is_thread(T) | IS_RESUMPTION_EXCEPTION(ThreadCancelled, (T)))
+forall(T & | is_thread(T) | IS_RESUMPTION_EXCEPTION(ThreadCancelled, (T)))
 T & join( T & this ) {
 	thread_dtor_guard_t guard = { this, defaultResumptionHandler };
Index: libcfa/src/concurrency/thread.hfa
===================================================================
--- libcfa/src/concurrency/thread.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/concurrency/thread.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -26,5 +26,5 @@
 //-----------------------------------------------------------------------------
 // thread trait
-trait is_thread(dtype T) {
+trait is_thread(T &) {
 	void ^?{}(T& mutex this);
 	void main(T& this);
@@ -32,13 +32,13 @@
 };
 
-FORALL_DATA_EXCEPTION(ThreadCancelled, (dtype thread_t), (thread_t)) (
+FORALL_DATA_EXCEPTION(ThreadCancelled, (thread_t &), (thread_t)) (
 	thread_t * the_thread;
 	exception_t * the_exception;
 );
 
-forall(dtype T)
+forall(T &)
 void copy(ThreadCancelled(T) * dst, ThreadCancelled(T) * src);
 
-forall(dtype T)
+forall(T &)
 const char * msg(ThreadCancelled(T) *);
 
@@ -47,8 +47,8 @@
 
 // Inline getters for threads/coroutines/monitors
-forall( dtype T | is_thread(T) )
+forall( T & | is_thread(T) )
 static inline $coroutine* get_coroutine(T & this) __attribute__((const)) { return &get_thread(this)->self_cor; }
 
-forall( dtype T | is_thread(T) )
+forall( T & | is_thread(T) )
 static inline $monitor  * get_monitor  (T & this) __attribute__((const)) { return &get_thread(this)->self_mon; }
 
@@ -60,5 +60,5 @@
 extern struct cluster * mainCluster;
 
-forall( dtype T | is_thread(T) )
+forall( T & | is_thread(T) )
 void __thrd_start( T & this, void (*)(T &) );
 
@@ -82,5 +82,5 @@
 };
 
-forall( dtype T | is_thread(T) | IS_EXCEPTION(ThreadCancelled, (T)) )
+forall( T & | is_thread(T) | IS_EXCEPTION(ThreadCancelled, (T)) )
 void ?{}( thread_dtor_guard_t & this, T & thrd, void(*)(ThreadCancelled(T) &) );
 void ^?{}( thread_dtor_guard_t & this );
@@ -89,16 +89,16 @@
 // thread runner
 // Structure that actually start and stop threads
-forall( dtype T | sized(T) | is_thread(T) )
+forall( T & | sized(T) | is_thread(T) )
 struct scoped {
 	T handle;
 };
 
-forall( dtype T | sized(T) | is_thread(T) | { void ?{}(T&); } )
+forall( T & | sized(T) | is_thread(T) | { void ?{}(T&); } )
 void ?{}( scoped(T)& this );
 
-forall( dtype T, ttype P | sized(T) | is_thread(T) | { void ?{}(T&, P); } )
+forall( T &, P... | sized(T) | is_thread(T) | { void ?{}(T&, P); } )
 void ?{}( scoped(T)& this, P params );
 
-forall( dtype T | sized(T) | is_thread(T) )
+forall( T & | sized(T) | is_thread(T) )
 void ^?{}( scoped(T)& this );
 
@@ -115,5 +115,5 @@
 void unpark( $thread * this );
 
-forall( dtype T | is_thread(T) )
+forall( T & | is_thread(T) )
 static inline void unpark( T & this ) { if(!&this) return; unpark( get_thread( this ) );}
 
@@ -128,5 +128,5 @@
 //----------
 // join
-forall( dtype T | is_thread(T) | IS_RESUMPTION_EXCEPTION(ThreadCancelled, (T)) )
+forall( T & | is_thread(T) | IS_RESUMPTION_EXCEPTION(ThreadCancelled, (T)) )
 T & join( T & this );
 
Index: libcfa/src/containers/list.hfa
===================================================================
--- libcfa/src/containers/list.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/containers/list.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -66,5 +66,5 @@
 #define __DLISTED_MGD_JUSTIMPL(STRUCT)
 
-forall( dtype tE ) {
+forall( tE & ) {
 	struct $mgd_link {
 		tE *elem;
@@ -83,5 +83,5 @@
 		(this.is_terminator){ 1 };
 	}
-	forall ( otype tInit | { void ?{}( $mgd_link(tE) &, tInit); } )
+	forall ( tInit | { void ?{}( $mgd_link(tE) &, tInit); } )
 	static inline void ?=?( $mgd_link(tE) &this, tInit i ) {
 		^?{}( this );
@@ -115,5 +115,5 @@
   __DLISTED_MGD_COMMON(STRUCT, STRUCT, $links)
 
-trait $dlistable(dtype Tnode, dtype Telem) {
+trait $dlistable(Tnode &, Telem &) {
 	$mgd_link(Telem) & $prev_link(Tnode &);
 	$mgd_link(Telem) & $next_link(Tnode &);
@@ -125,5 +125,5 @@
 };
 
-forall (dtype Tnode, dtype Telem | $dlistable(Tnode, Telem)) {
+forall (Tnode &, Telem & | $dlistable(Tnode, Telem)) {
 
 	// implemented as a sentinel item in an underlying cicrular list
Index: libcfa/src/containers/maybe.cfa
===================================================================
--- libcfa/src/containers/maybe.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/containers/maybe.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -18,10 +18,10 @@
 
 
-forall(otype T)
+forall(T)
 void ?{}(maybe(T) & this) {
 	this.has_value = false;
 }
 
-forall(otype T)
+forall(T)
 void ?{}(maybe(T) & this, T value) {
 	this.has_value = true;
@@ -29,5 +29,5 @@
 }
 
-forall(otype T)
+forall(T)
 void ?{}(maybe(T) & this, maybe(T) other) {
 	this.has_value = other.has_value;
@@ -37,5 +37,5 @@
 }
 
-forall(otype T)
+forall(T)
 maybe(T) ?=?(maybe(T) & this, maybe(T) that) {
 	if (this.has_value && that.has_value) {
@@ -51,5 +51,5 @@
 }
 
-forall(otype T)
+forall(T)
 void ^?{}(maybe(T) & this) {
 	if (this.has_value) {
@@ -58,25 +58,25 @@
 }
 
-forall(otype T)
+forall(T)
 bool ?!=?(maybe(T) this, zero_t) {
 	return this.has_value;
 }
 
-forall(otype T)
+forall(T)
 maybe(T) maybe_value(T value) {
 	return (maybe(T)){value};
 }
 
-forall(otype T)
+forall(T)
 maybe(T) maybe_none() {
 	return (maybe(T)){};
 }
 
-forall(otype T)
+forall(T)
 bool has_value(maybe(T) * this) {
 	return this->has_value;
 }
 
-forall(otype T)
+forall(T)
 T get(maybe(T) * this) {
 	assertf(this->has_value, "attempt to get from maybe without value");
@@ -84,5 +84,5 @@
 }
 
-forall(otype T)
+forall(T)
 void set(maybe(T) * this, T value) {
 	if (this->has_value) {
@@ -94,5 +94,5 @@
 }
 
-forall(otype T)
+forall(T)
 void set_none(maybe(T) * this) {
 	if (this->has_value) {
Index: libcfa/src/containers/maybe.hfa
===================================================================
--- libcfa/src/containers/maybe.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/containers/maybe.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -19,5 +19,5 @@
 
 // DO NOT USE DIRECTLY!
-forall(otype T)
+forall(T)
 struct maybe {
     bool has_value;
@@ -26,40 +26,40 @@
 
 
-forall(otype T)
+forall(T)
 void ?{}(maybe(T) & this);
 
-forall(otype T)
+forall(T)
 void ?{}(maybe(T) & this, T value);
 
-forall(otype T)
+forall(T)
 void ?{}(maybe(T) & this, maybe(T) other);
 
-forall(otype T)
+forall(T)
 void ^?{}(maybe(T) & this);
 
-forall(otype T)
+forall(T)
 maybe(T) ?=?(maybe(T) & this, maybe(T) other);
 
-forall(otype T)
+forall(T)
 bool ?!=?(maybe(T) this, zero_t);
 
 /* Waiting for bug#11 to be fixed.
-forall(otype T)
+forall(T)
 maybe(T) maybe_value(T value);
 
-forall(otype T)
+forall(T)
 maybe(T) maybe_none();
 */
 
-forall(otype T)
+forall(T)
 bool has_value(maybe(T) * this);
 
-forall(otype T)
+forall(T)
 T get(maybe(T) * this);
 
-forall(otype T)
+forall(T)
 void set(maybe(T) * this, T value);
 
-forall(otype T)
+forall(T)
 void set_none(maybe(T) * this);
 
Index: libcfa/src/containers/pair.cfa
===================================================================
--- libcfa/src/containers/pair.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/containers/pair.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -13,5 +13,5 @@
 #include <containers/pair.hfa>
 
-forall(otype R, otype S
+forall(R, S
 	| { int ?==?(R, R); int ?<?(R, R); int ?<?(S, S); })
 int ?<?(pair(R, S) p, pair(R, S) q) {
@@ -19,5 +19,5 @@
 }
 
-forall(otype R, otype S
+forall(R, S
 	| { int ?==?(R, R); int ?<?(R, R); int ?<=?(S, S); })
 int ?<=?(pair(R, S) p, pair(R, S) q) {
@@ -25,15 +25,15 @@
 }
 
-forall(otype R, otype S | { int ?==?(R, R); int ?==?(S, S); })
+forall(R, S | { int ?==?(R, R); int ?==?(S, S); })
 int ?==?(pair(R, S) p, pair(R, S) q) {
 	return p.first == q.first && p.second == q.second;
 }
 
-forall(otype R, otype S | { int ?!=?(R, R); int ?!=?(S, S); })
+forall(R, S | { int ?!=?(R, R); int ?!=?(S, S); })
 int ?!=?(pair(R, S) p, pair(R, S) q) {
 	return p.first != q.first || p.second != q.second;
 }
 
-forall(otype R, otype S
+forall(R, S
 	| { int ?==?(R, R); int ?>?(R, R); int ?>?(S, S); })
 int ?>?(pair(R, S) p, pair(R, S) q) {
@@ -41,5 +41,5 @@
 }
 
-forall(otype R, otype S
+forall(R, S
 	| { int ?==?(R, R); int ?>?(R, R); int ?>=?(S, S); })
 int ?>=?(pair(R, S) p, pair(R, S) q) {
Index: libcfa/src/containers/pair.hfa
===================================================================
--- libcfa/src/containers/pair.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/containers/pair.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -16,28 +16,28 @@
 #pragma once
 
-forall(otype R, otype S) struct pair {
+forall(R, S) struct pair {
 	R first;
 	S second;
 };
 
-forall(otype R, otype S 
+forall(R, S 
 	| { int ?==?(R, R); int ?<?(R, R); int ?<?(S, S); })
 int ?<?(pair(R, S) p, pair(R, S) q);
 
-forall(otype R, otype S 
+forall(R, S 
 	| { int ?==?(R, R); int ?<?(R, R); int ?<=?(S, S); })
 int ?<=?(pair(R, S) p, pair(R, S) q);
 
-forall(otype R, otype S | { int ?==?(R, R); int ?==?(S, S); })
+forall(R, S | { int ?==?(R, R); int ?==?(S, S); })
 int ?==?(pair(R, S) p, pair(R, S) q);
 
-forall(otype R, otype S | { int ?!=?(R, R); int ?!=?(S, S); })
+forall(R, S | { int ?!=?(R, R); int ?!=?(S, S); })
 int ?!=?(pair(R, S) p, pair(R, S) q);
 
-forall(otype R, otype S 
+forall(R, S 
 	| { int ?==?(R, R); int ?>?(R, R); int ?>?(S, S); })
 int ?>?(pair(R, S) p, pair(R, S) q);
 
-forall(otype R, otype S 
+forall(R, S 
 	| { int ?==?(R, R); int ?>?(R, R); int ?>=?(S, S); })
 int ?>=?(pair(R, S) p, pair(R, S) q);
Index: libcfa/src/containers/result.cfa
===================================================================
--- libcfa/src/containers/result.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/containers/result.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -18,5 +18,5 @@
 
 
-forall(otype T, otype E)
+forall(T, E)
 void ?{}(result(T, E) & this) {
 	this.has_value = false;
@@ -24,5 +24,5 @@
 }
 
-forall(otype T, otype E)
+forall(T, E)
 void ?{}(result(T, E) & this, one_t, T value) {
 	this.has_value = true;
@@ -30,5 +30,5 @@
 }
 
-forall(otype T, otype E)
+forall(T, E)
 void ?{}(result(T, E) & this, zero_t, E error) {
 	this.has_value = false;
@@ -36,5 +36,5 @@
 }
 
-forall(otype T, otype E)
+forall(T, E)
 void ?{}(result(T, E) & this, result(T, E) other) {
 	this.has_value = other.has_value;
@@ -46,5 +46,5 @@
 }
 
-forall(otype T, otype E)
+forall(T, E)
 result(T, E) ?=?(result(T, E) & this, result(T, E) that) {
 	if (this.has_value && that.has_value) {
@@ -63,5 +63,5 @@
 }
 
-forall(otype T, otype E)
+forall(T, E)
 void ^?{}(result(T, E) & this) {
 	if (this.has_value) {
@@ -72,25 +72,25 @@
 }
 
-forall(otype T, otype E)
+forall(T, E)
 bool ?!=?(result(T, E) this, zero_t) {
 	return this.has_value;
 }
 
-forall(otype T, otype E)
+forall(T, E)
 result(T, E) result_value(T value) {
 	return (result(T, E)){1, value};
 }
 
-forall(otype T, otype E)
+forall(T, E)
 result(T, E) result_error(E error) {
 	return (result(T, E)){0, error};
 }
 
-forall(otype T, otype E)
+forall(T, E)
 bool has_value(result(T, E) * this) {
 	return this->has_value;
 }
 
-forall(otype T, otype E)
+forall(T, E)
 T get(result(T, E) * this) {
 	assertf(this->has_value, "attempt to get from result without value");
@@ -98,5 +98,5 @@
 }
 
-forall(otype T, otype E)
+forall(T, E)
 E get_error(result(T, E) * this) {
 	assertf(!this->has_value, "attempt to get from result without error");
@@ -104,5 +104,5 @@
 }
 
-forall(otype T, otype E)
+forall(T, E)
 void set(result(T, E) * this, T value) {
 	if (this->has_value) {
@@ -115,5 +115,5 @@
 }
 
-forall(otype T, otype E)
+forall(T, E)
 void set_error(result(T, E) * this, E error) {
 	if (this->has_value) {
Index: libcfa/src/containers/result.hfa
===================================================================
--- libcfa/src/containers/result.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/containers/result.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -19,5 +19,5 @@
 
 // DO NOT USE DIRECTLY!
-forall(otype T, otype E)
+forall(T, E)
 union inner_result{
 	T value;
@@ -25,5 +25,5 @@
 };
 
-forall(otype T, otype E)
+forall(T, E)
 struct result {
 	bool has_value;
@@ -32,46 +32,46 @@
 
 
-forall(otype T, otype E)
+forall(T, E)
 void ?{}(result(T, E) & this);
 
-forall(otype T, otype E)
+forall(T, E)
 void ?{}(result(T, E) & this, one_t, T value);
 
-forall(otype T, otype E)
+forall(T, E)
 void ?{}(result(T, E) & this, zero_t, E error);
 
-forall(otype T, otype E)
+forall(T, E)
 void ?{}(result(T, E) & this, result(T, E) other);
 
-forall(otype T, otype E)
+forall(T, E)
 void ^?{}(result(T, E) & this);
 
-forall(otype T, otype E)
+forall(T, E)
 result(T, E) ?=?(result(T, E) & this, result(T, E) other);
 
-forall(otype T, otype E)
+forall(T, E)
 bool ?!=?(result(T, E) this, zero_t);
 
 /* Wating for bug#11 to be fixed.
-forall(otype T, otype E)
+forall(T, E)
 result(T, E) result_value(T value);
 
-forall(otype T, otype E)
+forall(T, E)
 result(T, E) result_error(E error);
 */
 
-forall(otype T, otype E)
+forall(T, E)
 bool has_value(result(T, E) * this);
 
-forall(otype T, otype E)
+forall(T, E)
 T get(result(T, E) * this);
 
-forall(otype T, otype E)
+forall(T, E)
 E get_error(result(T, E) * this);
 
-forall(otype T, otype E)
+forall(T, E)
 void set(result(T, E) * this, T value);
 
-forall(otype T, otype E)
+forall(T, E)
 void set_error(result(T, E) * this, E error);
 
Index: libcfa/src/containers/stackLockFree.hfa
===================================================================
--- libcfa/src/containers/stackLockFree.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/containers/stackLockFree.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -9,6 +9,6 @@
 // Created On       : Wed May 13 20:58:58 2020
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Sun Jun 14 13:25:09 2020
-// Update Count     : 64
+// Last Modified On : Wed Jan 20 20:40:03 2021
+// Update Count     : 67
 //
 
@@ -17,5 +17,5 @@
 #include <stdint.h>
 
-forall( dtype T )
+forall( T & )
 union Link {
 	struct {											// 32/64-bit x 2
@@ -31,5 +31,5 @@
 }; // Link
 
-forall( otype T | sized(T) | { Link(T) * ?`next( T * ); } ) {
+forall( T | sized(T) | { Link(T) * ?`next( T * ); } ) {
 	struct StackLF {
 		Link(T) stack;
@@ -42,5 +42,5 @@
 
 		void push( StackLF(T) & this, T & n ) with(this) {
-			*( &n )`next = stack;					// atomic assignment unnecessary, or use CAA
+			*( &n )`next = stack;						// atomic assignment unnecessary, or use CAA
 			for () {									// busy wait
 			  if ( __atomic_compare_exchange_n( &stack.atom, &( &n )`next->atom, (Link(T))@{ {&n, ( &n )`next->count + 1} }.atom, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST ) ) break; // attempt to update top node
@@ -65,5 +65,5 @@
 				}
 				if( next == 0p ) return false;
-				link = (next)`next;
+				link = ( next )`next;
 			}
 		}
Index: libcfa/src/containers/vector.cfa
===================================================================
--- libcfa/src/containers/vector.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/containers/vector.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -18,10 +18,10 @@
 #include <stdlib.hfa>
 
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void copy_internal(vector(T, allocator_t)* this, vector(T, allocator_t)* other);
 
 //------------------------------------------------------------------------------
 //Initialization
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void ?{}(vector(T, allocator_t)& this)
 {
@@ -30,5 +30,5 @@
 }
 
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void ?{}(vector(T, allocator_t)& this, vector(T, allocator_t) rhs)
 {
@@ -37,5 +37,5 @@
 }
 
-// forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+// forall(T, allocator_t | allocator_c(T, allocator_t))
 // vector(T, allocator_t) ?=?(vector(T, allocator_t)* this, vector(T, allocator_t) rhs)
 // {
@@ -45,5 +45,5 @@
 // }
 
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void ^?{}(vector(T, allocator_t)& this)
 {
@@ -54,5 +54,5 @@
 //------------------------------------------------------------------------------
 //Modifiers
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void push_back(vector(T, allocator_t)* this, T value)
 {
@@ -62,5 +62,5 @@
 }
 
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void pop_back(vector(T, allocator_t)* this)
 {
@@ -69,5 +69,5 @@
 }
 
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void clear(vector(T, allocator_t)* this)
 {
@@ -82,5 +82,5 @@
 //Internal Helpers
 
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void copy_internal(vector(T, allocator_t)* this, vector(T, allocator_t)* other)
 {
@@ -93,5 +93,5 @@
 //------------------------------------------------------------------------------
 //Allocator
-forall(otype T)
+forall(T)
 void ?{}(heap_allocator(T)& this)
 {
@@ -100,5 +100,5 @@
 }
 
-forall(otype T)
+forall(T)
 void ?{}(heap_allocator(T)& this, heap_allocator(T) rhs)
 {
@@ -107,5 +107,5 @@
 }
 
-forall(otype T)
+forall(T)
 heap_allocator(T) ?=?(heap_allocator(T)& this, heap_allocator(T) rhs)
 {
@@ -115,5 +115,5 @@
 }
 
-forall(otype T)
+forall(T)
 void ^?{}(heap_allocator(T)& this)
 {
@@ -121,5 +121,5 @@
 }
 
-forall(otype T)
+forall(T)
 inline void realloc_storage(heap_allocator(T)* this, size_t size)
 {
Index: libcfa/src/containers/vector.hfa
===================================================================
--- libcfa/src/containers/vector.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/containers/vector.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -20,5 +20,5 @@
 //------------------------------------------------------------------------------
 //Allocator
-forall(otype T)
+forall(T)
 struct heap_allocator
 {
@@ -27,20 +27,20 @@
 };
 
-forall(otype T)
+forall(T)
 void ?{}(heap_allocator(T)& this);
 
-forall(otype T)
+forall(T)
 void ?{}(heap_allocator(T)& this, heap_allocator(T) rhs);
 
-forall(otype T)
+forall(T)
 heap_allocator(T) ?=?(heap_allocator(T)& this, heap_allocator(T) rhs);
 
-forall(otype T)
+forall(T)
 void ^?{}(heap_allocator(T)& this);
 
-forall(otype T)
+forall(T)
 void realloc_storage(heap_allocator(T)* this, size_t size);
 
-forall(otype T)
+forall(T)
 static inline T* data(heap_allocator(T)* this)
 {
@@ -50,5 +50,5 @@
 //------------------------------------------------------------------------------
 //Declaration
-trait allocator_c(otype T, otype allocator_t)
+trait allocator_c(T, allocator_t)
 {
 	void realloc_storage(allocator_t*, size_t);
@@ -56,22 +56,22 @@
 };
 
-forall(otype T, otype allocator_t = heap_allocator(T) | allocator_c(T, allocator_t))
+forall(T, allocator_t = heap_allocator(T) | allocator_c(T, allocator_t))
 struct vector;
 
 //------------------------------------------------------------------------------
 //Initialization
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void ?{}(vector(T, allocator_t)& this);
 
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void ?{}(vector(T, allocator_t)& this, vector(T, allocator_t) rhs);
 
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 vector(T, allocator_t) ?=?(vector(T, allocator_t)& this, vector(T, allocator_t) rhs);
 
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void ^?{}(vector(T, allocator_t)& this);
 
-forall(otype T, otype allocator_t = heap_allocator(T) | allocator_c(T, allocator_t))
+forall(T, allocator_t = heap_allocator(T) | allocator_c(T, allocator_t))
 struct vector
 {
@@ -82,5 +82,5 @@
 //------------------------------------------------------------------------------
 //Capacity
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline bool empty(vector(T, allocator_t)* this)
 {
@@ -88,5 +88,5 @@
 }
 
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline size_t size(vector(T, allocator_t)* this)
 {
@@ -94,5 +94,5 @@
 }
 
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline void reserve(vector(T, allocator_t)* this, size_t size)
 {
@@ -102,5 +102,5 @@
 //------------------------------------------------------------------------------
 //Element access
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline T at(vector(T, allocator_t)* this, size_t index)
 {
@@ -108,5 +108,5 @@
 }
 
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline T ?[?](vector(T, allocator_t)* this, size_t index)
 {
@@ -114,5 +114,5 @@
 }
 
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline T front(vector(T, allocator_t)* this)
 {
@@ -120,5 +120,5 @@
 }
 
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline T back(vector(T, allocator_t)* this)
 {
@@ -128,16 +128,16 @@
 //------------------------------------------------------------------------------
 //Modifiers
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void push_back(vector(T, allocator_t)* this, T value);
 
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void pop_back(vector(T, allocator_t)* this);
 
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void clear(vector(T, allocator_t)* this);
 
 //------------------------------------------------------------------------------
 //Iterators
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline T* begin(vector(T, allocator_t)* this)
 {
@@ -145,5 +145,5 @@
 }
 
-// forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+// forall(T, allocator_t | allocator_c(T, allocator_t))
 // static inline const T* cbegin(const vector(T, allocator_t)* this)
 // {
@@ -151,5 +151,5 @@
 // }
 
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 static inline T* end(vector(T, allocator_t)* this)
 {
@@ -157,5 +157,5 @@
 }
 
-// forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+// forall(T, allocator_t | allocator_c(T, allocator_t))
 // static inline const T* cend(const vector(T, allocator_t)* this)
 // {
Index: libcfa/src/exception.h
===================================================================
--- libcfa/src/exception.h	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/exception.h	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -101,5 +101,5 @@
 // implemented in the .c file either so they all have to be inline.
 
-trait is_exception(dtype exceptT, dtype virtualT) {
+trait is_exception(exceptT &, virtualT &) {
 	/* The first field must be a pointer to a virtual table.
 	 * That virtual table must be a decendent of the base exception virtual table.
@@ -109,13 +109,13 @@
 };
 
-trait is_termination_exception(dtype exceptT, dtype virtualT | is_exception(exceptT, virtualT)) {
+trait is_termination_exception(exceptT &, virtualT & | is_exception(exceptT, virtualT)) {
 	void defaultTerminationHandler(exceptT &);
 };
 
-trait is_resumption_exception(dtype exceptT, dtype virtualT | is_exception(exceptT, virtualT)) {
+trait is_resumption_exception(exceptT &, virtualT & | is_exception(exceptT, virtualT)) {
 	void defaultResumptionHandler(exceptT &);
 };
 
-forall(dtype exceptT, dtype virtualT | is_termination_exception(exceptT, virtualT))
+forall(exceptT &, virtualT & | is_termination_exception(exceptT, virtualT))
 static inline void $throw(exceptT & except) {
 	__cfaehm_throw_terminate(
@@ -125,5 +125,5 @@
 }
 
-forall(dtype exceptT, dtype virtualT | is_resumption_exception(exceptT, virtualT))
+forall(exceptT &, virtualT & | is_resumption_exception(exceptT, virtualT))
 static inline void $throwResume(exceptT & except) {
 	__cfaehm_throw_resume(
@@ -133,15 +133,15 @@
 }
 
-forall(dtype exceptT, dtype virtualT | is_exception(exceptT, virtualT))
+forall(exceptT &, virtualT & | is_exception(exceptT, virtualT))
 static inline void cancel_stack(exceptT & except) __attribute__((noreturn)) {
 	__cfaehm_cancel_stack( (exception_t *)&except );
 }
 
-forall(dtype exceptT, dtype virtualT | is_exception(exceptT, virtualT))
+forall(exceptT &, virtualT & | is_exception(exceptT, virtualT))
 static inline void defaultTerminationHandler(exceptT & except) {
 	return cancel_stack( except );
 }
 
-forall(dtype exceptT, dtype virtualT | is_exception(exceptT, virtualT))
+forall(exceptT &, virtualT & | is_exception(exceptT, virtualT))
 static inline void defaultResumptionHandler(exceptT & except) {
 	throw except;
Index: libcfa/src/executor.cfa
===================================================================
--- libcfa/src/executor.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/executor.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -7,5 +7,5 @@
 #include <containers/list.hfa>
 
-forall( dtype T | $dlistable(T, T) ) {
+forall( T & | $dlistable(T, T) ) {
 	monitor Buffer {									// unbounded buffer
 		dlist( T, T ) queue;							// unbounded list of work requests
Index: libcfa/src/gmp.hfa
===================================================================
--- libcfa/src/gmp.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/gmp.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -255,5 +255,5 @@
 
 	// I/O
-	forall( dtype istype | istream( istype ) )
+	forall( istype & | istream( istype ) )
 		istype & ?|?( istype & is, Int & mp ) {
 		gmp_scanf( "%Zd", &mp );
@@ -261,5 +261,5 @@
 	} // ?|?
 
-	forall( dtype ostype | ostream( ostype ) ) {
+	forall( ostype & | ostream( ostype ) ) {
 		ostype & ?|?( ostype & os, Int mp ) {
 			if ( $sepPrt( os ) ) fmt( os, "%s", $sepGetCur( os ) );
Index: libcfa/src/interpose.cfa
===================================================================
--- libcfa/src/interpose.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/interpose.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -125,12 +125,17 @@
 
 		// Failure handler
-		__cfaabi_sigaction( SIGSEGV, sigHandler_segv, SA_SIGINFO | SA_ONSTACK );
-		__cfaabi_sigaction( SIGBUS , sigHandler_segv, SA_SIGINFO | SA_ONSTACK );
-		__cfaabi_sigaction( SIGILL , sigHandler_ill , SA_SIGINFO | SA_ONSTACK );
-		__cfaabi_sigaction( SIGFPE , sigHandler_fpe , SA_SIGINFO | SA_ONSTACK );
-		__cfaabi_sigaction( SIGTERM, sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND ); // one shot handler, return to default
-		__cfaabi_sigaction( SIGINT , sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND );
-		__cfaabi_sigaction( SIGABRT, sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND );
-		__cfaabi_sigaction( SIGHUP , sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND ); // terminal hangup
+		 // internal errors
+		__cfaabi_sigaction( SIGSEGV, sigHandler_segv, SA_SIGINFO | SA_ONSTACK ); // Invalid memory reference (default: Core)
+		__cfaabi_sigaction( SIGBUS , sigHandler_segv, SA_SIGINFO | SA_ONSTACK ); // Bus error, bad memory access (default: Core)
+		__cfaabi_sigaction( SIGILL , sigHandler_ill , SA_SIGINFO | SA_ONSTACK ); // Illegal Instruction (default: Core)
+		__cfaabi_sigaction( SIGFPE , sigHandler_fpe , SA_SIGINFO | SA_ONSTACK ); // Floating-point exception (default: Core)
+
+		// handlers for signals raised from outside the program
+		// one-shot (SA_RESETHAND): revert to the default action in case the sender insists and sends it over and over
+		__cfaabi_sigaction( SIGTERM, sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND ); // Termination signal (default: Term)
+		__cfaabi_sigaction( SIGINT , sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND ); // Interrupt from keyboard (default: Term)
+		__cfaabi_sigaction( SIGHUP , sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND ); // Hangup detected on controlling terminal or death of controlling process (default: Term)
+		__cfaabi_sigaction( SIGQUIT, sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND ); // Quit from keyboard (default: Core)
+		__cfaabi_sigaction( SIGABRT, sigHandler_term, SA_SIGINFO | SA_ONSTACK | SA_RESETHAND ); // Abort signal from abort(3) (default: Core)
 	}
 }
@@ -163,8 +168,8 @@
 }
 
-void * kernel_abort( void ) __attribute__(( __nothrow__, __leaf__, __weak__ )) { return 0p; }
-void kernel_abort_msg( void * data, char buffer[], int size ) __attribute__(( __nothrow__, __leaf__, __weak__ )) {}
-// See concurrency/kernel.cfa for strong definition used in multi-processor mode.
-int kernel_abort_lastframe( void ) __attribute__(( __nothrow__, __leaf__, __weak__ )) { return 4; }
+// See concurrency/kernel.cfa and concurrency/preemption.cfa for strong definitions used in multi-processor mode.
+void __kernel_abort_lock( void ) __attribute__(( __nothrow__, __leaf__, __weak__ )) {} // weak default: does nothing
+void __kernel_abort_msg( char buffer[], int size ) __attribute__(( __nothrow__, __leaf__, __weak__ )) {} // weak default: adds nothing to the abort message
+int __kernel_abort_lastframe( void ) __attribute__(( __nothrow__, __leaf__, __weak__ )) { return 4; } // weak default: backtrace skips the last 4 stack frames
 
 enum { abort_text_size = 1024 };
@@ -173,5 +178,5 @@
 static void __cfaabi_backtrace( int start ) {
 	enum { Frames = 50, };								// maximum number of stack frames
-	int last = kernel_abort_lastframe();				// skip last N stack frames
+	int last = __kernel_abort_lastframe();				// skip last N stack frames
 
 	void * array[Frames];
@@ -220,50 +225,44 @@
 }
 
-static volatile int __abort_stage = 0;
+static volatile bool __abort_first = 0;
 
 // Cannot forward va_list.
 void __abort( bool signalAbort, const char fmt[], va_list args ) {
-	int stage = __atomic_add_fetch( &__abort_stage, 1, __ATOMIC_SEQ_CST );
-
-	// First stage: stop the cforall kernel and print
-	if(stage == 1) {
-		// increment stage
-		stage = __atomic_add_fetch( &__abort_stage, 1, __ATOMIC_SEQ_CST );
-
-		// must be done here to lock down kernel
-		void * kernel_data = kernel_abort();
-		int len;
-
-		signal( SIGABRT, SIG_DFL );							// prevent final "real" abort from recursing to handler
-
-		len = snprintf( abort_text, abort_text_size, "Cforall Runtime error (UNIX pid:%ld) ", (long int)getpid() ); // use UNIX pid (versus getPid)
-		__cfaabi_bits_write( STDERR_FILENO, abort_text, len );
-
-		assert( fmt );
-		len = vsnprintf( abort_text, abort_text_size, fmt, args );
-		__cfaabi_bits_write( STDERR_FILENO, abort_text, len );
-
-		// add optional newline if missing at the end of the format text
-		if ( fmt[strlen( fmt ) - 1] != '\n' ) {
-			__cfaabi_bits_write( STDERR_FILENO, "\n", 1 );
-		} // if
-		kernel_abort_msg( kernel_data, abort_text, abort_text_size );
-	}
-
-	// Second stage: print the backtrace
-	if(stage == 2) {
-		// increment stage
-		stage = __atomic_add_fetch( &__abort_stage, 1, __ATOMIC_SEQ_CST );
-
-		// print stack trace in handler
-		__cfaabi_backtrace( signalAbort ? 4 : 2 );
-	}
-
-	do {
-		// Finally call abort
+	// Print the runtime-error report (header, formatted cause, backtrace) and abort; multiple threads can arrive here via multiple paths.
+	// To make sure this is safe, only the first caller prints: any concurrent/subsequent call to abort is redirected to libc-abort.
+	bool first = ! __atomic_test_and_set( &__abort_first, __ATOMIC_SEQ_CST);
+
+	// Prevent preemption from kicking-in and messing with the abort
+	__kernel_abort_lock();
+
+	// first to abort ?
+	if ( !first ) {
+		// We aren't the first to abort, just let C handle it
+		signal( SIGABRT, SIG_DFL );	// restore default in case we came here through the function.
 		__cabi_libc.abort();
-
-		// Loop so that we never return
-	} while(true);
+	}
+
+	int len = snprintf( abort_text, abort_text_size, "Cforall Runtime error (UNIX pid:%ld) ", (long int)getpid() ); // use UNIX pid (versus getPid)
+	__cfaabi_bits_write( STDERR_FILENO, abort_text, len );
+
+	// print the cause of the error; vsnprintf returns the untruncated length, so clamp the write to what is actually in the buffer
+	assert( fmt );
+	len = vsnprintf( abort_text, abort_text_size, fmt, args );
+	__cfaabi_bits_write( STDERR_FILENO, abort_text, len < abort_text_size ? len : abort_text_size - 1 );
+
+	// add optional newline if missing at the end of the format text (an empty format has no last character to inspect)
+	if ( fmt[0] == '\0' || fmt[strlen( fmt ) - 1] != '\n' ) {
+		__cfaabi_bits_write( STDERR_FILENO, "\n", 1 );
+	} // if
+
+	// Give the kernel the chance to add some data in here
+	__kernel_abort_msg( abort_text, abort_text_size );
+
+	// print stack trace in handler
+	__cfaabi_backtrace( signalAbort ? 4 : 2 );
+
+	// Finally call abort
+	__cabi_libc.abort();
+
 }
 
@@ -282,10 +281,4 @@
     // CONTROL NEVER REACHES HERE!
     va_end( args );
-}
-
-extern "C" {
-	void __cfaabi_real_abort(void) {
-		__cabi_libc.abort();
-	}
 }
 
Index: libcfa/src/iostream.cfa
===================================================================
--- libcfa/src/iostream.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/iostream.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -36,5 +36,5 @@
 
 
-forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
 	ostype & ?|?( ostype & os, bool b ) {
 		if ( $sepPrt( os ) ) fmt( os, "%s", $sepGetCur( os ) );
@@ -402,5 +402,5 @@
 
 // tuples
-forall( dtype ostype, otype T, ttype Params | writeable( T, ostype ) | { ostype & ?|?( ostype &, Params ); } ) {
+forall( ostype &, T, Params... | writeable( T, ostype ) | { ostype & ?|?( ostype &, Params ); } ) {
 	ostype & ?|?( ostype & os, T arg, Params rest ) {
 		(ostype &)(os | arg);							// print first argument
@@ -421,5 +421,5 @@
 
 // writes the range [begin, end) to the given stream
-forall( dtype ostype, otype elt_type | writeable( elt_type, ostype ), otype iterator_type | iterator( iterator_type, elt_type ) ) {
+forall( ostype &, elt_type | writeable( elt_type, ostype ), iterator_type | iterator( iterator_type, elt_type ) ) {
 	void write( iterator_type begin, iterator_type end, ostype & os ) {
 		void print( elt_type i ) { os | i; }
@@ -442,5 +442,5 @@
 // Default prefix for non-decimal prints is 0b, 0, 0x.
 #define IntegralFMTImpl( T, IFMTNP, IFMTP ) \
-forall( dtype ostype | ostream( ostype ) ) { \
+forall( ostype & | ostream( ostype ) ) { \
 	ostype & ?|?( ostype & os, _Ostream_Manip(T) f ) { \
 		if ( $sepPrt( os ) ) fmt( os, "%s", $sepGetCur( os ) ); \
@@ -535,5 +535,5 @@
 // Default prefix for non-decimal prints is 0b, 0, 0x.
 #define IntegralFMTImpl128( T, SIGNED, CODE, IFMTNP, IFMTP ) \
-forall( dtype ostype | ostream( ostype ) ) \
+forall( ostype & | ostream( ostype ) ) \
 static void base10_128( ostype & os, _Ostream_Manip(T) f ) { \
 	if ( f.val > UINT64_MAX ) { \
@@ -552,5 +552,5 @@
 	} /* if */ \
 } /* base10_128 */ \
-forall( dtype ostype | ostream( ostype ) ) { \
+forall( ostype & | ostream( ostype ) ) { \
 	ostype & ?|?( ostype & os, _Ostream_Manip(T) f ) { \
 		if ( $sepPrt( os ) ) fmt( os, "%s", $sepGetCur( os ) ); \
@@ -654,5 +654,5 @@
 #if defined( __SIZEOF_INT128__ )
 // Default prefix for non-decimal prints is 0b, 0, 0x.
-forall( dtype ostype | ostream( ostype ) )
+forall( ostype & | ostream( ostype ) )
 static inline void base_128( ostype & os, unsigned int128 val, unsigned int128 power, _Ostream_Manip(uint64_t) & f, unsigned int maxdig, unsigned int bits, unsigned int cnt = 0 ) {
 	int wd = 1;											// f.wd is never 0 because 0 implies left-pad
@@ -719,5 +719,5 @@
 
 #define IntegralFMTImpl128( T ) \
-forall( dtype ostype | ostream( ostype ) ) { \
+forall( ostype & | ostream( ostype ) ) { \
 	ostype & ?|?( ostype & os, _Ostream_Manip(T) f ) { \
 		_Ostream_Manip(uint64_t) fmt; \
@@ -767,5 +767,5 @@
 
 #define FloatingPointFMTImpl( T, DFMTNP, DFMTP ) \
-forall( dtype ostype | ostream( ostype ) ) { \
+forall( ostype & | ostream( ostype ) ) { \
 	ostype & ?|?( ostype & os, _Ostream_Manip(T) f ) { \
 		if ( $sepPrt( os ) ) fmt( os, "%s", $sepGetCur( os ) ); \
@@ -801,5 +801,5 @@
 // *********************************** character ***********************************
 
-forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
 	ostype & ?|?( ostype & os, _Ostream_Manip(char) f ) {
 		if ( f.base != 'c' ) {							// bespoke binary/octal/hex format
@@ -834,5 +834,5 @@
 // *********************************** C string ***********************************
 
-forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
 	ostype & ?|?( ostype & os, _Ostream_Manip(const char *) f ) {
 		if ( ! f.val ) return os;						// null pointer ?
@@ -882,5 +882,5 @@
 
 
-forall( dtype istype | istream( istype ) ) {
+forall( istype & | istream( istype ) ) {
 	istype & ?|?( istype & is, bool & b ) {
 		char val[6];
@@ -1048,5 +1048,5 @@
 // *********************************** manipulators ***********************************
 
-forall( dtype istype | istream( istype ) )
+forall( istype & | istream( istype ) )
 istype & ?|?( istype & is, _Istream_Cstr f ) {
 	// skip xxx
@@ -1083,5 +1083,5 @@
 } // ?|?
 
-forall( dtype istype | istream( istype ) )
+forall( istype & | istream( istype ) )
 istype & ?|?( istype & is, _Istream_Char f ) {
 	fmt( is, "%*c" );									// argument variable unused
@@ -1090,5 +1090,5 @@
 
 #define InputFMTImpl( T, CODE ) \
-forall( dtype istype | istream( istype ) ) \
+forall( istype & | istream( istype ) ) \
 istype & ?|?( istype & is, _Istream_Manip(T) f ) { \
 	enum { size = 16 }; \
@@ -1119,5 +1119,5 @@
 InputFMTImpl( long double, "Lf" )
 
-forall( dtype istype | istream( istype ) )
+forall( istype & | istream( istype ) )
 istype & ?|?( istype & is, _Istream_Manip(float _Complex) fc ) {
 	float re, im;
@@ -1130,5 +1130,5 @@
 } // ?|?
 
-forall( dtype istype | istream( istype ) )
+forall( istype & | istream( istype ) )
 istype & ?|?( istype & is, _Istream_Manip(double _Complex) dc ) {
 	double re, im;
@@ -1141,5 +1141,5 @@
 } // ?|?
 
-forall( dtype istype | istream( istype ) )
+forall( istype & | istream( istype ) )
 istype & ?|?( istype & is, _Istream_Manip(long double _Complex) ldc ) {
 	long double re, im;
Index: libcfa/src/iostream.hfa
===================================================================
--- libcfa/src/iostream.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/iostream.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -22,5 +22,5 @@
 
 
-trait ostream( dtype ostype ) {
+trait ostream( ostype & ) {
 	// private
 	bool $sepPrt( ostype & );							// get separator state (on/off)
@@ -56,9 +56,9 @@
 }; // ostream
 
-// trait writeable( otype T ) {
-// 	forall( dtype ostype | ostream( ostype ) ) ostype & ?|?( ostype &, T );
+// trait writeable( T ) {
+// 	forall( ostype & | ostream( ostype ) ) ostype & ?|?( ostype &, T );
 // }; // writeable
 
-trait writeable( otype T, dtype ostype | ostream( ostype ) ) {
+trait writeable( T, ostype & | ostream( ostype ) ) {
 	ostype & ?|?( ostype &, T );
 }; // writeable
@@ -66,5 +66,5 @@
 // implement writable for intrinsic types
 
-forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
 	ostype & ?|?( ostype &, bool );
 	void ?|?( ostype &, bool );
@@ -140,5 +140,5 @@
 
 // tuples
-forall( dtype ostype, otype T, ttype Params | writeable( T, ostype ) | { ostype & ?|?( ostype &, Params ); } ) {
+forall( ostype &, T, Params... | writeable( T, ostype ) | { ostype & ?|?( ostype &, Params ); } ) {
 	ostype & ?|?( ostype & os, T arg, Params rest );
 	void ?|?( ostype & os, T arg, Params rest );
@@ -146,5 +146,5 @@
 
 // writes the range [begin, end) to the given stream
-forall( dtype ostype, otype elt_type | writeable( elt_type, ostype ), otype iterator_type | iterator( iterator_type, elt_type ) ) {
+forall( ostype &, elt_type | writeable( elt_type, ostype ), iterator_type | iterator( iterator_type, elt_type ) ) {
 	void write( iterator_type begin, iterator_type end, ostype & os );
 	void write_reverse( iterator_type begin, iterator_type end, ostype & os );
@@ -153,5 +153,5 @@
 // *********************************** manipulators ***********************************
 
-forall( otype T )
+forall( T )
 struct _Ostream_Manip {
 	T val;												// polymorphic base-type
@@ -193,5 +193,5 @@
 	_Ostream_Manip(T) & sign( _Ostream_Manip(T) & fmt ) { fmt.flags.sign = true; return fmt; } \
 } /* distribution */ \
-forall( dtype ostype | ostream( ostype ) ) { \
+forall( ostype & | ostream( ostype ) ) { \
 	ostype & ?|?( ostype & os, _Ostream_Manip(T) f ); \
 	void ?|?( ostype & os, _Ostream_Manip(T) f ); \
@@ -234,5 +234,5 @@
 	_Ostream_Manip(T) & nodp( _Ostream_Manip(T) & fmt ) { fmt.flags.nobsdp = true; return fmt; } \
 } /* distribution */ \
-forall( dtype ostype | ostream( ostype ) ) { \
+forall( ostype & | ostream( ostype ) ) { \
 	ostype & ?|?( ostype & os, _Ostream_Manip(T) f ); \
 	void ?|?( ostype & os, _Ostream_Manip(T) f ); \
@@ -254,5 +254,5 @@
 	_Ostream_Manip(char) & nobase( _Ostream_Manip(char) & fmt ) { fmt.flags.nobsdp = true; return fmt; }
 } // distribution
-forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
 	ostype & ?|?( ostype & os, _Ostream_Manip(char) f );
 	void ?|?( ostype & os, _Ostream_Manip(char) f );
@@ -272,5 +272,5 @@
 	_Ostream_Manip(const char *) & nobase( _Ostream_Manip(const char *) & fmt ) { fmt.flags.nobsdp = true; return fmt; }
 } // distribution
-forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
 	ostype & ?|?( ostype & os, _Ostream_Manip(const char *) f );
 	void ?|?( ostype & os, _Ostream_Manip(const char *) f );
@@ -281,5 +281,5 @@
 
 
-trait istream( dtype istype ) {
+trait istream( istype & ) {
 	void nlOn( istype & );								// read newline
 	void nlOff( istype & );								// scan newline
@@ -294,9 +294,9 @@
 }; // istream
 
-trait readable( otype T ) {
-	forall( dtype istype | istream( istype ) ) istype & ?|?( istype &, T );
+trait readable( T ) {
+	forall( istype & | istream( istype ) ) istype & ?|?( istype &, T );
 }; // readable
 
-forall( dtype istype | istream( istype ) ) {
+forall( istype & | istream( istype ) ) {
 	istype & ?|?( istype &, bool & );
 
@@ -363,5 +363,5 @@
 	_Istream_Cstr & wdi( unsigned int w, _Istream_Cstr & fmt ) { fmt.wd = w; return fmt; }
 } // distribution
-forall( dtype istype | istream( istype ) ) istype & ?|?( istype & is, _Istream_Cstr f );
+forall( istype & | istream( istype ) ) istype & ?|?( istype & is, _Istream_Cstr f );
 
 struct _Istream_Char {
@@ -373,7 +373,7 @@
 	_Istream_Char & ignore( _Istream_Char & fmt ) { fmt.ignore = true; return fmt; }
 } // distribution
-forall( dtype istype | istream( istype ) ) istype & ?|?( istype & is, _Istream_Char f );
-
-forall( dtype T | sized( T ) )
+forall( istype & | istream( istype ) ) istype & ?|?( istype & is, _Istream_Char f );
+
+forall( T & | sized( T ) )
 struct _Istream_Manip {
 	T & val;											// polymorphic base-type
@@ -389,5 +389,5 @@
 	_Istream_Manip(T) & wdi( unsigned int w, _Istream_Manip(T) & fmt ) { fmt.wd = w; return fmt; } \
 } /* distribution */ \
-forall( dtype istype | istream( istype ) ) { \
+forall( istype & | istream( istype ) ) { \
 	istype & ?|?( istype & is, _Istream_Manip(T) f ); \
 } // ?|?
@@ -418,5 +418,5 @@
 #include <time_t.hfa>									// Duration (constructors) / Time (constructors)
 
-forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
 	ostype & ?|?( ostype & os, Duration dur );
 	void ?|?( ostype & os, Duration dur );
Index: libcfa/src/iterator.cfa
===================================================================
--- libcfa/src/iterator.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/iterator.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -16,5 +16,5 @@
 #include "iterator.hfa"
 
-forall( otype iterator_type, otype elt_type | iterator( iterator_type, elt_type ) )
+forall( iterator_type, elt_type | iterator( iterator_type, elt_type ) )
 void for_each( iterator_type begin, iterator_type end, void (* func)( elt_type ) ) {
 	for ( iterator_type i = begin; i != end; ++i ) {
@@ -23,5 +23,5 @@
 } // for_each
 
-forall( otype iterator_type, otype elt_type | iterator( iterator_type, elt_type ) )
+forall( iterator_type, elt_type | iterator( iterator_type, elt_type ) )
 void for_each_reverse( iterator_type begin, iterator_type end, void (* func)( elt_type ) ) {
 	for ( iterator_type i = end; i != begin; ) {
Index: libcfa/src/iterator.hfa
===================================================================
--- libcfa/src/iterator.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/iterator.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -17,5 +17,5 @@
 
 // An iterator can be used to traverse a data structure.
-trait iterator( otype iterator_type, otype elt_type ) {
+trait iterator( iterator_type, elt_type ) {
 	// point to the next element
 //	iterator_type ?++( iterator_type & );
@@ -31,5 +31,5 @@
 };
 
-trait iterator_for( otype iterator_type, otype collection_type, otype elt_type | iterator( iterator_type, elt_type ) ) {
+trait iterator_for( iterator_type, collection_type, elt_type | iterator( iterator_type, elt_type ) ) {
 //	[ iterator_type begin, iterator_type end ] get_iterators( collection_type );
 	iterator_type begin( collection_type );
@@ -37,8 +37,8 @@
 };
 
-forall( otype iterator_type, otype elt_type | iterator( iterator_type, elt_type ) )
+forall( iterator_type, elt_type | iterator( iterator_type, elt_type ) )
 void for_each( iterator_type begin, iterator_type end, void (* func)( elt_type ) );
 
-forall( otype iterator_type, otype elt_type | iterator( iterator_type, elt_type ) )
+forall( iterator_type, elt_type | iterator( iterator_type, elt_type ) )
 void for_each_reverse( iterator_type begin, iterator_type end, void (* func)( elt_type ) );
 
Index: libcfa/src/math.hfa
===================================================================
--- libcfa/src/math.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/math.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -286,5 +286,5 @@
 	unsigned long long int floor( unsigned long long int n, unsigned long long int align ) { return n / align * align; }
 
-	// forall( otype T | { T ?/?( T, T ); T ?*?( T, T ); } )
+	// forall( T | { T ?/?( T, T ); T ?*?( T, T ); } )
 	// T floor( T n, T align ) { return n / align * align; }
 
@@ -300,5 +300,5 @@
 	unsigned long long int ceiling_div( unsigned long long int n, unsigned long long int align ) { return (n + (align - 1)) / align; }
 
-	// forall( otype T | { T ?+?( T, T ); T ?-?( T, T ); T ?%?( T, T ); } )
+	// forall( T | { T ?+?( T, T ); T ?-?( T, T ); T ?%?( T, T ); } )
 	// T ceiling_div( T n, T align ) { verify( is_pow2( align ) );return (n + (align - 1)) / align; }
 	
@@ -315,5 +315,5 @@
 	unsigned long long int ceiling( unsigned long long int n, unsigned long long int align ) { return floor( n + (n % align != 0 ? align - 1 : 0), align ); }
 
-	// forall( otype T | { void ?{}( T &, one_t ); T ?+?( T, T ); T ?-?( T, T ); T ?/?( T, T ); } )
+	// forall( T | { void ?{}( T &, one_t ); T ?+?( T, T ); T ?-?( T, T ); T ?/?( T, T ); } )
 	// T ceiling( T n, T align ) { return return floor( n + (n % align != 0 ? align - 1 : 0), align ); *}
 
@@ -414,11 +414,11 @@
 
 static inline {
-	forall( otype T | { void ?{}( T &, one_t ); T ?+?( T, T ); T ?-?( T, T );T ?*?( T, T ); } )
+	forall( T | { void ?{}( T &, one_t ); T ?+?( T, T ); T ?-?( T, T );T ?*?( T, T ); } )
 	T lerp( T x, T y, T a ) { return x * ((T){1} - a) + y * a; }
 
-	forall( otype T | { void ?{}( T &, zero_t ); void ?{}( T &, one_t ); int ?<?( T, T ); } )
+	forall( T | { void ?{}( T &, zero_t ); void ?{}( T &, one_t ); int ?<?( T, T ); } )
 	T step( T edge, T x ) { return x < edge ? (T){0} : (T){1}; }
 
-	forall( otype T | { void ?{}( T &, int ); T clamp( T, T, T ); T ?-?( T, T ); T ?*?( T, T ); T ?/?( T, T ); } )
+	forall( T | { void ?{}( T &, int ); T clamp( T, T, T ); T ?-?( T, T ); T ?*?( T, T ); T ?/?( T, T ); } )
 	T smoothstep( T edge0, T edge1, T x ) { T t = clamp( (x - edge0) / (edge1 - edge0), (T){0}, (T){1} ); return t * t * ((T){3} - (T){2} * t); }
 } // distribution
Index: libcfa/src/memory.cfa
===================================================================
--- libcfa/src/memory.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/memory.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -18,5 +18,5 @@
 
 // Internal data object.
-forall(dtype T | sized(T), ttype Args | { void ?{}(T &, Args); })
+forall(T & | sized(T), Args... | { void ?{}(T &, Args); })
 void ?{}(counter_data(T) & this, Args args) {
 	(this.counter){1};
@@ -24,5 +24,5 @@
 }
 
-forall(dtype T | sized(T) | { void ^?{}(T &); })
+forall(T & | sized(T) | { void ^?{}(T &); })
 void ^?{}(counter_data(T) & this) {
 	assert(0 == this.counter);
@@ -31,15 +31,15 @@
 
 // This is one of many pointers keeping this alive.
-forall(dtype T | sized(T))
+forall(T & | sized(T))
 void ?{}(counter_ptr(T) & this) {
 	this.data = 0p;
 }
 
-forall(dtype T | sized(T))
+forall(T & | sized(T))
 void ?{}(counter_ptr(T) & this, zero_t) {
 	this.data = 0p;
 }
 
-forall(dtype T | sized(T) | { void ^?{}(T &); })
+forall(T & | sized(T) | { void ^?{}(T &); })
 static void internal_decrement(counter_ptr(T) & this) {
 	if (this.data && 0 == --this.data->counter) {
@@ -48,5 +48,5 @@
 }
 
-forall(dtype T | sized(T))
+forall(T & | sized(T))
 static void internal_copy(counter_ptr(T) & this, counter_ptr(T) & that) {
 	this.data = that.data;
@@ -56,5 +56,5 @@
 }
 
-forall(dtype T | sized(T) | { void ^?{}(T &); })
+forall(T & | sized(T) | { void ^?{}(T &); })
 void ?{}(counter_ptr(T) & this, counter_ptr(T) that) {
 	// `that` is a copy but it should have neither a constructor
@@ -64,20 +64,20 @@
 }
 
-forall(dtype T | sized(T), ttype Args | { void ?{}(T&, Args); })
+forall(T & | sized(T), Args... | { void ?{}(T&, Args); })
 void ?{}(counter_ptr(T) & this, Args args) {
 	this.data = (counter_data(T)*)new(args);
 }
 
-forall(dtype T | sized(T) | { void ^?{}(T &); })
+forall(T & | sized(T) | { void ^?{}(T &); })
 void ^?{}(counter_ptr(T) & this) {
 	internal_decrement(this);
 }
 
-forall(dtype T | sized(T))
+forall(T & | sized(T))
 T & *?(counter_ptr(T) & this) {
 	return *((this.data) ? &this.data->object : 0p);
 }
 
-forall(dtype T | sized(T) | { void ^?{}(T &); })
+forall(T & | sized(T) | { void ^?{}(T &); })
 void ?=?(counter_ptr(T) & this, counter_ptr(T) that) {
 	if (this.data != that.data) {
@@ -87,5 +87,5 @@
 }
 
-forall(dtype T | sized(T) | { void ^?{}(T &); })
+forall(T & | sized(T) | { void ^?{}(T &); })
 void ?=?(counter_ptr(T) & this, zero_t) {
 	internal_decrement(this);
@@ -93,20 +93,20 @@
 }
 
-forall(dtype T | sized(T))
+forall(T & | sized(T))
 int ?==?(counter_ptr(T) const & this, counter_ptr(T) const & that) {
 	return this.data == that.data;
 }
 
-forall(dtype T | sized(T))
+forall(T & | sized(T))
 int ?!=?(counter_ptr(T) const & this, counter_ptr(T) const & that) {
 	return !?==?(this, that);
 }
 
-forall(dtype T | sized(T))
+forall(T & | sized(T))
 int ?==?(counter_ptr(T) const & this, zero_t) {
 	return this.data == 0;
 }
 
-forall(dtype T | sized(T))
+forall(T & | sized(T))
 int ?!=?(counter_ptr(T) const & this, zero_t) {
 	return !?==?(this, (zero_t)0);
@@ -114,30 +114,30 @@
 
 // This is the only pointer that keeps this alive.
-forall(dtype T)
+forall(T &)
 void ?{}(unique_ptr(T) & this) {
 	this.data = 0p;
 }
 
-forall(dtype T)
+forall(T &)
 void ?{}(unique_ptr(T) & this, zero_t) {
 	this.data = 0p;
 }
 
-forall(dtype T | sized(T), ttype Args | { void ?{}(T &, Args); })
+forall(T & | sized(T), Args... | { void ?{}(T &, Args); })
 void ?{}(unique_ptr(T) & this, Args args) {
 	this.data = (T *)new(args);
 }
 
-forall(dtype T | { void ^?{}(T &); })
+forall(T & | { void ^?{}(T &); })
 void ^?{}(unique_ptr(T) & this) {
 	delete(this.data);
 }
 
-forall(dtype T)
+forall(T &)
 T & *?(unique_ptr(T) & this) {
 	return *this.data;
 }
 
-forall(dtype T | { void ^?{}(T &); })
+forall(T & | { void ^?{}(T &); })
 void ?=?(unique_ptr(T) & this, zero_t) {
 	delete(this.data);
@@ -145,5 +145,5 @@
 }
 
-forall(dtype T | { void ^?{}(T &); })
+forall(T & | { void ^?{}(T &); })
 void move(unique_ptr(T) & this, unique_ptr(T) & that) {
 	delete(this.data);
@@ -152,20 +152,20 @@
 }
 
-forall(dtype T)
+forall(T &)
 int ?==?(unique_ptr(T) const & this, unique_ptr(T) const & that) {
 	return this.data == that.data;
 }
 
-forall(dtype T)
+forall(T &)
 int ?!=?(unique_ptr(T) const & this, unique_ptr(T) const & that) {
 	return !?==?(this, that);
 }
 
-forall(dtype T)
+forall(T &)
 int ?==?(unique_ptr(T) const & this, zero_t) {
 	return this.data == 0;
 }
 
-forall(dtype T)
+forall(T &)
 int ?!=?(unique_ptr(T) const & this, zero_t) {
 	return !?==?(this, (zero_t)0);
Index: libcfa/src/memory.hfa
===================================================================
--- libcfa/src/memory.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/memory.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -17,5 +17,5 @@
 
 // Internal data object.
-forall(dtype T | sized(T)) {
+forall(T & | sized(T)) {
 	struct counter_data {
 		unsigned int counter;
@@ -23,5 +23,5 @@
 	};
 
-	forall(ttype Args | { void ?{}(T &, Args); })
+	forall(Args... | { void ?{}(T &, Args); })
 	void ?{}(counter_data(T) & this, Args args);
 
@@ -31,5 +31,5 @@
 
 // This is one of many pointers keeping this alive.
-forall(dtype T | sized(T)) {
+forall(T & | sized(T)) {
 	struct counter_ptr {
 		counter_data(T) * data;
@@ -40,5 +40,5 @@
 	forall( | { void ^?{}(T &); })
 	void ?{}(counter_ptr(T) & this, counter_ptr(T) that);
-	forall(ttype Args | { void ?{}(T&, Args); })
+	forall(Args... | { void ?{}(T&, Args); })
 	void ?{}(counter_ptr(T) & this, Args args);
 
@@ -60,5 +60,5 @@
 
 // This is the only pointer that keeps this alive.
-forall(dtype T) {
+forall(T &) {
 	struct unique_ptr {
 		T * data;
@@ -68,5 +68,5 @@
 	void ?{}(unique_ptr(T) & this, zero_t);
 	void ?{}(unique_ptr(T) & this, unique_ptr(T) that) = void;
-	forall( | sized(T), ttype Args | { void ?{}(T &, Args); })
+	forall( | sized(T), Args... | { void ?{}(T &, Args); })
 	void ?{}(unique_ptr(T) & this, Args args);
 
Index: libcfa/src/parseargs.hfa
===================================================================
--- libcfa/src/parseargs.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/parseargs.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -14,5 +14,5 @@
 static inline void ?{}( cfa_option & this ) {}
 
-forall(dtype T | { bool parse(const char *, T & ); })
+forall(T & | { bool parse(const char *, T & ); })
 static inline void ?{}( cfa_option & this, char short_name, const char * long_name, const char * help, T & variable ) {
       this.val        = 0;
@@ -24,5 +24,5 @@
 }
 
-forall(dtype T)
+forall(T &)
 static inline void ?{}( cfa_option & this, char short_name, const char * long_name, const char * help, T & variable, bool (*parse)(const char *, T & )) {
       this.val        = 0;
Index: libcfa/src/rational.cfa
===================================================================
--- libcfa/src/rational.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/rational.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -18,5 +18,5 @@
 #include "stdlib.hfa"
 
-forall( otype RationalImpl | arithmetic( RationalImpl ) ) {
+forall( RationalImpl | arithmetic( RationalImpl ) ) {
 	// helper routines
 
@@ -159,5 +159,5 @@
 	// I/O
 
-	forall( dtype istype | istream( istype ) | { istype & ?|?( istype &, RationalImpl & ); } )
+	forall( istype & | istream( istype ) | { istype & ?|?( istype &, RationalImpl & ); } )
 	istype & ?|?( istype & is, Rational(RationalImpl) & r ) {
 		is | r.numerator | r.denominator;
@@ -168,5 +168,5 @@
 	} // ?|?
 
-	forall( dtype ostype | ostream( ostype ) | { ostype & ?|?( ostype &, RationalImpl ); } ) {
+	forall( ostype & | ostream( ostype ) | { ostype & ?|?( ostype &, RationalImpl ); } ) {
 		ostype & ?|?( ostype & os, Rational(RationalImpl) r ) {
 			return os | r.numerator | '/' | r.denominator;
@@ -179,5 +179,5 @@
 } // distribution
 
-forall( otype RationalImpl | arithmetic( RationalImpl ) | { RationalImpl ?\?( RationalImpl, unsigned long ); } )
+forall( RationalImpl | arithmetic( RationalImpl ) | { RationalImpl ?\?( RationalImpl, unsigned long ); } )
 Rational(RationalImpl) ?\?( Rational(RationalImpl) x, long int y ) {
 	if ( y < 0 ) {
@@ -190,10 +190,10 @@
 // conversion
 
-forall( otype RationalImpl | arithmetic( RationalImpl ) | { double convert( RationalImpl ); } )
+forall( RationalImpl | arithmetic( RationalImpl ) | { double convert( RationalImpl ); } )
 double widen( Rational(RationalImpl) r ) {
  	return convert( r.numerator ) / convert( r.denominator );
 } // widen
 
-forall( otype RationalImpl | arithmetic( RationalImpl ) | { double convert( RationalImpl ); RationalImpl convert( double ); } )
+forall( RationalImpl | arithmetic( RationalImpl ) | { double convert( RationalImpl ); RationalImpl convert( double ); } )
 Rational(RationalImpl) narrow( double f, RationalImpl md ) {
 	// http://www.ics.uci.edu/~eppstein/numth/frap.c
Index: libcfa/src/rational.hfa
===================================================================
--- libcfa/src/rational.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/rational.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -20,8 +20,8 @@
 #include "iostream.hfa"
 
-trait scalar( otype T ) {
+trait scalar( T ) {
 };
 
-trait arithmetic( otype T | scalar( T ) ) {
+trait arithmetic( T | scalar( T ) ) {
 	int !?( T );
 	int ?==?( T, T );
@@ -46,5 +46,5 @@
 // implementation
 
-forall( otype RationalImpl | arithmetic( RationalImpl ) ) {
+forall( RationalImpl | arithmetic( RationalImpl ) ) {
 	struct Rational {
 		RationalImpl numerator, denominator;			// invariant: denominator > 0
@@ -89,8 +89,8 @@
 
 	// I/O
-	forall( dtype istype | istream( istype ) | { istype & ?|?( istype &, RationalImpl & ); } )
+	forall( istype & | istream( istype ) | { istype & ?|?( istype &, RationalImpl & ); } )
 	istype & ?|?( istype &, Rational(RationalImpl) & );
 
-	forall( dtype ostype | ostream( ostype ) | { ostype & ?|?( ostype &, RationalImpl ); } ) {
+	forall( ostype & | ostream( ostype ) | { ostype & ?|?( ostype &, RationalImpl ); } ) {
 		ostype & ?|?( ostype &, Rational(RationalImpl) );
 		void ?|?( ostype &, Rational(RationalImpl) );
@@ -98,11 +98,11 @@
 } // distribution
 
-forall( otype RationalImpl | arithmetic( RationalImpl ) |{RationalImpl ?\?( RationalImpl, unsigned long );} )
+forall( RationalImpl | arithmetic( RationalImpl ) |{RationalImpl ?\?( RationalImpl, unsigned long );} )
 Rational(RationalImpl) ?\?( Rational(RationalImpl) x, long int y );
 
 // conversion
-forall( otype RationalImpl | arithmetic( RationalImpl ) | { double convert( RationalImpl ); } )
+forall( RationalImpl | arithmetic( RationalImpl ) | { double convert( RationalImpl ); } )
 double widen( Rational(RationalImpl) r );
-forall( otype RationalImpl | arithmetic( RationalImpl ) | { double convert( RationalImpl );  RationalImpl convert( double );} )
+forall( RationalImpl | arithmetic( RationalImpl ) | { double convert( RationalImpl );  RationalImpl convert( double );} )
 Rational(RationalImpl) narrow( double f, RationalImpl md );
 
Index: libcfa/src/stdlib.cfa
===================================================================
--- libcfa/src/stdlib.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/stdlib.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -28,5 +28,5 @@
 // Cforall allocation/deallocation and constructor/destructor, array types
 
-forall( dtype T | sized(T), ttype TT | { void ?{}( T &, TT ); } )
+forall( T & | sized(T), TT... | { void ?{}( T &, TT ); } )
 T * anew( size_t dim, TT p ) {
 	T * arr = alloc( dim );
@@ -37,5 +37,5 @@
 } // anew
 
-forall( dtype T | sized(T) | { void ^?{}( T & ); } )
+forall( T & | sized(T) | { void ^?{}( T & ); } )
 void adelete( T arr[] ) {
 	if ( arr ) {										// ignore null
@@ -48,5 +48,5 @@
 } // adelete
 
-forall( dtype T | sized(T) | { void ^?{}( T & ); }, ttype TT | { void adelete( TT ); } )
+forall( T & | sized(T) | { void ^?{}( T & ); }, TT... | { void adelete( TT ); } )
 void adelete( T arr[], TT rest ) {
 	if ( arr ) {										// ignore null
@@ -97,5 +97,5 @@
 //---------------------------------------
 
-forall( otype E | { int ?<?( E, E ); } ) {
+forall( E | { int ?<?( E, E ); } ) {
 	E * bsearch( E key, const E * vals, size_t dim ) {
 		int cmp( const void * t1, const void * t2 ) {
@@ -156,5 +156,5 @@
 
 
-forall( otype K, otype E | { int ?<?( K, K ); K getKey( const E & ); } ) {
+forall( K, E | { int ?<?( K, K ); K getKey( const E & ); } ) {
 	E * bsearch( K key, const E * vals, size_t dim ) {
 		int cmp( const void * t1, const void * t2 ) {
Index: libcfa/src/stdlib.hfa
===================================================================
--- libcfa/src/stdlib.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/stdlib.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -10,6 +10,6 @@
 // Created On       : Thu Jan 28 17:12:35 2016
 // Last Modified By : Peter A. Buhr
-// Last Modified On : Sat Dec 12 13:52:34 2020
-// Update Count     : 536
+// Last Modified On : Mon Jan 18 21:51:13 2021
+// Update Count     : 569
 //
 
@@ -48,5 +48,5 @@
 	else return (T *)alignment( _Alignof(T), dim, sizeof(T) )
 
-static inline forall( dtype T | sized(T) ) {
+static inline forall( T & | sized(T) ) {
 	// CFA safe equivalents, i.e., implicit size specification
 
@@ -108,5 +108,5 @@
 
 	1. Replace the current forall-block that contains defintions of S_fill and S_realloc with following:
-		forall( dtype T | sized(T) ) {
+		forall( T & | sized(T) ) {
 			union  U_fill 		{ char c; T * a; T t; };
 			struct S_fill 		{ char tag; U_fill(T) fill; };
@@ -151,5 +151,5 @@
 typedef struct S_resize			{ inline void *;  }	T_resize;
 
-forall( dtype T ) {
+forall( T & ) {
 	struct S_fill 		{ char tag; char c; size_t size; T * at; char t[50]; };
 	struct S_realloc	{ inline T *; };
@@ -159,9 +159,11 @@
 static inline T_resize 	?`resize  ( void * a )	{ return (T_resize){a}; }
 
-static inline forall( dtype T | sized(T) ) {
+static inline forall( T & | sized(T) ) {
 	S_fill(T) ?`fill ( T t ) {
 		S_fill(T) ret = { 't' };
 		size_t size = sizeof(T);
-		if(size > sizeof(ret.t)) { printf("ERROR: const object of size greater than 50 bytes given for dynamic memory fill\n"); exit(1); }
+		if ( size > sizeof(ret.t) ) {
+			abort( "ERROR: const object of size greater than 50 bytes given for dynamic memory fill\n" );
+		} // if
 		memcpy( &ret.t, &t, size );
 		return ret;
@@ -173,5 +175,5 @@
 	S_realloc(T) 	?`realloc ( T * a )				{ return (S_realloc(T)){a}; }
 
-	T * $alloc_internal( void * Resize, T * Realloc, size_t Align, size_t Dim, S_fill(T) Fill) {
+	T * $alloc_internal( void * Resize, T * Realloc, size_t Align, size_t Dim, S_fill(T) Fill ) {
 		T * ptr = NULL;
 		size_t size = sizeof(T);
@@ -181,28 +183,23 @@
 			ptr = (T*) (void *) resize( (void *)Resize, Align, Dim * size );
 		} else if ( Realloc ) {
-			if (Fill.tag != '0') copy_end = min(malloc_size( Realloc ), Dim * size);
-			ptr = (T*) (void *) realloc( (void *)Realloc, Align, Dim * size );
+			if ( Fill.tag != '0' ) copy_end = min(malloc_size( Realloc ), Dim * size );
+			ptr = (T *) (void *) realloc( (void *)Realloc, Align, Dim * size );
 		} else {
-			ptr = (T*) (void *) memalign( Align, Dim * size );
-		}
-
-		if(Fill.tag == 'c') {
+			ptr = (T *) (void *) memalign( Align, Dim * size );
+		}
+
+		if ( Fill.tag == 'c' ) {
 			memset( (char *)ptr + copy_end, (int)Fill.c, Dim * size - copy_end );
-		} else if(Fill.tag == 't') {
+		} else if ( Fill.tag == 't' ) {
 			for ( int i = copy_end; i < Dim * size; i += size ) {
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Warray-bounds"
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wstringop-overflow="
-				memcpy( (char *)ptr + i, &Fill.t, size );
-#pragma GCC diagnostic pop
-#pragma GCC diagnostic pop
+				#pragma GCC diagnostic push
+				#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+				memcpy( (char *)ptr + i, &Fill.t, size );	// copy exactly one T (size == sizeof(T)); Fill.t is a 50-byte staging buffer, so sizeof(Fill.t) would overrun the element/allocation
+				#pragma GCC diagnostic pop
 			}
-		} else if(Fill.tag == 'a') {
+		} else if ( Fill.tag == 'a' ) {
 			memcpy( (char *)ptr + copy_end, Fill.at, min(Dim * size - copy_end, Fill.size) );
-		} else if(Fill.tag == 'T') {
-			for ( int i = copy_end; i < Dim * size; i += size ) {
-				memcpy( (char *)ptr + i, Fill.at, size );
-			}
+		} else if ( Fill.tag == 'T' ) {
+			memcpy( (char *)ptr + copy_end, Fill.at, Dim * size );
 		}
 
@@ -210,5 +207,5 @@
 	} // $alloc_internal
 
-	forall( ttype TT | { T * $alloc_internal( void *, T *, size_t, size_t, S_fill(T), TT ); } ) {
+	forall( TT... | { T * $alloc_internal( void *, T *, size_t, size_t, S_fill(T), TT ); } ) {
 
 		T * $alloc_internal( void *       , T * Realloc, size_t Align, size_t Dim, S_fill(T) Fill, T_resize Resize, TT rest) {
@@ -239,5 +236,5 @@
 } // distribution T
 
-static inline forall( dtype T | sized(T) ) {
+static inline forall( T & | sized(T) ) {
 	// CFA safe initialization/copy, i.e., implicit size specification, non-array types
 	T * memset( T * dest, char fill ) {
@@ -260,9 +257,9 @@
 
 // CFA deallocation for multiple objects
-static inline forall( dtype T )							// FIX ME, problems with 0p in list
+static inline forall( T & )							// FIX ME, problems with 0p in list
 void free( T * ptr ) {
 	free( (void *)ptr );								// C free
 } // free
-static inline forall( dtype T, ttype TT | { void free( TT ); } )
+static inline forall( T &, TT... | { void free( TT ); } )
 void free( T * ptr, TT rest ) {
 	free( ptr );
@@ -271,10 +268,10 @@
 
 // CFA allocation/deallocation and constructor/destructor, non-array types
-static inline forall( dtype T | sized(T), ttype TT | { void ?{}( T &, TT ); } )
+static inline forall( T & | sized(T), TT... | { void ?{}( T &, TT ); } )
 T * new( TT p ) {
-	return &(*(T *)malloc()){ p };							// run constructor
+	return &(*(T *)malloc()){ p };						// run constructor
 } // new
 
-static inline forall( dtype T | { void ^?{}( T & ); } )
+static inline forall( T & | { void ^?{}( T & ); } )
 void delete( T * ptr ) {
 	// special case for 0-sized object => always call destructor
@@ -284,5 +281,5 @@
 	free( ptr );										// always call free
 } // delete
-static inline forall( dtype T, ttype TT | { void ^?{}( T & ); void delete( TT ); } )
+static inline forall( T &, TT... | { void ^?{}( T & ); void delete( TT ); } )
 void delete( T * ptr, TT rest ) {
 	delete( ptr );
@@ -291,7 +288,7 @@
 
 // CFA allocation/deallocation and constructor/destructor, array types
-forall( dtype T | sized(T), ttype TT | { void ?{}( T &, TT ); } ) T * anew( size_t dim, TT p );
-forall( dtype T | sized(T) | { void ^?{}( T & ); } ) void adelete( T arr[] );
-forall( dtype T | sized(T) | { void ^?{}( T & ); }, ttype TT | { void adelete( TT ); } ) void adelete( T arr[], TT rest );
+forall( T & | sized(T), TT... | { void ?{}( T &, TT ); } ) T * anew( size_t dim, TT p );
+forall( T & | sized(T) | { void ^?{}( T & ); } ) void adelete( T arr[] );
+forall( T & | sized(T) | { void ^?{}( T & ); }, TT... | { void adelete( TT ); } ) void adelete( T arr[], TT rest );
 
 //---------------------------------------
@@ -333,5 +330,5 @@
 //---------------------------------------
 
-forall( otype E | { int ?<?( E, E ); } ) {
+forall( E | { int ?<?( E, E ); } ) {
 	E * bsearch( E key, const E * vals, size_t dim );
 	size_t bsearch( E key, const E * vals, size_t dim );
@@ -342,5 +339,5 @@
 } // distribution
 
-forall( otype K, otype E | { int ?<?( K, K ); K getKey( const E & ); } ) {
+forall( K, E | { int ?<?( K, K ); K getKey( const E & ); } ) {
 	E * bsearch( K key, const E * vals, size_t dim );
 	size_t bsearch( K key, const E * vals, size_t dim );
@@ -351,5 +348,5 @@
 } // distribution
 
-forall( otype E | { int ?<?( E, E ); } ) {
+forall( E | { int ?<?( E, E ); } ) {
 	void qsort( E * vals, size_t dim );
 } // distribution
Index: libcfa/src/time.cfa
===================================================================
--- libcfa/src/time.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/time.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -31,5 +31,5 @@
 
 
-forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
 	ostype & ?|?( ostype & os, Duration dur ) with( dur ) {
 		(ostype &)(os | tn / TIMEGRAN);					// print seconds
@@ -136,5 +136,5 @@
 } // strftime
 
-forall( dtype ostype | ostream( ostype ) ) {
+forall( ostype & | ostream( ostype ) ) {
 	ostype & ?|?( ostype & os, Time time ) with( time ) {
 		char buf[32];									// at least 26
Index: libcfa/src/vec/vec.hfa
===================================================================
--- libcfa/src/vec/vec.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/vec/vec.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -18,35 +18,35 @@
 #include <math.hfa>
 
-trait fromint(otype T) {
+trait fromint(T) {
     void ?{}(T&, int);
 };
-trait zeroinit(otype T) {
+trait zeroinit(T) {
     void ?{}(T&, zero_t);
 };
-trait zero_assign(otype T) {
+trait zero_assign(T) {
     T ?=?(T&, zero_t);
 };
-trait subtract(otype T) {
+trait subtract(T) {
     T ?-?(T, T);
 };
-trait negate(otype T) {
+trait negate(T) {
     T -?(T);
 };
-trait add(otype T) {
+trait add(T) {
     T ?+?(T, T);
 };
-trait multiply(otype T) {
+trait multiply(T) {
     T ?*?(T, T);
 };
-trait divide(otype T) {
+trait divide(T) {
     T ?/?(T, T);
 };
-trait lessthan(otype T) {
+trait lessthan(T) {
     int ?<?(T, T);
 };
-trait equality(otype T) {
+trait equality(T) {
     int ?==?(T, T);
 };
-trait sqrt(otype T) {
+trait sqrt(T) {
     T sqrt(T);
 };
@@ -68,5 +68,5 @@
 }
 
-trait dottable(otype V, otype T) {
+trait dottable(V, T) {
     T dot(V, V);
 };
@@ -74,20 +74,20 @@
 static inline {
 
-forall(otype T | sqrt(T), otype V | dottable(V, T))
+forall(T | sqrt(T), V | dottable(V, T))
 T length(V v) {
    return sqrt(dot(v, v));
 }
 
-forall(otype T, otype V | dottable(V, T))
+forall(T, V | dottable(V, T))
 T length_squared(V v) {
    return dot(v, v);
 }
 
-forall(otype T, otype V | { T length(V); } | subtract(V))
+forall(T, V | { T length(V); } | subtract(V))
 T distance(V v1, V v2) {
     return length(v1 - v2);
 }
 
-forall(otype T, otype V | { T length(V); V ?/?(V, T); })
+forall(T, V | { T length(V); V ?/?(V, T); })
 V normalize(V v) {
     return v / length(v);
@@ -95,5 +95,5 @@
 
 // Project vector u onto vector v
-forall(otype T, otype V | dottable(V, T) | { V normalize(V); V ?*?(V, T); })
+forall(T, V | dottable(V, T) | { V normalize(V); V ?*?(V, T); })
 V project(V u, V v) {
     V v_norm = normalize(v);
@@ -102,5 +102,5 @@
 
 // Reflect incident vector v with respect to surface with normal n
-forall(otype T | fromint(T), otype V | { V project(V, V); V ?*?(T, V); V ?-?(V,V); })
+forall(T | fromint(T), V | { V project(V, V); V ?*?(T, V); V ?-?(V,V); })
 V reflect(V v, V n) {
     return v - (T){2} * project(v, n);
@@ -111,6 +111,6 @@
 // entering material (i.e., from air to water, eta = 1/1.33)
 // v and n must already be normalized
-forall(otype T | fromint(T) | subtract(T) | multiply(T) | add(T) | lessthan(T) | sqrt(T),
-       otype V | dottable(V, T) | { V ?*?(T, V); V ?-?(V,V); void ?{}(V&, zero_t); })
+forall(T | fromint(T) | subtract(T) | multiply(T) | add(T) | lessthan(T) | sqrt(T),
+       V | dottable(V, T) | { V ?*?(T, V); V ?-?(V,V); void ?{}(V&, zero_t); })
 V refract(V v, V n, T eta) {
     T dotValue = dot(n, v);
@@ -128,5 +128,5 @@
 // i is the incident vector
 // ng is the geometric normal of the surface
-forall(otype T | lessthan(T) | zeroinit(T), otype V | dottable(V, T) | negate(V))
+forall(T | lessthan(T) | zeroinit(T), V | dottable(V, T) | negate(V))
 V faceforward(V n, V i, V ng) {
     return dot(ng, i) < (T){0} ? n : -n;
Index: libcfa/src/vec/vec2.hfa
===================================================================
--- libcfa/src/vec/vec2.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/vec/vec2.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -19,5 +19,5 @@
 #include "vec.hfa"
 
-forall (otype T) {
+forall (T) {
     struct vec2 {
         T x, y;
@@ -25,5 +25,5 @@
 }
 
-forall (otype T) {
+forall (T) {
     static inline {
 
@@ -279,5 +279,5 @@
 }
 
-forall(dtype ostype, otype T | writeable(T, ostype)) {
+forall(ostype &, T | writeable(T, ostype)) {
     ostype & ?|?(ostype & os, vec2(T) v) with (v) {
         return os | '<' | x | ',' | y | '>';
Index: libcfa/src/vec/vec3.hfa
===================================================================
--- libcfa/src/vec/vec3.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/vec/vec3.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -19,5 +19,5 @@
 #include "vec.hfa"
 
-forall (otype T) {
+forall (T) {
     struct vec3 {
         T x, y, z;
@@ -25,5 +25,5 @@
 }
 
-forall (otype T) {
+forall (T) {
     static inline {
 
@@ -288,5 +288,5 @@
 }
 
-forall(dtype ostype, otype T | writeable(T, ostype)) {
+forall(ostype &, T | writeable(T, ostype)) {
     ostype & ?|?(ostype & os, vec3(T) v) with (v) {
         return os | '<' | x | ',' | y | ',' | z | '>';
Index: libcfa/src/vec/vec4.hfa
===================================================================
--- libcfa/src/vec/vec4.hfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ libcfa/src/vec/vec4.hfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -19,5 +19,5 @@
 #include "vec.hfa"
 
-forall (otype T) {
+forall (T) {
     struct vec4 {
         T x, y, z, w;
@@ -25,5 +25,5 @@
 }
 
-forall (otype T) {
+forall (T) {
     static inline {
 
@@ -283,5 +283,5 @@
 }
 
-forall(dtype ostype, otype T | writeable(T, ostype)) {
+forall(ostype &, T | writeable(T, ostype)) {
     ostype & ?|?(ostype & os, vec4(T) v) with (v) {
         return os | '<' | x | ',' | y | ',' | z | ',' | w | '>';
Index: src/Parser/parser.yy
===================================================================
--- src/Parser/parser.yy	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ src/Parser/parser.yy	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -2441,5 +2441,9 @@
 type_parameter:											// CFA
 	type_class identifier_or_type_name
-		{ typedefTable.addToScope( *$2, TYPEDEFname, "9" ); }
+		{   typedefTable.addToScope( *$2, TYPEDEFname, "9" );
+			if ( $1 == TypeDecl::Otype ) { SemanticError( yylloc, "otype keyword is deprecated" ); }
+			if ( $1 == TypeDecl::Dtype ) { SemanticError( yylloc, "dtype keyword is deprecated" ); }
+			if ( $1 == TypeDecl::Ttype ) { SemanticError( yylloc, "ttype keyword is deprecated" ); }
+		}
 	  type_initializer_opt assertion_list_opt
 		{ $$ = DeclarationNode::newTypeParam( $1, $2 )->addTypeInitializer( $4 )->addAssertions( $5 ); }
Index: src/ResolvExpr/PolyCost.cc
===================================================================
--- src/ResolvExpr/PolyCost.cc	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ src/ResolvExpr/PolyCost.cc	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -35,5 +35,5 @@
 		PassVisitor<PolyCost> coster( env, indexer );
 		type->accept( coster );
-		return coster.pass.result;
+		return (coster.pass.result > 0) ? 1 : 0;
 	}
 
@@ -87,5 +87,5 @@
 	ast::Pass<PolyCost_new> costing( symtab, env );
 	type->accept( costing );
-	return costing.core.result;
+	return (costing.core.result > 0) ? 1 : 0;
 }
 
Index: src/ResolvExpr/SpecCost.cc
===================================================================
--- src/ResolvExpr/SpecCost.cc	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ src/ResolvExpr/SpecCost.cc	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -43,4 +43,7 @@
 		// mark specialization of base type
 		void postvisit(ReferenceType*) { if ( count >= 0 ) ++count; }
+
+		void postvisit(StructInstType*) { if ( count >= 0 ) ++count; }
+		void postvisit(UnionInstType*) { if ( count >= 0 ) ++count; }
 
 	private:
@@ -82,5 +85,4 @@
 		void previsit(StructInstType* sty) {
 			count = minover( sty->parameters );
-			visit_children = false;
 		}
 
@@ -88,5 +90,4 @@
 		void previsit(UnionInstType* uty) {
 			count = minover( uty->parameters );
-			visit_children = false;
 		}
 
@@ -174,4 +175,7 @@
 		void postvisit( const ast::ArrayType * ) { if ( count >= 0 ) ++count; }
 		void postvisit( const ast::ReferenceType * ) { if ( count >= 0 ) ++count; }
+
+		void postvisit( const ast::StructInstType * ) { if ( count >= 0 ) ++count; }
+		void postvisit( const ast::UnionInstType * ) { if ( count >= 0 ) ++count; }
 
 		// Use the minimal specialization value over returns and params.
@@ -189,5 +193,4 @@
 		void previsit( const ast::StructInstType * sty ) {
 			count = minimumPresent( sty->params, expr_result );
-			visit_children = false;
 		}
 
@@ -195,5 +198,4 @@
 		void previsit( const ast::UnionInstType * uty ) {
 			count = minimumPresent( uty->params, expr_result );
-			visit_children = false;
 		}
 
Index: tests/.expect/poly-selection.txt
===================================================================
--- tests/.expect/poly-selection.txt	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
+++ tests/.expect/poly-selection.txt	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -0,0 +1,5 @@
+friending generically
+friending specifically
+-
+f-generic
+f-specific
Index: tests/avltree/avl-private.cfa
===================================================================
--- tests/avltree/avl-private.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/avltree/avl-private.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -11,5 +11,5 @@
 // an AVL tree's height is easy to compute
 // just follow path with the larger balance
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int height(tree(K, V) * t){
   int helper(tree(K, V) * t, int ht){
@@ -27,5 +27,5 @@
 }
 
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int calcBalance(tree(K, V) * t){
   int l = height(t->left);
@@ -36,5 +36,5 @@
 
 // re-establish the link between parent and child
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void relinkToParent(tree(K, V) * t){
   tree(K, V) * parent = t->parent; // FIX ME!!
@@ -49,5 +49,5 @@
 
 // rotate left from t
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * rotateLeft(tree(K, V) * t){
   tree(K, V) * newRoot = t->right;
@@ -68,5 +68,5 @@
 
 // rotate right from t
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * rotateRight(tree(K, V) * t){
   tree(K, V) * newRoot = t->left;
@@ -87,5 +87,5 @@
 
 // balances a node that has balance factor -2 or 2
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * fix(tree(K, V) * t){
   // ensure that t's balance factor is one of
@@ -113,5 +113,5 @@
 
 // attempt to fix the tree, if necessary
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * tryFix(tree(K, V) * t){
   int b = calcBalance(t);
@@ -126,5 +126,5 @@
 
 // sets parent field of c to be p
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void setParent(tree(K, V) * c, tree(K, V) * p){
   if (! empty(c)){
Index: tests/avltree/avl-private.h
===================================================================
--- tests/avltree/avl-private.h	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/avltree/avl-private.h	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -5,11 +5,11 @@
 
 // attempt to fix the tree, if necessary
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * tryFix(tree(K, V) * t);
 
 // sets parent field of c to be p
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void setParent(tree(K, V) * c, tree(K, V) * p);
 
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int height(tree(K, V) * t);
Index: tests/avltree/avl.h
===================================================================
--- tests/avltree/avl.h	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/avltree/avl.h	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -9,12 +9,12 @@
 // #include <lib.h>
 
-trait Comparable(otype T) {
+trait Comparable(T) {
   int ?<?(T, T);
 };
 
-forall(otype T | Comparable(T))
+forall(T | Comparable(T))
 int ?==?(T t1, T t2);
 
-forall(otype T | Comparable(T))
+forall(T | Comparable(T))
 int ?>?(T t1, T t2);
 
@@ -41,8 +41,8 @@
 
 // temporary: need forward decl to get around typedef problem
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 struct tree;
 
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 struct tree {
   K key;
@@ -54,30 +54,30 @@
 };
 
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void ?{}(tree(K, V) &t, K key, V value);
 
-forall(otype K, otype V)
+forall(K | Comparable(K), V)
 void ^?{}(tree(K, V) & t);
 
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * create(K key, V value);
 
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 V * find(tree(K, V) * t, K key);
 
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int empty(tree(K, V) * t);
 
 // returns the root of the tree
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int insert(tree(K, V) ** t, K key, V value);
 
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int remove(tree(K, V) ** t, K key);
 
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void copy(tree(K, V) * src, tree(K, V) ** ret);
 
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void for_each(tree(K, V) * t, void (*func)(V));
 
Index: tests/avltree/avl0.cfa
===================================================================
--- tests/avltree/avl0.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/avltree/avl0.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -1,10 +1,10 @@
 #include "avl.h"
 
-forall(otype T | Comparable(T))
+forall(T | Comparable(T))
 int ?==?(T t1, T t2) {
   return !(t1 < t2) && !(t2 < t1);
 }
 
-forall(otype T | Comparable(T))
+forall(T | Comparable(T))
 int ?>?(T t1, T t2) {
   return t2 < t1;
Index: tests/avltree/avl1.cfa
===================================================================
--- tests/avltree/avl1.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/avltree/avl1.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -3,5 +3,5 @@
 #include <stdlib.hfa>
 
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void ?{}(tree(K, V) &t, K key, V value){
   (t.key) { key };
@@ -13,5 +13,5 @@
 }
 
-forall(otype K, otype V)
+forall(K| Comparable(K), V)
 void ^?{}(tree(K, V) & t){
   delete(t.left);
@@ -21,5 +21,5 @@
 }
 
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * create(K key, V value) {
   // infinite loop trying to resolve ... t = malloc();
Index: tests/avltree/avl2.cfa
===================================================================
--- tests/avltree/avl2.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/avltree/avl2.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -2,5 +2,5 @@
 #include "avl-private.h"
 
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 V * find(tree(K, V) * t, K key){
   if (empty(t)){
@@ -18,5 +18,5 @@
 }
 
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int empty(tree(K, V) * t){
   return t == NULL;
@@ -24,5 +24,5 @@
 
 // returns the root of the tree
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int insert(tree(K, V) ** t, K key, V value) {
   // handles a non-empty tree
Index: tests/avltree/avl3.cfa
===================================================================
--- tests/avltree/avl3.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/avltree/avl3.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -4,5 +4,5 @@
 
 // swaps the data within two tree nodes
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void node_swap(tree(K, V) * t, tree(K, V) * t2){
 	swap( t->key,  t2->key);
@@ -11,5 +11,5 @@
 
 // go left as deep as possible from within the right subtree
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * find_successor(tree(K, V) * t){
 	tree(K, V) * find_successor_helper(tree(K, V) * t){
@@ -25,5 +25,5 @@
 
 // cleanup - don't want to deep delete, so set children to NULL first.
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void deleteSingleNode(tree(K, V) * t) {
 	t->left = NULL;
@@ -33,5 +33,5 @@
 
 // does the actual remove operation once we've found the node in question
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * remove_node(tree(K, V) * t){
 	// is the node a leaf?
@@ -85,5 +85,5 @@
 
 // finds the node that needs to be removed
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 tree(K, V) * remove_helper(tree(K, V) * t, K key, int * worked){
 	if (empty(t)){
@@ -106,5 +106,5 @@
 }
 
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int remove(tree(K, V) ** t, K key){
 	int worked = 0;
Index: tests/avltree/avl4.cfa
===================================================================
--- tests/avltree/avl4.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/avltree/avl4.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -4,5 +4,5 @@
 // Perform a shallow copy of src, return the
 // new tree in ret
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 int copy(tree(K, V) * src, tree(K, V) ** ret){
   tree(K, V) * helper(tree(K, V) * t, int * worked){
@@ -35,5 +35,5 @@
 
 // Apply func to every value element in t, using an in order traversal
-forall(otype K | Comparable(K), otype V)
+forall(K | Comparable(K), V)
 void for_each(tree(K, V) * t, int (*func)(V)) {
   if (t == NULL) {
Index: tests/bugs/10.cfa
===================================================================
--- tests/bugs/10.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/bugs/10.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -2,5 +2,5 @@
 // https://cforall.uwaterloo.ca/trac/ticket/10
 
-forall(otype T)
+forall(T)
 struct result {
       union {
Index: tests/bugs/104.cfa
===================================================================
--- tests/bugs/104.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/bugs/104.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -4,5 +4,5 @@
 [ float, float ] modf_( float x );
 
-forall(otype T | { [T, T] modf_(T); })
+forall(T | { [T, T] modf_(T); })
 void modf(T);
 
Index: tests/bugs/194.cfa
===================================================================
--- tests/bugs/194.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/bugs/194.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -2,9 +2,9 @@
 // https://cforall.uwaterloo.ca/trac/ticket/194
 
-forall( dtype T | sized(T) ) T * foo( void ) {
+forall( T & | sized(T) ) T * foo( void ) {
       printf( "foo1\n" );
 	return (T *)0;
 }
-forall( dtype T | sized(T) ) T & foo( void ) {
+forall( T & | sized(T) ) T & foo( void ) {
 	printf( "foo2\n" );
 	return (T &)*(T *)0;
Index: tests/bugs/196.cfa
===================================================================
--- tests/bugs/196.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/bugs/196.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -2,8 +2,8 @@
 // https://cforall.uwaterloo.ca/trac/ticket/196
 
-forall(dtype T)
+forall(T &)
 struct link;
 
-forall(dtype T)
+forall(T &)
 struct link {
 	link(T) * next;
@@ -12,13 +12,13 @@
 // -----
 
-forall(dtype T)
+forall(T &)
 struct foo;
 
-forall(dtype U)
+forall(U &)
 struct bar {
 	foo(U) * data;
 };
 
-forall(dtype T)
+forall(T &)
 struct foo {};
 
Index: tests/bugs/203-2.cfa
===================================================================
--- tests/bugs/203-2.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/bugs/203-2.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -1,10 +1,10 @@
 // Trac ticket: https://cforall.uwaterloo.ca/trac/ticket/203
 
-forall(dtype A)
+forall(A &)
 struct empty {
 	// Nothing.
 };
 
-forall(dtype C)
+forall(C &)
 struct wrap_e {
 	empty(C) field;
Index: tests/bugs/203-7.cfa
===================================================================
--- tests/bugs/203-7.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/bugs/203-7.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -1,10 +1,10 @@
 // Trac ticket: https://cforall.uwaterloo.ca/trac/ticket/203
 
-forall(dtype A)
+forall(A &)
 struct empty {
 	// Nothing.
 };
 
-forall(dtype C)
+forall(C &)
 struct wrap_e {
 	empty(C) field;
Index: tests/bugs/203-9.cfa
===================================================================
--- tests/bugs/203-9.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/bugs/203-9.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -1,10 +1,10 @@
 // Trac ticket: https://cforall.uwaterloo.ca/trac/ticket/203
 
-forall(dtype A)
+forall(A &)
 struct empty {
 	// Nothing.
 };
 
-forall(dtype C)
+forall(C &)
 struct wrap_e {
 	empty(C) field;
Index: tests/bugs/7.cfa
===================================================================
--- tests/bugs/7.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/bugs/7.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -8,8 +8,8 @@
 
 // (Bug 1 unresolved as of this test.)
-forall(otype T)
+forall(T)
 struct stack_node;
 
-forall(otype T)
+forall(T)
 struct stack_node {
     stack_node(T) * next;
@@ -17,14 +17,14 @@
 };
 
-forall(otype T)
+forall(T)
 struct stack {
     stack_node(T) * head;
 };
 
-trait stack_errors(otype T) {
+trait stack_errors(T) {
     T emptyStackHandler (stack(T) * this);
 };
 
-forall(otype T | stack_errors(T))
+forall(T | stack_errors(T))
 T pop (stack(T) * this) {
     return (T){};
Index: tests/castError.cfa
===================================================================
--- tests/castError.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/castError.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -14,5 +14,5 @@
 // 
 
-forall(otype T) struct S { T p; };
+forall(T) struct S { T p; };
 int f;
 S(int) sint;
Index: tests/concurrent/examples/boundedBufferEXT.cfa
===================================================================
--- tests/concurrent/examples/boundedBufferEXT.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/concurrent/examples/boundedBufferEXT.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -24,5 +24,5 @@
 enum { BufferSize = 50 };
 
-forall( otype T ) {
+forall( T ) {
 	monitor Buffer {
 		int front, back, count;
Index: tests/concurrent/examples/boundedBufferINT.cfa
===================================================================
--- tests/concurrent/examples/boundedBufferINT.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/concurrent/examples/boundedBufferINT.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -24,5 +24,5 @@
 enum { BufferSize = 50 };
 
-forall( otype T ) {
+forall( T ) {
 	monitor Buffer {
 		condition full, empty;
Index: tests/concurrent/examples/quickSort.generic.cfa
===================================================================
--- tests/concurrent/examples/quickSort.generic.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/concurrent/examples/quickSort.generic.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -21,5 +21,5 @@
 #include <string.h>										// strcmp
 
-forall( otype T | { int ?<?( T, T ); } ) {
+forall( T | { int ?<?( T, T ); } ) {
 	thread Quicksort {
 		T * values;										// communication variables
Index: tests/concurrent/multi-monitor.cfa
===================================================================
--- tests/concurrent/multi-monitor.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/concurrent/multi-monitor.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -38,5 +38,5 @@
 }
 
-forall(dtype T | sized(T) | { void ^?{}(T & mutex); })
+forall(T & | sized(T) | { void ^?{}(T & mutex); })
 void delete_mutex(T * x) {
 	^(*x){};
Index: tests/concurrent/thread.cfa
===================================================================
--- tests/concurrent/thread.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/concurrent/thread.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -1,4 +1,5 @@
 #include <fstream.hfa>
 #include <kernel.hfa>
+#include <locks.hfa>
 #include <stdlib.hfa>
 #include <thread.hfa>
Index: tests/errors/completeType.cfa
===================================================================
--- tests/errors/completeType.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/errors/completeType.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -1,7 +1,7 @@
 void foo(int *) {}
 void bar(void *) {}
-forall(otype T) void baz(T *);
-forall(dtype T) void qux(T *);
-forall(dtype T | sized(T)) void quux(T *);
+forall(T) void baz(T *);
+forall(T &) void qux(T *);
+forall(T & | sized(T)) void quux(T *);
 
 struct A;	// incomplete
@@ -39,5 +39,5 @@
 
 
-forall(otype T)
+forall(T)
 void baz(T * x) {
 	// okay
@@ -49,5 +49,5 @@
 }
 
-forall(dtype T)
+forall(T &)
 void qux(T * y) {
 	// okay
@@ -61,5 +61,5 @@
 }
 
-forall(dtype T | sized(T))
+forall(T & | sized(T))
 void quux(T * z) {
 	// okay
Index: tests/exceptions/defaults.cfa
===================================================================
--- tests/exceptions/defaults.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/exceptions/defaults.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -55,5 +55,5 @@
 
 void unhandled_test(void) {
-	forall(dtype T, dtype V | is_exception(T, V))
+	forall(T &, V & | is_exception(T, V))
 	void defaultTerminationHandler(T &) {
 		throw (unhandled_exception){};
Index: tests/exceptions/polymorphic.cfa
===================================================================
--- tests/exceptions/polymorphic.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/exceptions/polymorphic.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -3,6 +3,6 @@
 #include <exception.hfa>
 
-FORALL_TRIVIAL_EXCEPTION(proxy, (otype T), (T));
-FORALL_TRIVIAL_INSTANCE(proxy, (otype U), (U))
+FORALL_TRIVIAL_EXCEPTION(proxy, (T), (T));
+FORALL_TRIVIAL_INSTANCE(proxy, (U), (U))
 
 const char * msg(proxy(int) * this) { return "proxy(int)"; }
@@ -33,9 +33,9 @@
 }
 
-FORALL_DATA_EXCEPTION(cell, (otype T), (T))(
+FORALL_DATA_EXCEPTION(cell, (T), (T))(
 	T data;
 );
 
-FORALL_DATA_INSTANCE(cell, (otype T), (T))
+FORALL_DATA_INSTANCE(cell, (T), (T))
 
 const char * msg(cell(int) * this) { return "cell(int)"; }
Index: tests/exceptions/virtual-poly.cfa
===================================================================
--- tests/exceptions/virtual-poly.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/exceptions/virtual-poly.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -16,10 +16,10 @@
 };
 
-forall(otype T)
+forall(T)
 struct mono_child_vtable {
 	mono_base_vtable const * const parent;
 };
 
-forall(otype T)
+forall(T)
 struct mono_child {
 	mono_child_vtable(T) const * virtual_table;
@@ -37,20 +37,20 @@
 }
 
-forall(otype U)
+forall(U)
 struct poly_base_vtable {
 	poly_base_vtable(U) const * const parent;
 };
 
-forall(otype U)
+forall(U)
 struct poly_base {
 	poly_base_vtable(U) const * virtual_table;
 };
 
-forall(otype V)
+forall(V)
 struct poly_child_vtable {
 	poly_base_vtable(V) const * const parent;
 };
 
-forall(otype V)
+forall(V)
 struct poly_child {
 	poly_child_vtable(V) const * virtual_table;
Index: tests/forall.cfa
===================================================================
--- tests/forall.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/forall.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -15,5 +15,5 @@
 
 void g1() {
-	forall( otype T ) T f( T ) {};
+	forall( T ) T f( T ) {};
 	void f( int ) {};
 	void h( void (*p)(void) ) {};
@@ -32,6 +32,6 @@
 
 void g2() {
-	forall( otype T ) void f( T, T ) {}
-	forall( otype T, otype U ) void f( T, U ) {}
+	forall( T ) void f( T, T ) {}
+	forall( T, U ) void f( T, U ) {}
 
 	int x;
@@ -45,7 +45,7 @@
 }
 
-typedef forall ( otype T ) int (* f)( int );
-
-forall( otype T )
+typedef forall ( T ) int (* f)( int );
+
+forall( T )
 void swap( T left, T right ) {
 	T temp = left;
@@ -54,5 +54,5 @@
 }
 
-trait sumable( otype T ) {
+trait sumable( T ) {
 	void ?{}( T &, zero_t );							// 0 literal constructor
 	T ?+?( T, T );										// assortment of additions
@@ -62,5 +62,5 @@
 }; // sumable
 
-forall( otype T | sumable( T ) )						// use trait
+forall( T | sumable( T ) )						// use trait
 T sum( size_t size, T a[] ) {
 	T total = 0;										// initialize by 0 constructor
@@ -70,10 +70,10 @@
 } // sum
 
-forall( otype T | { T ?+?( T, T ); T ?++( T & ); [T] ?+=?( T &,T ); } )
+forall( T | { T ?+?( T, T ); T ?++( T & ); [T] ?+=?( T &,T ); } )
 T twice( T t ) {
 	return t + t;
 }
 
-forall( otype T | { int ?<?(T, T); } )
+forall( T | { int ?<?(T, T); } )
 T min( T t1, T t2 ) {
 	return t1 < t2 ? t1 : t2;
@@ -91,23 +91,23 @@
 
 // Multiple forall
-forall( otype T ) forall( otype S ) struct { int i; };
-forall( otype T ) struct { int i; } forall( otype S );
-struct { int i; } forall( otype T ) forall( otype S );
-forall( otype W ) struct { int i; } forall( otype T ) forall( otype S );
+forall( T ) forall( S ) struct { int i; };
+forall( T ) struct { int i; } forall( S );
+struct { int i; } forall( T ) forall( S );
+forall( W ) struct { int i; } forall( T ) forall( S );
 
 // Distribution
 struct P { int i; };
-forall( otype T ) struct Q { T i; };
-forall( otype T ) struct { int i; };
+forall( T ) struct Q { T i; };
+forall( T ) struct { int i; };
 struct KK { int i; };
 inline static {
  	void RT1() {}
 }
-forall( otype T ) {
+forall( T ) {
 	T RT2( T ) {
 		typedef int TD1;
 		struct S1 { T t; };
 	}
-	forall( otype X ) {
+	forall( X ) {
 		typedef int TD2;
 		struct S2 {};
@@ -117,5 +117,5 @@
 	}
 	extern "C" {
-		forall( otype W ) {
+		forall( W ) {
 			W RT3( W ) {}
 			struct S3 {};
@@ -123,5 +123,5 @@
 	}
 	void RT4() {
-		forall( otype W ) struct S4 {};
+		forall( W ) struct S4 {};
 		typedef int TD3;
 	}
@@ -147,22 +147,22 @@
 
 static inline {
-	forall( otype T ) {
+	forall( T ) {
 		int RT6( T p );
 	}
-	forall( otype T, otype U ) {
+	forall( T, U ) {
 		int RT7( T, U );
 	}
 }
-static forall( otype T ) {
+static forall( T ) {
 	int RT8( T );
 }
-forall( otype T ) inline static {
+forall( T ) inline static {
 	int RT9( T ) { T t; return 3; }
 }
 
-forall( otype T | { T ?+?( T, T ); } ) {
-	forall( otype S | { T ?+?( T, S ); } ) {
-		forall( otype W ) T bar( T t, S s ) { return t + s; }
-		forall( otype W | { W ?+?( T, W ); } ) W baz( T t, S s, W w ) { return t + s + w; }
+forall( T | { T ?+?( T, T ); } ) {
+	forall( S | { T ?+?( T, S ); } ) {
+		forall( W ) T bar( T t, S s ) { return t + s; }
+		forall( W | { W ?+?( T, W ); } ) W baz( T t, S s, W w ) { return t + s + w; }
 		struct W { T t; } (int,int) ww;
 		struct P pp;
@@ -170,18 +170,18 @@
 }
 
-forall( otype T | { T ?+?( T, T ); } ) forall( otype S | { T ?+?( T, S ); } ) 
+forall( T | { T ?+?( T, T ); } ) forall( S | { T ?+?( T, S ); } ) 
 struct XW { T t; };
 XW(int,int) xww;
 
-forall( otype T ) struct S { T t; } (int) x, y, z;
-forall( otype T ) struct { T t; } (int) a, b, c;
-
-forall( otype T ) static forall( otype S ) {
-    forall( otype X ) struct U {
+forall( T ) struct S { T t; } (int) x, y, z;
+forall( T ) struct { T t; } (int) a, b, c;
+
+forall( T ) static forall( S ) {
+    forall( X ) struct U {
 		T x;
     };
 }
 
-forall( otype T ) {
+forall( T ) {
 	extern "C" {
 		struct SS { T t; };
Index: tests/function-operator.cfa
===================================================================
--- tests/function-operator.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/function-operator.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -22,9 +22,9 @@
 
 // STL-like Algorithms
-trait Assignable(dtype T, dtype U) { T ?=?(T &, U); };
-trait Copyable(dtype T) { void ?{}(T &, T); };
-trait Destructable(dtype T) { void ^?{}(T &); };
+trait Assignable(T &, U &) { T ?=?(T &, U); };
+trait Copyable(T &) { void ?{}(T &, T); };
+trait Destructable(T &) { void ^?{}(T &); };
 
-trait Iterator(dtype iter | sized(iter) | Copyable(iter) | Destructable(iter), otype T) {
+trait Iterator(iter & | sized(iter) | Copyable(iter) | Destructable(iter), T) {
 	T & *?(iter);
 	iter ++?(iter &);
@@ -32,5 +32,5 @@
 };
 
-forall(otype Tin, dtype Input | Iterator(Input, Tin), otype Tout, dtype Output | Iterator(Output, Tout) | Assignable(Tout, Tin))
+forall(Tin, Input & | Iterator(Input, Tin), Tout, Output & | Iterator(Output, Tout) | Assignable(Tout, Tin))
 Output copy(Input first, Input last, Output result) {
 	while (first != last) {
@@ -42,5 +42,5 @@
 
 // test ?()(T *, ...) -- ?() with function call-by-pointer
-forall(otype Tin, dtype Input | Iterator(Input, Tin), otype Tout, dtype Output | Iterator(Output, Tout), otype FuncRet, dtype Func | { FuncRet ?()(Func *, Tin); } | Assignable(Tout, FuncRet))
+forall(Tin, Input & | Iterator(Input, Tin), Tout, Output & | Iterator(Output, Tout), FuncRet, Func & | { FuncRet ?()(Func *, Tin); } | Assignable(Tout, FuncRet))
 Output transform (Input first, Input last, Output result, Func * op) {
 	while (first != last) {
@@ -52,5 +52,5 @@
 
 // test ?()(T, ...) -- ?() with function call-by-value
-forall(dtype Iter, otype T | Iterator(Iter, T), otype Pred | { int ?()(Pred, T); })
+forall(Iter &, T | Iterator(Iter, T), Pred | { int ?()(Pred, T); })
 Iter find_if (Iter first, Iter last, Pred pred) {
 	while (first != last) {
@@ -62,5 +62,5 @@
 
 // test ?()(T, ...) -- ?() with function call-by-reference
-forall(otype Generator, otype GenRet | { GenRet ?()(Generator &); }, dtype Iter, otype T | Iterator(Iter, T) | Assignable(T, GenRet))
+forall(Generator, GenRet | { GenRet ?()(Generator &); }, Iter &, T | Iterator(Iter, T) | Assignable(T, GenRet))
 void generate(Iter first, Iter last, Generator & gen) {
 	int i = 0;
@@ -108,20 +108,20 @@
 }
 
-forall(otype T | { int ?==?(T, T); })
+forall(T | { int ?==?(T, T); })
 struct Equals {
 	T val;
 };
 
-forall(otype T | { int ?==?(T, T); })
+forall(T | { int ?==?(T, T); })
 int ?()(Equals(T) eq, T x) {
 	return eq.val == x;
 }
 
-forall(otype T | { T ?*?(T, T); })
+forall(T | { T ?*?(T, T); })
 struct Multiply {
 	T val;
 };
 
-forall(otype T | { T ?*?(T, T); })
+forall(T | { T ?*?(T, T); })
 T ?()(Multiply(T) * mult, T x) {
 	return mult->val * x;
@@ -130,5 +130,5 @@
 // TODO: generalize to ttype return; doesn't work yet
 // like std::function
-forall(otype Return, ttype Args)
+forall(Return, Args...)
 struct function {
 	Return (*f)(Args);
Index: tests/genericUnion.cfa
===================================================================
--- tests/genericUnion.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/genericUnion.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -16,5 +16,5 @@
 #include <limits.hfa>
 
-forall(otype T)
+forall(T)
 union ByteView {
 	T val;
@@ -22,5 +22,5 @@
 };
 
-forall(otype T)
+forall(T)
 void print(ByteView(T) x) {
 	for (int i = 0; i < sizeof(int); i++) {				// want to change to sizeof(T)
@@ -29,5 +29,5 @@
 }
 
-forall(otype T)
+forall(T)
 void f(ByteView(T) x, T val) {
 	print(x);
Index: tests/global-monomorph.cfa
===================================================================
--- tests/global-monomorph.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/global-monomorph.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -1,11 +1,11 @@
 // Create monomorphic instances of polymorphic types at global scope.
 
-forall(dtype T)
+forall(T &)
 void poly0(T &) {}
 
-forall(dtype T | sized(T))
+forall(T & | sized(T))
 void poly1(T &) {}
 
-forall(otype T)
+forall(T)
 void poly2(T &) {}
 
Index: tests/identity.cfa
===================================================================
--- tests/identity.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/identity.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -16,5 +16,5 @@
 #include <fstream.hfa>
 
-forall( otype T )
+forall( T )
 T identity( T t ) {
 	return t;
Index: tests/init1.cfa
===================================================================
--- tests/init1.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/init1.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -120,10 +120,10 @@
 }
 
-forall (dtype T, dtype S)
+forall (T &, S &)
 T & anycvt( S & s ) {
     return s;               // mismatched referenced type
 }
 
-forall (dtype T, dtype S)
+forall (T &, S &)
 T * anycvt( S * s ) {
     return s;               // mismatched referenced type
Index: tests/nested-types.cfa
===================================================================
--- tests/nested-types.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/nested-types.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -16,5 +16,5 @@
 typedef int N;
 struct A {
-	forall(otype T)
+	forall(T)
 	struct N {
 		T x;
Index: tests/poly-d-cycle.cfa
===================================================================
--- tests/poly-d-cycle.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/poly-d-cycle.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -1,13 +1,13 @@
 // Check that a cycle of polymorphic dtype structures can be instancated.
 
-forall(dtype T)
+forall(T &)
 struct func_table;
 
-forall(dtype U)
+forall(U &)
 struct object {
 	func_table(U) * virtual_table;
 };
 
-forall(dtype T)
+forall(T &)
 struct func_table {
 	void (*object_func)(object(T) *);
Index: tests/poly-o-cycle.cfa
===================================================================
--- tests/poly-o-cycle.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/poly-o-cycle.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -1,13 +1,13 @@
 // Check that a cycle of polymorphic otype structures can be instancated.
 
-forall(otype T)
+forall(T)
 struct func_table;
 
-forall(otype U)
+forall(U)
 struct object {
 	func_table(U) * virtual_table;
 };
 
-forall(otype T)
+forall(T)
 struct func_table {
 	void (*object_func)(object(T) *);
Index: tests/poly-selection.cfa
===================================================================
--- tests/poly-selection.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
+++ tests/poly-selection.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -0,0 +1,62 @@
+//
+// Cforall Version 1.0.0 Copyright (C) 2015 University of Waterloo
+//
+// The contents of this file are covered under the licence agreement in the
+// file "LICENCE" distributed with Cforall.
+//
+// poly-selection.cfa -- tests that show correct candidates selected, given interesting cases of
+//                       forall/overload polymorphism
+//
+// Author           : Michael Brooks
+// Created On       : Mon Jan 18 15:00:00 2021
+// Last Modified By : Michael Brooks
+// Last Modified On : Mon Jan 18 15:00:00 2021
+// Update Count     : 1
+//
+
+void testSpecializationFromGenericOverBareTyvar() {
+    forall( T & )
+    void friend( T & ) {
+        printf("friending generically\n");
+    }
+
+    forall(T &)
+    struct thing {
+        int x;
+    };
+
+    forall( T & )
+    void friend( thing(T) & ) {
+        printf("friending specifically\n");
+    }
+
+    float x;           friend( x );
+    thing(float) y;    friend( y );
+}
+
+void testSpecializationFromGenericAccessibleWithExtraTyvars() {
+
+    forall( T &, U & )
+    struct map {};
+
+    forall( T & )
+    void f( T & ) {
+        printf("f-generic\n");
+    }
+
+    forall( T & )
+    void f( map(T, T) & ) {
+        printf("f-specific\n");
+    }
+
+    float one;
+    map(float, float) two;
+    f(one);
+    f(two);
+}
+
+int main() {
+    testSpecializationFromGenericOverBareTyvar();
+    printf("-\n");
+    testSpecializationFromGenericAccessibleWithExtraTyvars();
+}
Index: tests/polymorphism.cfa
===================================================================
--- tests/polymorphism.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/polymorphism.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -18,5 +18,5 @@
 #include <fstream.hfa>
 
-forall(otype T)
+forall(T)
 T f(T x, T y) {
 	x = y;
@@ -24,9 +24,9 @@
 }
 
-forall(otype T) T ident(T x) {
+forall(T) T ident(T x) {
 	return x;
 }
 
-forall( otype T, otype U )
+forall( T, U )
 size_t struct_size( T i, U j ) {
 	struct S { T i; U j; };
@@ -34,5 +34,5 @@
 }
 
-forall( otype T, otype U )
+forall( T, U )
 size_t union_size( T i, U j ) {
 	union B { T i; U j; };
@@ -41,5 +41,5 @@
 
 // perform some simple operations on aggregates of T and U
-forall( otype T | { void print(T); int ?==?(T, T); }, otype U | { void print(U); U ?=?(U&, zero_t); } )
+forall( T | { void print(T); int ?==?(T, T); }, U | { void print(U); U ?=?(U&, zero_t); } )
 U foo(T i, U j) {
 	struct S { T i; U j; };
Index: tests/raii/ctor-autogen.cfa
===================================================================
--- tests/raii/ctor-autogen.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/raii/ctor-autogen.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -33,5 +33,5 @@
 
 // dtype-static generic type is otype
-forall(dtype T)
+forall(T &)
 struct DtypeStaticStruct {
   T * data;
@@ -39,5 +39,5 @@
 };
 
-forall(dtype T)
+forall(T &)
 union DtypeStaticUnion {
   T * data;
@@ -46,10 +46,10 @@
 
 // dynamic generic type is otype
-forall(otype T)
+forall(T)
 struct DynamicStruct {
 	T x;
 };
 
-forall(otype T)
+forall(T)
 union DynamicUnion {
 	T x;
@@ -80,5 +80,5 @@
 
 
-forall(otype T)
+forall(T)
 T identity(T x) { return x; }
 
Index: tests/simpleGenericTriple.cfa
===================================================================
--- tests/simpleGenericTriple.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/simpleGenericTriple.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -14,10 +14,10 @@
 //
 
-forall(otype T)
+forall(T)
 struct T3 {
 	T f0, f1, f2;
 };
 
-forall(otype T | { T ?+?(T, T); })
+forall(T | { T ?+?(T, T); })
 T3(T) ?+?(T3(T) x, T3(T) y) {
 	T3(T) z = { x.f0+y.f0, x.f1+y.f1, x.f2+y.f2 };
Index: tests/sum.cfa
===================================================================
--- tests/sum.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/sum.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -18,5 +18,5 @@
 #include <stdlib.hfa>
 
-trait sumable( otype T ) {
+trait sumable( T ) {
 	void ?{}( T &, zero_t );							// 0 literal constructor
 	T ?+?( T, T );										// assortment of additions
@@ -26,5 +26,5 @@
 }; // sumable
 
-forall( otype T | sumable( T ) )						// use trait
+forall( T | sumable( T ) )						// use trait
 T sum( size_t size, T a[] ) {
 	T total = 0;										// initialize by 0 constructor
@@ -107,5 +107,5 @@
 		 | sum( size, (S *)a ) | ", check" | (S)s;
 
-	forall( otype Impl | sumable( Impl ) )
+	forall( Impl | sumable( Impl ) )
 	struct GS {
 		Impl * x, * y;
@@ -194,5 +194,5 @@
 		 sum( size, (S *)a ).[i, j], s.[i, j] );
 
-	forall( otype Impl | sumable( Impl ) )
+	forall( Impl | sumable( Impl ) )
 	struct GS {
 		Impl * x, * y;
Index: tests/tuple/tuplePolymorphism.cfa
===================================================================
--- tests/tuple/tuplePolymorphism.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/tuple/tuplePolymorphism.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -29,9 +29,9 @@
 // ensure that f is a viable candidate for g, even though its parameter structure does not exactly match
 [A] f([A, B] x, B y) { printf("%g %c %g %lld %c %lld %lld %c %lld\n", x.0.[x,y,z], x.1.[x,y,z], y.[x,y,z]); return x.0; }
-forall(otype T, otype U | { T f(T, U, U); })
+forall(T, U | { T f(T, U, U); })
 void g(T x, U y) { f(x, y, y); }
 
 // add two triples
-forall(otype T | { T ?+?(T, T); })
+forall(T | { T ?+?(T, T); })
 [T, T, T] ?+?([T, T, T] x, [T, T, T] y) {
 	return [x.0+y.0, x.1+y.1, x.2+y.2];
@@ -64,5 +64,5 @@
 }
 
-forall(otype T)
+forall(T)
 [T, T] foo([T, T] y) {
 	[T, T] x;
Index: tests/tuple/tupleVariadic.cfa
===================================================================
--- tests/tuple/tupleVariadic.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/tuple/tupleVariadic.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -19,5 +19,5 @@
 	printf("called func(void)\n");
 }
-forall(otype T, ttype Params | { void process(T); void func(Params); })
+forall(T, Params... | { void process(T); void func(Params); })
 void func(T arg1, Params p) {
 	process(arg1);
@@ -92,5 +92,5 @@
 }
 
-forall(otype T)
+forall(T)
 T * copy(T x) {
 	// test calling new inside a polymorphic function
@@ -98,5 +98,5 @@
 }
 
-forall(ttype T | { void foo(T); }) void bar(T x) {}
+forall(T... | { void foo(T); }) void bar(T x) {}
 void foo(int) {}
 
Index: tests/zombies/ArrayN.c
===================================================================
--- tests/zombies/ArrayN.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/ArrayN.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -6,5 +6,5 @@
 // }
 
-forall(otype index_t)
+forall(index_t)
 index_t offset_to_index(unsigned offset, index_t size) {
     return [offset / size.0, offset % size.1];
Index: tests/zombies/Members.c
===================================================================
--- tests/zombies/Members.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/Members.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -2,6 +2,6 @@
 int ?=?( int*, int );
 float ?=?( float*, float );
-forall( dtype DT ) DT * ?=?( DT**, DT* );
-forall(otype T) lvalue T *?( T* );
+forall( DT & ) DT * ?=?( DT**, DT* );
+forall(T) lvalue T *?( T* );
 char *__builtin_memcpy();
 
Index: tests/zombies/Rank2.c
===================================================================
--- tests/zombies/Rank2.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/Rank2.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -1,8 +1,8 @@
 int ?=?( int &, int );
-forall(dtype DT) DT * ?=?( DT *&, DT * );
+forall(DT &) DT * ?=?( DT *&, DT * );
 
 void a() {
-	forall( otype T ) void f( T );
-	void g( forall( otype U ) void p( U ) );
+	forall( T ) void f( T );
+	void g( forall( U ) void p( U ) );
 	g( f );
 }
@@ -10,5 +10,5 @@
 void g() {
 	void h( int *null );
-	forall( otype T ) T id( T );
+	forall( T ) T id( T );
 //	forall( dtype T ) T *0;
 //	int 0;
Index: tests/zombies/abstype.c
===================================================================
--- tests/zombies/abstype.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/abstype.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -21,8 +21,8 @@
 }
 
-forall( otype T ) T *?( T * );
+forall( T ) T *?( T * );
 int ?++( int * );
 int ?=?( int *, int );
-forall( dtype DT ) DT * ?=?( DT **, DT * );
+forall( DT & ) DT * ?=?( DT **, DT * );
 
 otype U = int *;
Index: tests/zombies/context.cfa
===================================================================
--- tests/zombies/context.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/context.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -1,10 +1,10 @@
 // trait declaration
 
-trait has_q( otype T ) {
+trait has_q( T ) {
 	T q( T );
 };
 
-forall( otype z | has_q( z ) ) void f() {
-	trait has_r( otype T, otype U ) {
+forall( z | has_q( z ) ) void f() {
+	trait has_r( T, U ) {
 		T r( T, T (T,U) );
 	};
Index: tests/zombies/gc_no_raii/bug-repro/blockers/explicit_cast.c
===================================================================
--- tests/zombies/gc_no_raii/bug-repro/blockers/explicit_cast.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/gc_no_raii/bug-repro/blockers/explicit_cast.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -9,5 +9,5 @@
 };
 
-forall(otype T)
+forall(T)
 struct gcpointer
 {
@@ -15,5 +15,5 @@
 };
 
-forall(otype T)
+forall(T)
 static inline gcpointer(T) gcmalloc()
 {
Index: tests/zombies/gc_no_raii/bug-repro/blockers/recursive_realloc.c
===================================================================
--- tests/zombies/gc_no_raii/bug-repro/blockers/recursive_realloc.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/gc_no_raii/bug-repro/blockers/recursive_realloc.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -3,10 +3,10 @@
 #include <stdlib.hfa>
 
-trait allocator_c(otype T, otype allocator_t)
+trait allocator_c(T, allocator_t)
 {
 	void realloc(allocator_t* const, size_t);
 };
 
-forall(otype T)
+forall(T)
 struct heap_allocator
 {
@@ -15,5 +15,5 @@
 };
 
-forall(otype T)
+forall(T)
 inline void realloc(heap_allocator(T) *const this, size_t size)
 {
Index: tests/zombies/gc_no_raii/bug-repro/deref.c
===================================================================
--- tests/zombies/gc_no_raii/bug-repro/deref.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/gc_no_raii/bug-repro/deref.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -1,3 +1,3 @@
-    forall(otype T)
+    forall(T)
     struct wrap
     {
@@ -5,5 +5,5 @@
     };
 
-    forall(otype T)
+    forall(T)
     T *? (wrap(T) rhs)
     {
Index: tests/zombies/gc_no_raii/bug-repro/field.c
===================================================================
--- tests/zombies/gc_no_raii/bug-repro/field.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/gc_no_raii/bug-repro/field.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -8,5 +8,5 @@
 //------------------------------------------------------------------------------
 //Declaration
-trait allocator_c(otype T, otype allocator_t)
+trait allocator_c(T, allocator_t)
 {
 	void ctor(allocator_t* const);
@@ -16,5 +16,5 @@
 };
 
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 struct vector
 {
Index: tests/zombies/gc_no_raii/bug-repro/malloc.c
===================================================================
--- tests/zombies/gc_no_raii/bug-repro/malloc.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/gc_no_raii/bug-repro/malloc.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -1,3 +1,3 @@
-forall(otype T)
+forall(T)
 struct wrapper
 {
@@ -5,5 +5,5 @@
 };
 
-forall(otype T)
+forall(T)
 void ctor(wrapper(T)* this)
 {
@@ -11,5 +11,5 @@
 }
 
-forall(otype T)
+forall(T)
 wrapper(T) gcmalloc()
 {
@@ -19,5 +19,5 @@
 }
 
-forall(otype T)
+forall(T)
 wrapper(T)* ?=? (wrapper(T)* lhs, wrapper(T)* rhs)
 {
Index: tests/zombies/gc_no_raii/bug-repro/oddtype.c
===================================================================
--- tests/zombies/gc_no_raii/bug-repro/oddtype.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/gc_no_raii/bug-repro/oddtype.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -1,10 +1,10 @@
-forall(dtype T)
+forall(T &)
 struct wrap {
 	int i;
 };
 
-forall(otype T) void ?{}(wrap(T)* this) {}
-forall(otype T) void ?=?(wrap(T)* this) {}
-forall(otype T) void ^?{}(wrap(T)* this) {}
+forall(T) void ?{}(wrap(T)* this) {}
+forall(T) void ?=?(wrap(T)* this) {}
+forall(T) void ^?{}(wrap(T)* this) {}
 
 struct List_t {
Index: tests/zombies/gc_no_raii/bug-repro/push_back.h
===================================================================
--- tests/zombies/gc_no_raii/bug-repro/push_back.h	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/gc_no_raii/bug-repro/push_back.h	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -1,5 +1,5 @@
 //------------------------------------------------------------------------------
 //Declaration
-trait allocator_c(otype T, otype allocator_t) {
+trait allocator_c(T, allocator_t) {
 	void ctor(allocator_t* const);
 	void dtor(allocator_t* const);
@@ -8,5 +8,5 @@
 };
 
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 struct vector
 {
@@ -17,13 +17,13 @@
 //------------------------------------------------------------------------------
 //Initialization
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void vector_ctor(vector(T, allocator_t) *const this);
 
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void dtor(vector(T, allocator_t) *const this);
 
 //------------------------------------------------------------------------------
 //Allocator
-forall(otype T)
+forall(T)
 struct heap_allocator
 {
@@ -32,14 +32,14 @@
 };
 
-forall(otype T)
+forall(T)
 void ctor(heap_allocator(T) *const this);
 
-forall(otype T)
+forall(T)
 void dtor(heap_allocator(T) *const this);
 
-forall(otype T)
+forall(T)
 void realloc(heap_allocator(T) *const this, size_t size);
 
-forall(otype T)
+forall(T)
 inline T* data(heap_allocator(T) *const this)
 {
@@ -49,5 +49,5 @@
 //------------------------------------------------------------------------------
 //Capacity
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 inline bool empty(vector(T, allocator_t) *const this)
 {
@@ -55,5 +55,5 @@
 }
 
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 inline bool size(vector(T, allocator_t) *const this)
 {
@@ -61,5 +61,5 @@
 }
 
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 inline void reserve(vector(T, allocator_t) *const this, size_t size)
 {
@@ -69,4 +69,4 @@
 //------------------------------------------------------------------------------
 //Modifiers
-forall(otype T, otype allocator_t | allocator_c(T, allocator_t))
+forall(T, allocator_t | allocator_c(T, allocator_t))
 void push_back(vector(T, allocator_t) *const this, T value);
Index: tests/zombies/gc_no_raii/bug-repro/realloc.c
===================================================================
--- tests/zombies/gc_no_raii/bug-repro/realloc.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/gc_no_raii/bug-repro/realloc.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -1,5 +1,5 @@
 void* realloc(void*, unsigned long int);
 
-forall(otype T)
+forall(T)
 struct wrap
 {
@@ -7,5 +7,5 @@
 };
 
-forall(otype T)
+forall(T)
 static inline void realloc(wrap(T) *const this, unsigned long int size)
 {
Index: tests/zombies/gc_no_raii/bug-repro/return.c
===================================================================
--- tests/zombies/gc_no_raii/bug-repro/return.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/gc_no_raii/bug-repro/return.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -1,3 +1,3 @@
-forall(otype T)
+forall(T)
 struct wrapper
 {
@@ -5,5 +5,5 @@
 };
 
-forall(otype T)
+forall(T)
 wrapper(T) create()
 {
@@ -12,5 +12,5 @@
 }
 
-forall(otype T)
+forall(T)
 wrapper(T)* ?=?(wrapper(T)* lhs, wrapper(T)* rhs)
 {
Index: tests/zombies/gc_no_raii/bug-repro/return_template.c
===================================================================
--- tests/zombies/gc_no_raii/bug-repro/return_template.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/gc_no_raii/bug-repro/return_template.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -1,3 +1,3 @@
-forall(otype T)
+forall(T)
 struct wrap
 {
@@ -5,10 +5,10 @@
 };
 
-forall(otype T) void ?{}(wrap(T)* this);
-forall(otype T) void ?{}(wrap(T)* this, wrap(T)* rhs);
-forall(otype T) void ^?{}(wrap(T)* this);
-forall(otype T) void ?=?(wrap(T)* this, wrap(T)* rhs);
+forall(T) void ?{}(wrap(T)* this);
+forall(T) void ?{}(wrap(T)* this, wrap(T)* rhs);
+forall(T) void ^?{}(wrap(T)* this);
+forall(T) void ?=?(wrap(T)* this, wrap(T)* rhs);
 
-forall(otype T)
+forall(T)
 wrap(T) test()
 {
Index: tests/zombies/gc_no_raii/bug-repro/slow_malloc.c
===================================================================
--- tests/zombies/gc_no_raii/bug-repro/slow_malloc.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/gc_no_raii/bug-repro/slow_malloc.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -1,5 +1,5 @@
 #include <stdlib.hfa>
 
-forall(otype T)
+forall(T)
 struct heap_allocator
 {
Index: tests/zombies/gc_no_raii/bug-repro/zero.c
===================================================================
--- tests/zombies/gc_no_raii/bug-repro/zero.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/gc_no_raii/bug-repro/zero.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -1,3 +1,3 @@
-forall(otype T)
+forall(T)
 struct wrap
 {
@@ -5,5 +5,5 @@
 };
 
-forall(otype T)
+forall(T)
 int ?==? (wrap(T) lhs, wrap(T) rhs)
 {
@@ -14,5 +14,5 @@
 struct wrap(int) 0;
 /*/
-forall(otype T)
+forall(T)
 struct wrap(T) 0;
 //*/
Index: tests/zombies/gc_no_raii/src/gc.h
===================================================================
--- tests/zombies/gc_no_raii/src/gc.h	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/gc_no_raii/src/gc.h	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -13,5 +13,5 @@
 // }
 
-forall(otype T)
+forall(T)
 static inline void gcmalloc(gcpointer(T)* ptr)
 {
Index: tests/zombies/gc_no_raii/src/gcpointers.c
===================================================================
--- tests/zombies/gc_no_raii/src/gcpointers.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/gc_no_raii/src/gcpointers.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -113,21 +113,21 @@
 #endif
 
-forall(otype T) void ?{}(gcpointer(T)* this) {
+forall(T) void ?{}(gcpointer(T)* this) {
 	(&this->internal) {};
 }
 
-forall(otype T) void ?{}(gcpointer(T)* this, void* address) {
+forall(T) void ?{}(gcpointer(T)* this, void* address) {
 	(&this->internal) { address };
 }
 
-forall(otype T) void ?{}(gcpointer(T)* this, gcpointer(T) other) {
+forall(T) void ?{}(gcpointer(T)* this, gcpointer(T) other) {
 	(&this->internal) { other.internal };
 }
 
-forall(otype T) void ^?{}(gcpointer(T)* this) {
+forall(T) void ^?{}(gcpointer(T)* this) {
 	^?{}(&this->internal);
 }
 
-forall(otype T) gcpointer(T) ?=?(gcpointer(T)* this, gcpointer(T) rhs) {
+forall(T) gcpointer(T) ?=?(gcpointer(T)* this, gcpointer(T) rhs) {
 	this->internal = rhs.internal;
 	return *this;
@@ -136,10 +136,10 @@
 // forall(otype T) T *?(gcpointer(T) this);
 
-forall(otype T) T* get(gcpointer(T)* this) {
+forall(T) T* get(gcpointer(T)* this) {
 	return (T*)this->internal.ptr;
 }
 //
 // //Logical operators
-forall(otype T) int ?!=?(gcpointer(T) this, int zero) {
+forall(T) int ?!=?(gcpointer(T) this, int zero) {
 	return this.internal.ptr != 0;
 }
Index: tests/zombies/gc_no_raii/src/gcpointers.h
===================================================================
--- tests/zombies/gc_no_raii/src/gcpointers.h	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/gc_no_raii/src/gcpointers.h	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -4,5 +4,5 @@
 #include <stdint.h>
 
-forall(dtype T)
+forall(T &)
 struct gcpointer;
 
@@ -29,5 +29,5 @@
 #endif
 
-forall(dtype T)
+forall(T &)
 struct gcpointer
 {
@@ -36,16 +36,16 @@
 
 //
-forall(otype T) void ?{}(gcpointer(T)* this);
-forall(otype T) void ?{}(gcpointer(T)* this, void* address);
-forall(otype T) void ?{}(gcpointer(T)* this, gcpointer(T) other);
-forall(otype T) void ^?{}(gcpointer(T)* this);
-forall(otype T) gcpointer(T) ?=?(gcpointer(T)* this, gcpointer(T) rhs);
+forall(T) void ?{}(gcpointer(T)* this);
+forall(T) void ?{}(gcpointer(T)* this, void* address);
+forall(T) void ?{}(gcpointer(T)* this, gcpointer(T) other);
+forall(T) void ^?{}(gcpointer(T)* this);
+forall(T) gcpointer(T) ?=?(gcpointer(T)* this, gcpointer(T) rhs);
 
 
 // forall(otype T) T *?(gcpointer(T) this);
-forall(otype T) T* get(gcpointer(T)* this);
+forall(T) T* get(gcpointer(T)* this);
 
 //Logical operators
-forall(otype T) int ?!=?(gcpointer(T) this, int zero);
-forall(otype T) int ?!=?(gcpointer(T) this, gcpointer(T) rhs);
-forall(otype T) int ?==?(gcpointer(T) this, gcpointer(T) rhs);
+forall(T) int ?!=?(gcpointer(T) this, int zero);
+forall(T) int ?!=?(gcpointer(T) this, gcpointer(T) rhs);
+forall(T) int ?==?(gcpointer(T) this, gcpointer(T) rhs);
Index: tests/zombies/gc_no_raii/src/tools.h
===================================================================
--- tests/zombies/gc_no_raii/src/tools.h	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/gc_no_raii/src/tools.h	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -12,10 +12,10 @@
 // }
 
-trait has_equal(otype T)
+trait has_equal(T)
 {
 	signed int ?==?(T a, T b);
 };
 
-trait InputIterator_t(otype T, otype InputIterator)
+trait InputIterator_t(T, InputIterator)
 {
 	signed int ?==?(InputIterator a, InputIterator b);
@@ -26,5 +26,5 @@
 };
 
-forall(otype T | has_equal(T), otype InputIterator | InputIterator_t(T, InputIterator))
+forall(T | has_equal(T), InputIterator | InputIterator_t(T, InputIterator))
 static inline InputIterator find( InputIterator first, const InputIterator* const last, T val)
 {
Index: tests/zombies/hashtable.cfa
===================================================================
--- tests/zombies/hashtable.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/hashtable.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -14,14 +14,14 @@
 
 
-trait has_hash( otype K ) {
+trait has_hash( K ) {
     size_t hash(K);
     int ?==?( K, K );
 };
 
-trait hkey( otype K, dtype tN | has_hash(K) ) {
+trait hkey( K, tN & | has_hash(K) ) {
     K key(tN &);
 };
 
-forall( otype K, dtype tN, dtype tE | $dlistable(tN, tE) | hkey(K, tN) ) {
+forall( K, tN &, tE & | $dlistable(tN, tE) | hkey(K, tN) ) {
 
     struct hashtable {
@@ -39,5 +39,5 @@
 }
 
-forall( otype K, dtype tN, dtype tE | $dlistable(tN, tE) | hkey(K, tN) | { void defaultResumptionHandler(ht_fill_limit_crossed &); } ) {
+forall( K, tN &, tE & | $dlistable(tN, tE) | hkey(K, tN) | { void defaultResumptionHandler(ht_fill_limit_crossed &); } ) {
 
     void ?{}( hashtable(K, tN, tE) & this, size_t n_buckets, dlist(tN, tE) *buckets ) {
@@ -57,5 +57,5 @@
 }
 
-forall( otype K, dtype tN, dtype tE | $dlistable(tN, tE) | hkey(K, tN) ) {
+forall( K, tN &, tE & | $dlistable(tN, tE) | hkey(K, tN) ) {
 
     float fill_frac( hashtable(K, tN, tE) & this ) with(this) {
@@ -124,5 +124,5 @@
 
 
-trait heaped(dtype T) {
+trait heaped(T &) {
     T * alloc( size_t );
     void free( void * ); 
@@ -133,5 +133,5 @@
 }
 
-forall( otype K, dtype tN, dtype tE | $dlistable(tN, tE) | hkey(K, tN) | heaped( dlist(tN, tE) ) ) {
+forall( K, tN &, tE & | $dlistable(tN, tE) | hkey(K, tN) | heaped( dlist(tN, tE) ) ) {
 
     struct hashtable_dynamic { 
Index: tests/zombies/hashtable2.cfa
===================================================================
--- tests/zombies/hashtable2.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/hashtable2.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -69,9 +69,9 @@
 
 
-trait pretendsToMatter( dtype TTT ) {
+trait pretendsToMatter( TTT & ) {
     void actsmart(TTT &);
 };
 
-forall( dtype TTTx )
+forall( TTTx & )
 void actsmart(TTTx &) {}
 
@@ -86,5 +86,5 @@
 //   2. shows up in -CFA output as hashtable_rbs(), which is bad C; expecting hashtable_rbs*
 
-forall( otype Tt_unused | pretendsToMatter(Tt_unused) ) {
+forall( Tt_unused | pretendsToMatter(Tt_unused) ) {
 
     // hashtable of request by source
@@ -104,5 +104,5 @@
 }
 
-forall( otype Tt_unused | pretendsToMatter(Tt_unused) | { void defaultResumptionHandler(ht_fill_limit_crossed &); } ) {
+forall( Tt_unused | pretendsToMatter(Tt_unused) | { void defaultResumptionHandler(ht_fill_limit_crossed &); } ) {
 
     void ?{}( hashtable_rbs(Tt_unused) & this, size_t n_buckets, dlist(request_in_ht_by_src, request) *buckets,
@@ -135,5 +135,5 @@
 void defaultResumptionHandler( ht_auto_resize_pending & ex );
 
-forall( otype Tt_unused | pretendsToMatter(Tt_unused) ) {
+forall( Tt_unused | pretendsToMatter(Tt_unused) ) {
 
     float fill_frac( hashtable_rbs(Tt_unused) & this ) with(this) {
@@ -221,5 +221,5 @@
 
 
-trait heaped(dtype T) {
+trait heaped(T &) {
     T * alloc( size_t );
     void free( void * ); 
@@ -228,5 +228,5 @@
 void __dynamic_defaultResumptionHandler(ht_fill_limit_crossed &);
 
-forall( otype Tt_unused ) {
+forall( Tt_unused ) {
 
     struct hashtable_rbs_dynamic { 
@@ -263,5 +263,5 @@
 
 
-forall( otype Tt_unused | heaped( dlist(request_in_ht_by_src, request) ) ) {
+forall( Tt_unused | heaped( dlist(request_in_ht_by_src, request) ) ) {
 
     void ?{}( hashtable_rbs_dynamic(Tt_unused).resize_policy & this, size_t nbuckets_floor ) {
@@ -325,5 +325,5 @@
 }
 
-forall( otype Tt_unused ) {
+forall( Tt_unused ) {
     void rehashToLarger_STEP( hashtable_rbs_dynamic(Tt_unused) & this, size_t new_n_buckets ) with (this) {
         rehashToLarger( this, new_n_buckets );
Index: tests/zombies/huge.c
===================================================================
--- tests/zombies/huge.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/huge.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -14,5 +14,5 @@
 //
 
-int huge( int n, forall( otype T ) T (*f)( T ) ) {
+int huge( int n, forall( T ) T (*f)( T ) ) {
 	if ( n <= 0 )
 		return f( 0 );
Index: tests/zombies/it_out.c
===================================================================
--- tests/zombies/it_out.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/it_out.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -16,18 +16,18 @@
 typedef unsigned long streamsize_type;
 
-trait ostream( dtype os_type ) {
+trait ostream( os_type & ) {
 	os_type *write( os_type *, const char *, streamsize_type );
 	int fail( os_type * );
 };
 
-trait writeable( otype T ) {
-	forall( dtype os_type | ostream( os_type ) ) os_type * ?<<?( os_type *, T );
+trait writeable( T ) {
+	forall( os_type & | ostream( os_type ) ) os_type * ?<<?( os_type *, T );
 };
 
-forall( dtype os_type | ostream( os_type ) ) os_type * ?<<?( os_type *, char );
-forall( dtype os_type | ostream( os_type ) ) os_type * ?<<?( os_type *, int );
-forall( dtype os_type | ostream( os_type ) ) os_type * ?<<?( os_type *, const char * );
+forall( os_type & | ostream( os_type ) ) os_type * ?<<?( os_type *, char );
+forall( os_type & | ostream( os_type ) ) os_type * ?<<?( os_type *, int );
+forall( os_type & | ostream( os_type ) ) os_type * ?<<?( os_type *, const char * );
 
-trait istream( dtype is_type ) {
+trait istream( is_type & ) {
 	is_type *read( is_type *, char *, streamsize_type );
 	is_type *unread( is_type *, char );
@@ -36,12 +36,12 @@
 };
 
-trait readable( otype T ) {
-	forall( dtype is_type | istream( is_type ) ) is_type * ?<<?( is_type *, T );
+trait readable( T ) {
+	forall( is_type & | istream( is_type ) ) is_type * ?<<?( is_type *, T );
 };
 
-forall( dtype is_type | istream( is_type ) ) is_type * ?>>?( is_type *, char* );
-forall( dtype is_type | istream( is_type ) ) is_type * ?>>?( is_type *, int* );
+forall( is_type & | istream( is_type ) ) is_type * ?>>?( is_type *, char* );
+forall( is_type & | istream( is_type ) ) is_type * ?>>?( is_type *, int* );
 
-trait iterator( otype iterator_type, otype elt_type ) {
+trait iterator( iterator_type, elt_type ) {
 	iterator_type ?++( iterator_type* );
 	iterator_type ++?( iterator_type* );
@@ -52,12 +52,12 @@
 };
 
-forall( otype elt_type | writeable( elt_type ),
-		otype iterator_type | iterator( iterator_type, elt_type ),
-		dtype os_type | ostream( os_type ) )
+forall( elt_type | writeable( elt_type ),
+		iterator_type | iterator( iterator_type, elt_type ),
+		os_type & | ostream( os_type ) )
 void write_all( iterator_type begin, iterator_type end, os_type *os );
 
-forall( otype elt_type | writeable( elt_type ),
-		otype iterator_type | iterator( iterator_type, elt_type ),
-		dtype os_type | ostream( os_type ) )
+forall( elt_type | writeable( elt_type ),
+		iterator_type | iterator( iterator_type, elt_type ),
+		os_type & | ostream( os_type ) )
 void write_all( elt_type begin, iterator_type end, os_type *os ) {
 	os << begin;
Index: tests/zombies/new.c
===================================================================
--- tests/zombies/new.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/new.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -14,5 +14,5 @@
 //
 
-forall( otype T )
+forall( T )
 void f( T *t ) {
 	t--;
Index: tests/zombies/occursError.cfa
===================================================================
--- tests/zombies/occursError.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/occursError.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -1,5 +1,5 @@
-forall( otype T ) void f( void (*)( T, T * ) );
-forall( otype U ) void g( U,  U * );
-forall( otype U ) void h( U *, U );
+forall( T ) void f( void (*)( T, T * ) );
+forall( U ) void g( U,  U * );
+forall( U ) void h( U *, U );
 
 void test() {
Index: tests/zombies/prolog.c
===================================================================
--- tests/zombies/prolog.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/prolog.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -25,13 +25,13 @@
 void is_integer( int x ) {}
 
-trait ArithmeticType( otype T ) {
+trait ArithmeticType( T ) {
 	void is_arithmetic( T );
 };
 
-trait IntegralType( otype T | ArithmeticType( T ) ) {
+trait IntegralType( T | ArithmeticType( T ) ) {
 	void is_integer( T );
 };
 
-forall( otype T | IntegralType( T ) | { void printResult( T ); } )
+forall( T | IntegralType( T ) | { void printResult( T ); } )
 void hornclause( T param ) {
 	printResult( param );
Index: tests/zombies/quad.c
===================================================================
--- tests/zombies/quad.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/quad.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -16,10 +16,10 @@
 #include <fstream.hfa>
 
-forall( otype T | { T ?*?( T, T ); } )
+forall( T | { T ?*?( T, T ); } )
 T square( T t ) {
 	return t * t;
 }
 
-forall( otype U | { U square( U ); } )
+forall( U | { U square( U ); } )
 U quad( U u ) {
 	return square( square( u ) );
Index: tests/zombies/scope.cfa
===================================================================
--- tests/zombies/scope.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/scope.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -20,9 +20,9 @@
 y p;
 
-trait has_u( otype z ) {
+trait has_u( z ) {
 	z u(z);
 };
 
-forall( otype t | has_u( t ) )
+forall( t | has_u( t ) )
 y q( t the_t ) {
 	t y = u( the_t );
Index: tests/zombies/simplePoly.c
===================================================================
--- tests/zombies/simplePoly.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/simplePoly.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -14,5 +14,5 @@
 //
 
-forall( otype T, otype U | { T f( T, U ); } )
+forall( T, U | { T f( T, U ); } )
 T q( T t, U u ) {
 	return f( t, u );
Index: tests/zombies/simpler.c
===================================================================
--- tests/zombies/simpler.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/simpler.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -14,5 +14,5 @@
 //
 
-forall( otype T ) T id( T, T );
+forall( T ) T id( T, T );
 
 int main() {
Index: tests/zombies/specialize.c
===================================================================
--- tests/zombies/specialize.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/specialize.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -39,5 +39,5 @@
 }
 
-forall( otype T ) T f( T t )
+forall( T ) T f( T t )
 {
 	printf( "in f; sizeof T is %d\n", sizeof( T ) );
Index: tests/zombies/square.c
===================================================================
--- tests/zombies/square.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/square.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -16,5 +16,5 @@
 #include <fstream.hfa>
 
-forall( otype T | { T ?*?( T, T ); } )
+forall( T | { T ?*?( T, T ); } )
 T square( T t ) {
 	return t * t;
Index: tests/zombies/structMember.cfa
===================================================================
--- tests/zombies/structMember.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/structMember.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -66,5 +66,5 @@
 	S.T;
 	.S.T;
-	forall( otype S, otype T ) struct W {
+	forall( S, T ) struct W {
 		struct X {};
 	};
Index: tests/zombies/subrange.cfa
===================================================================
--- tests/zombies/subrange.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/subrange.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -1,5 +1,5 @@
 // A small context defining the notion of an ordered otype.  (The standard
 // library should probably contain a context for this purpose.)
-trait ordered(otype T) {
+trait ordered(T) {
     int ?<?(T, T), ?<=?(T, T);
 };
@@ -7,5 +7,5 @@
 // A subrange otype resembling an Ada subotype with a base otype and a range
 // constraint.
-otype subrange(otype base_t | ordered(base_t), base_t low = 0, base_t high = 8) = base_t;
+otype subrange(base_t | ordered(base_t), base_t low = 0, base_t high = 8) = base_t;
 
 // Note that subrange() can be applied to floating-point and pointer otypes, not
@@ -28,10 +28,10 @@
 
 // Convenient access to subrange bounds, for instance for iteration:
-forall (otype T, T low, T high)
+forall (T, T low, T high)
 T lbound( subrange(T, low, high) v) {
     return low;
 }
 
-forall (otype T, T low, T high)
+forall (T, T low, T high)
 T hbound( subrange(T, low, high) v) {
     return high;
@@ -44,5 +44,5 @@
 // of exception handling here.  Inlining allows the compiler to eliminate
 // bounds checks.
-forall (otype T | ordered(T), T low, T high)
+forall (T | ordered(T), T low, T high)
 inline subrange(T, low, high) ?=?(subrange(T, low, high)* target, T source) {
     if (low <= source && source <= high) *((T*)target) = source;
@@ -54,5 +54,5 @@
 // compares range bounds so that the compiler can optimize checks away when the
 // ranges are known to overlap.
-forall (otype T | ordered(T), T t_low, T t_high, T s_low, T s_high)
+forall (T | ordered(T), T t_low, T t_high, T s_low, T s_high)
 inline subrange(T, t_low, t_high) ?=?(subrange(T, t_low, t_high)* target,
 				      subrange(T, s_low, s_high) source) {
Index: tests/zombies/twice.c
===================================================================
--- tests/zombies/twice.c	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/twice.c	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -16,5 +16,5 @@
 #include <fstream.hfa>
 
-forall( otype T | { T ?+?( T, T ); } )
+forall( T | { T ?+?( T, T ); } )
 T twice( const T t ) {
 	return t + t;
Index: tests/zombies/typeGenerator.cfa
===================================================================
--- tests/zombies/typeGenerator.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/typeGenerator.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -1,8 +1,8 @@
-context addable( otype T ) {
+context addable( T ) {
 	T ?+?( T,T );
 	T ?=?( T*, T);
 };
 
-otype List1( otype T | addable( T ) ) = struct { T data; List1( T ) *next; } *;
+otype List1( T | addable( T ) ) = struct { T data; List1( T ) *next; } *;
 typedef List1( int ) ListOfIntegers;
 //List1( int ) li;
@@ -11,11 +11,11 @@
 [int] h( * List1( int ) p );							// new declaration syntax
 
-struct( otype T ) S2 { T i; };							// actual definition
+struct( T ) S2 { T i; };							// actual definition
 struct( int ) S3 v1, *p;								// expansion and instantiation
-struct( otype T )( int ) S24 { T i; } v2;				// actual definition, expansion and instantiation
-struct( otype T )( int ) { T i; } v2;					// anonymous actual definition, expansion and instantiation
+struct( T )( int ) S24 { T i; } v2;				// actual definition, expansion and instantiation
+struct( T )( int ) { T i; } v2;					// anonymous actual definition, expansion and instantiation
 
-struct( otype T | addable( T ) ) node { T data; struct( T ) node *next; };
-otype List( otype T ) = struct( T ) node *;
+struct( T | addable( T ) ) node { T data; struct( T ) node *next; };
+otype List( T ) = struct( T ) node *;
 List( int ) my_list;
 
Index: tests/zombies/withStatement.cfa
===================================================================
--- tests/zombies/withStatement.cfa	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/withStatement.cfa	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -54,10 +54,10 @@
 }
 
-forall( otype T )
+forall( T )
 struct Box {
 	T x;
 };
 
-forall( otype T )
+forall( T )
 void ?{}( Box(T) & this ) with( this ) { // with clause in polymorphic function
 	x{};
@@ -66,5 +66,5 @@
 void print( int i ) { sout | i; }
 
-forall( otype T | { void print( T ); })
+forall( T | { void print( T ); })
 void foo( T t ) {
 	Box( T ) b = { t };
Index: tests/zombies/wrapper/src/pointer.h
===================================================================
--- tests/zombies/wrapper/src/pointer.h	(revision 5869cea3d27c1f6c01a80f191c1c078f5385adcb)
+++ tests/zombies/wrapper/src/pointer.h	(revision 7b91c0e992ed493cc46d297ec3fe313c381a8dbc)
@@ -8,5 +8,5 @@
 // type safe malloc / free
 
-forall(otype T)
+forall(T)
 T* new()
 {
@@ -16,5 +16,5 @@
 }
 
-forall(otype T)
+forall(T)
 void delete(T* p)
 {
